Project

General

Profile

1 7190 leinfelder
-- use default values initially
2 7191 leinfelder
CREATE TABLE doi_registration AS
3
4 7189 leinfelder
select
5
text('testing') as ezid_account,
6
doc.doctype,
7
id.guid,
8 7193 leinfelder
'https://cn-stage.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url,
9 7190 leinfelder
text('Unknown') as title,
10 7191 leinfelder
text(null) as creator,
11
text(null) as publisher,
12 7190 leinfelder
to_char(doc.date_created, 'YYYY') as pub_date
13 7189 leinfelder
from identifier id, xml_documents doc
14
where guid like 'doi%'
15 7191 leinfelder
and id.docid = doc.docid
16
and id.rev = doc.rev
17 7189 leinfelder
18 7191 leinfelder
UNION ALL
19
20
select
21
text('testing') as ezid_account,
22
doc.doctype,
23
id.guid,
24 7193 leinfelder
'https://cn-stage.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url,
25 7191 leinfelder
text('Unknown') as title,
26
text(null) as creator,
27
text(null) as publisher,
28
to_char(doc.date_created, 'YYYY') as pub_date
29
from identifier id, xml_revisions doc
30
where guid like 'doi%'
31
and id.docid = doc.docid
32
and id.rev = doc.rev;
33
34 7190 leinfelder
--update defaults
35
update doi_registration
36
set title = 'Data file'
37
where doctype = 'BIN';
38
39
update doi_registration
40
set title = 'Legacy EML file'
41
where doctype like '%eml%2.0.0beta%';
42
43
-- update title using node information
44 7189 leinfelder
update doi_registration doi
45
set title = child.nodedata
46
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
47
where doi.guid = id.guid
48
and id.docid = docs.docid
49
and id.rev = docs.rev
50
and docs.rootnodeid = nodes.rootnodeid
51
and nodes.nodeid = child.parentnodeid
52
and nodes.parentnodeid = parent.nodeid
53
and nodes.nodename = 'title'
54
and parent.nodename = 'dataset';
55
56 7191 leinfelder
-- update pubDate using nodes
57 7190 leinfelder
update doi_registration doi
58
set pub_date = child.nodedata
59
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
60
where doi.guid = id.guid
61
and id.docid = docs.docid
62
and id.rev = docs.rev
63
and docs.rootnodeid = nodes.rootnodeid
64
and nodes.nodeid = child.parentnodeid
65
and nodes.parentnodeid = parent.nodeid
66
and nodes.nodename = 'pubDate'
67
and parent.nodename = 'dataset';
68
69 7191 leinfelder
-- update publisher using nodes
70 7190 leinfelder
update doi_registration doi
71
set publisher = child.nodedata
72
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
73
where doi.guid = id.guid
74
and id.docid = docs.docid
75
and id.rev = docs.rev
76
and docs.rootnodeid = nodes.rootnodeid
77
and nodes.nodeid = child.parentnodeid
78
and nodes.parentnodeid = parent.nodeid
79
and nodes.nodename = 'publisher'
80
and parent.nodename = 'dataset';
81
82 7191 leinfelder
-- update organization using nodes
83
update doi_registration doi
84
set creator = child.nodedata
85
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
86
where doi.guid = id.guid
87
and id.docid = docs.docid
88
and id.rev = docs.rev
89
and docs.rootnodeid = nodes.rootnodeid
90
and nodes.nodeid = child.parentnodeid
91
and nodes.parentnodeid = parent.nodeid
92
and nodes.nodename = 'organizationName'
93
and parent.nodename = 'creator'
94
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
95
96 7189 leinfelder
-- update creator using nodes
97
-- creator/individualName/surName
98
-- creator/individualName/givenName
99
update doi_registration doi
100 7191 leinfelder
set creator = child.nodedata
101 7189 leinfelder
from identifier id,
102
xml_documents docs,
103
xml_nodes nodes,
104
xml_nodes parent,
105
xml_nodes child,
106 7191 leinfelder
xml_nodes grandparent
107 7189 leinfelder
where doi.guid = id.guid
108
and id.docid = docs.docid
109
and id.rev = docs.rev
110
and docs.rootnodeid = nodes.rootnodeid
111
and nodes.nodeid = child.parentnodeid
112
and nodes.parentnodeid = parent.nodeid
113
and parent.parentnodeid = grandparent.nodeid
114
and nodes.nodename = 'surName'
115
and parent.nodename = 'individualName'
116
and grandparent.nodename = 'creator'
117 7191 leinfelder
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
118 7189 leinfelder
119 7191 leinfelder
-- add the first name if we have it
120
update doi_registration doi
121
set creator = child.nodedata ||' '|| creator
122
from identifier id,
123
xml_documents docs,
124
xml_nodes nodes,
125
xml_nodes parent,
126
xml_nodes child,
127
xml_nodes grandparent
128
where doi.guid = id.guid
129
and id.docid = docs.docid
130
and id.rev = docs.rev
131
and docs.rootnodeid = nodes.rootnodeid
132
and nodes.nodeid = child.parentnodeid
133
and nodes.parentnodeid = parent.nodeid
134
and parent.parentnodeid = grandparent.nodeid
135
and nodes.nodename = 'givenName'
136
and parent.nodename = 'individualName'
137
and grandparent.nodename = 'creator'
138
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
139 7189 leinfelder
140
-- update title using revisions
141
update doi_registration doi
142
set title = child.nodedata
143
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
144
where doi.guid = id.guid
145
and id.docid = docs.docid
146
and id.rev = docs.rev
147
and docs.rootnodeid = nodes.rootnodeid
148
and nodes.nodeid = child.parentnodeid
149
and nodes.parentnodeid = parent.nodeid
150
and nodes.nodename = 'title'
151
and parent.nodename = 'dataset';
152
153 7190 leinfelder
-- update pubDate using revisions
154
update doi_registration doi
155
set pub_date = child.nodedata
156
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
157
where doi.guid = id.guid
158
and id.docid = docs.docid
159
and id.rev = docs.rev
160
and docs.rootnodeid = nodes.rootnodeid
161
and nodes.nodeid = child.parentnodeid
162
and nodes.parentnodeid = parent.nodeid
163
and nodes.nodename = 'pubDate'
164
and parent.nodename = 'dataset';
165 7189 leinfelder
166 7190 leinfelder
-- update publisher using revisions
167
update doi_registration doi
168
set publisher = child.nodedata
169
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
170
where doi.guid = id.guid
171
and id.docid = docs.docid
172
and id.rev = docs.rev
173
and docs.rootnodeid = nodes.rootnodeid
174
and nodes.nodeid = child.parentnodeid
175
and nodes.parentnodeid = parent.nodeid
176
and nodes.nodename = 'publisher'
177
and parent.nodename = 'dataset';
178
179 7191 leinfelder
-- update organization using nodes
180
update doi_registration doi
181
set creator = child.nodedata
182
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
183
where doi.guid = id.guid
184
and id.docid = docs.docid
185
and id.rev = docs.rev
186
and docs.rootnodeid = nodes.rootnodeid
187
and nodes.nodeid = child.parentnodeid
188
and nodes.parentnodeid = parent.nodeid
189
and nodes.nodename = 'organizationName'
190
and parent.nodename = 'creator'
191
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
192 7190 leinfelder
193 7189 leinfelder
-- update creator using revisions
194
update doi_registration doi
195 7191 leinfelder
set creator = child.nodedata
196 7189 leinfelder
from identifier id,
197
xml_revisions docs,
198
xml_nodes_revisions nodes,
199
xml_nodes_revisions parent,
200
xml_nodes_revisions child,
201 7191 leinfelder
xml_nodes_revisions grandparent
202 7189 leinfelder
where doi.guid = id.guid
203
and id.docid = docs.docid
204
and id.rev = docs.rev
205
and docs.rootnodeid = nodes.rootnodeid
206
and nodes.nodeid = child.parentnodeid
207
and nodes.parentnodeid = parent.nodeid
208
and parent.parentnodeid = grandparent.nodeid
209
and nodes.nodename = 'surName'
210
and parent.nodename = 'individualName'
211
and grandparent.nodename = 'creator'
212 7191 leinfelder
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
213 7189 leinfelder
214 7191 leinfelder
-- add the first name if we have it
215
update doi_registration doi
216
set creator = child.nodedata ||' '|| creator
217
from identifier id,
218
xml_revisions docs,
219
xml_nodes_revisions nodes,
220
xml_nodes_revisions parent,
221
xml_nodes_revisions child,
222
xml_nodes_revisions grandparent
223
where doi.guid = id.guid
224
and id.docid = docs.docid
225
and id.rev = docs.rev
226
and docs.rootnodeid = nodes.rootnodeid
227
and nodes.nodeid = child.parentnodeid
228
and nodes.parentnodeid = parent.nodeid
229
and parent.parentnodeid = grandparent.nodeid
230
and nodes.nodename = 'givenName'
231
and parent.nodename = 'individualName'
232
and grandparent.nodename = 'creator'
233
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
234
235
-- use xml_documents for defaults that are still missing
236
update doi_registration doi
237
set creator = doc.user_owner
238
from xml_documents doc, identifier id
239
where doi.guid = id.guid
240
and id.docid = doc.docid
241
and id.rev = doc.rev
242
and doi.creator is null;
243
244
update doi_registration doi
245
set creator = doc.user_owner
246
from xml_revisions doc, identifier id
247
where doi.guid = id.guid
248
and id.docid = doc.docid
249
and id.rev = doc.rev
250
and doi.creator is null;
251
252
-- set publisher
253
update doi_registration
254
set publisher = creator
255
where publisher is null or publisher = '';
256
257 7189 leinfelder
-- clean up
258
update doi_registration
259
set title = replace(title, E'\n', ' ');
260
261 7190 leinfelder
update doi_registration
262 7193 leinfelder
set title = regexp_replace(title, E'\\s+', ' ', 'g');
263 7190 leinfelder
264
update doi_registration
265 7193 leinfelder
set title = regexp_replace(title, E'^\\s+', '', 'g');
266 7190 leinfelder
267
COPY (select ezid_account, guid as doi, url, title, creator, publisher, pub_date from doi_registration order by doi)
268 7189 leinfelder
TO '/tmp/doi_registration.csv'
269
WITH CSV HEADER;
270
271
--drop table doi_registration;