Project

General

Profile

1 7190 leinfelder
-- use default values initially
2 7191 leinfelder
CREATE TABLE doi_registration AS
3
4 7189 leinfelder
select
5
text('testing') as ezid_account,
6
doc.doctype,
7
id.guid,
8 7284 leinfelder
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url,
9 7190 leinfelder
text('Unknown') as title,
10 7191 leinfelder
text(null) as creator,
11
text(null) as publisher,
12 7190 leinfelder
to_char(doc.date_created, 'YYYY') as pub_date
13 7189 leinfelder
from identifier id, xml_documents doc
14
where guid like 'doi%'
15 7191 leinfelder
and id.docid = doc.docid
16
and id.rev = doc.rev
17 7189 leinfelder
18 7191 leinfelder
UNION ALL
19
20
select
21
text('testing') as ezid_account,
22
doc.doctype,
23
id.guid,
24 7284 leinfelder
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url,
25 7191 leinfelder
text('Unknown') as title,
26
text(null) as creator,
27
text(null) as publisher,
28
to_char(doc.date_created, 'YYYY') as pub_date
29
from identifier id, xml_revisions doc
30
where guid like 'doi%'
31
and id.docid = doc.docid
32
and id.rev = doc.rev;
33
34 7190 leinfelder
--update defaults
35
update doi_registration
36
set title = 'Data file'
37
where doctype = 'BIN';
38
39
update doi_registration
40
set title = 'Legacy EML file'
41
where doctype like '%eml%2.0.0beta%';
42
43
-- update title using node information
44 7189 leinfelder
update doi_registration doi
45
set title = child.nodedata
46
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
47
where doi.guid = id.guid
48
and id.docid = docs.docid
49
and id.rev = docs.rev
50
and docs.rootnodeid = nodes.rootnodeid
51
and nodes.nodeid = child.parentnodeid
52
and nodes.parentnodeid = parent.nodeid
53
and nodes.nodename = 'title'
54
and parent.nodename = 'dataset';
55
56 7191 leinfelder
-- update pubDate using nodes
57 7190 leinfelder
update doi_registration doi
58
set pub_date = child.nodedata
59
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
60
where doi.guid = id.guid
61
and id.docid = docs.docid
62
and id.rev = docs.rev
63
and docs.rootnodeid = nodes.rootnodeid
64
and nodes.nodeid = child.parentnodeid
65
and nodes.parentnodeid = parent.nodeid
66
and nodes.nodename = 'pubDate'
67
and parent.nodename = 'dataset';
68
69 7191 leinfelder
-- update publisher using nodes
70 7190 leinfelder
update doi_registration doi
71
set publisher = child.nodedata
72
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
73
where doi.guid = id.guid
74
and id.docid = docs.docid
75
and id.rev = docs.rev
76
and docs.rootnodeid = nodes.rootnodeid
77
and nodes.nodeid = child.parentnodeid
78
and nodes.parentnodeid = parent.nodeid
79
and nodes.nodename = 'publisher'
80
and parent.nodename = 'dataset';
81
82 7191 leinfelder
-- update organization using nodes
83
update doi_registration doi
84
set creator = child.nodedata
85
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
86
where doi.guid = id.guid
87
and id.docid = docs.docid
88
and id.rev = docs.rev
89
and docs.rootnodeid = nodes.rootnodeid
90
and nodes.nodeid = child.parentnodeid
91
and nodes.parentnodeid = parent.nodeid
92
and nodes.nodename = 'organizationName'
93
and parent.nodename = 'creator'
94
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
95
96 7189 leinfelder
-- update creator using nodes
97
-- creator/individualName/surName
98
-- creator/individualName/givenName
99
update doi_registration doi
100 7191 leinfelder
set creator = child.nodedata
101 7189 leinfelder
from identifier id,
102
xml_documents docs,
103
xml_nodes nodes,
104
xml_nodes parent,
105
xml_nodes child,
106 7191 leinfelder
xml_nodes grandparent
107 7189 leinfelder
where doi.guid = id.guid
108
and id.docid = docs.docid
109
and id.rev = docs.rev
110
and docs.rootnodeid = nodes.rootnodeid
111
and nodes.nodeid = child.parentnodeid
112
and nodes.parentnodeid = parent.nodeid
113
and parent.parentnodeid = grandparent.nodeid
114
and nodes.nodename = 'surName'
115
and parent.nodename = 'individualName'
116
and grandparent.nodename = 'creator'
117 7191 leinfelder
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
118 7189 leinfelder
119 7191 leinfelder
-- add the first name if we have it
120
update doi_registration doi
121
set creator = child.nodedata ||' '|| creator
122
from identifier id,
123
xml_documents docs,
124
xml_nodes nodes,
125
xml_nodes parent,
126
xml_nodes child,
127
xml_nodes grandparent
128
where doi.guid = id.guid
129
and id.docid = docs.docid
130
and id.rev = docs.rev
131
and docs.rootnodeid = nodes.rootnodeid
132
and nodes.nodeid = child.parentnodeid
133
and nodes.parentnodeid = parent.nodeid
134
and parent.parentnodeid = grandparent.nodeid
135
and nodes.nodename = 'givenName'
136
and parent.nodename = 'individualName'
137
and grandparent.nodename = 'creator'
138
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
139 7189 leinfelder
140
-- update title using revisions
141
update doi_registration doi
142
set title = child.nodedata
143
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
144
where doi.guid = id.guid
145
and id.docid = docs.docid
146
and id.rev = docs.rev
147
and docs.rootnodeid = nodes.rootnodeid
148
and nodes.nodeid = child.parentnodeid
149
and nodes.parentnodeid = parent.nodeid
150
and nodes.nodename = 'title'
151
and parent.nodename = 'dataset';
152
153 7190 leinfelder
-- update pubDate using revisions
154
update doi_registration doi
155
set pub_date = child.nodedata
156
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
157
where doi.guid = id.guid
158
and id.docid = docs.docid
159
and id.rev = docs.rev
160
and docs.rootnodeid = nodes.rootnodeid
161
and nodes.nodeid = child.parentnodeid
162
and nodes.parentnodeid = parent.nodeid
163
and nodes.nodename = 'pubDate'
164
and parent.nodename = 'dataset';
165 7189 leinfelder
166 7190 leinfelder
-- update publisher using revisions
167
update doi_registration doi
168
set publisher = child.nodedata
169
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
170
where doi.guid = id.guid
171
and id.docid = docs.docid
172
and id.rev = docs.rev
173
and docs.rootnodeid = nodes.rootnodeid
174
and nodes.nodeid = child.parentnodeid
175
and nodes.parentnodeid = parent.nodeid
176
and nodes.nodename = 'publisher'
177
and parent.nodename = 'dataset';
178
179 7191 leinfelder
-- update organization using nodes
180
update doi_registration doi
181
set creator = child.nodedata
182
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
183
where doi.guid = id.guid
184
and id.docid = docs.docid
185
and id.rev = docs.rev
186
and docs.rootnodeid = nodes.rootnodeid
187
and nodes.nodeid = child.parentnodeid
188
and nodes.parentnodeid = parent.nodeid
189
and nodes.nodename = 'organizationName'
190
and parent.nodename = 'creator'
191
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
192 7190 leinfelder
193 7189 leinfelder
-- update creator using revisions
194
update doi_registration doi
195 7191 leinfelder
set creator = child.nodedata
196 7189 leinfelder
from identifier id,
197
xml_revisions docs,
198
xml_nodes_revisions nodes,
199
xml_nodes_revisions parent,
200
xml_nodes_revisions child,
201 7191 leinfelder
xml_nodes_revisions grandparent
202 7189 leinfelder
where doi.guid = id.guid
203
and id.docid = docs.docid
204
and id.rev = docs.rev
205
and docs.rootnodeid = nodes.rootnodeid
206
and nodes.nodeid = child.parentnodeid
207
and nodes.parentnodeid = parent.nodeid
208
and parent.parentnodeid = grandparent.nodeid
209
and nodes.nodename = 'surName'
210
and parent.nodename = 'individualName'
211
and grandparent.nodename = 'creator'
212 7191 leinfelder
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
213 7189 leinfelder
214 7191 leinfelder
-- add the first name if we have it
215
update doi_registration doi
216
set creator = child.nodedata ||' '|| creator
217
from identifier id,
218
xml_revisions docs,
219
xml_nodes_revisions nodes,
220
xml_nodes_revisions parent,
221
xml_nodes_revisions child,
222
xml_nodes_revisions grandparent
223
where doi.guid = id.guid
224
and id.docid = docs.docid
225
and id.rev = docs.rev
226
and docs.rootnodeid = nodes.rootnodeid
227
and nodes.nodeid = child.parentnodeid
228
and nodes.parentnodeid = parent.nodeid
229
and parent.parentnodeid = grandparent.nodeid
230
and nodes.nodename = 'givenName'
231
and parent.nodename = 'individualName'
232
and grandparent.nodename = 'creator'
233
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
234
235
-- use xml_documents for defaults that are still missing
236
update doi_registration doi
237
set creator = doc.user_owner
238
from xml_documents doc, identifier id
239
where doi.guid = id.guid
240
and id.docid = doc.docid
241
and id.rev = doc.rev
242
and doi.creator is null;
243
244
update doi_registration doi
245
set creator = doc.user_owner
246
from xml_revisions doc, identifier id
247
where doi.guid = id.guid
248
and id.docid = doc.docid
249
and id.rev = doc.rev
250
and doi.creator is null;
251
252
-- set publisher
253
update doi_registration
254
set publisher = creator
255
where publisher is null or publisher = '';
256
257 7189 leinfelder
-- clean up
258
update doi_registration
259
set title = replace(title, E'\n', ' ');
260
261 7190 leinfelder
update doi_registration
262 7193 leinfelder
set title = regexp_replace(title, E'\\s+', ' ', 'g');
263 7190 leinfelder
264
update doi_registration
265 7193 leinfelder
set title = regexp_replace(title, E'^\\s+', '', 'g');
266 7190 leinfelder
267 7288 leinfelder
update doi_registration
268
set ezid_account = 'KNB'
269
where guid like 'doi:10.5063%';
270
--16988
271
272
update doi_registration
273
set ezid_account = 'PISCO'
274
where guid like 'doi:10.6085%';
275
--98078
276
277
update doi_registration
278
set ezid_account = 'LTER'
279
where guid like 'doi:10.6073%';
280
--49482
281
282 7290 leinfelder
/**
283
 * Additional modifications, July 5th, 2012
284
 */
285
ALTER TABLE doi_registration
286
ADD COLUMN resourceType text,
287
ADD COLUMN objectFormat text,
288
ADD COLUMN obsoletedBy text,
289
ADD COLUMN obsoletes text,
290
ADD COLUMN resourceMapId text,
291
ADD COLUMN resourceMapLocation text,
292
ADD COLUMN access text;
293
294
-- update access values
295
update doi_registration
296
set access = 'protected';
297
298
update doi_registration doi
299
set access = 'public'
300
from xml_access a
301
where doi.guid = a.guid
302
and a.principal_name = 'public'
303
and a.permission >= '4'
304
and a.perm_type = 'allow';
305
306
-- update the objectFormat from xml_documents
307
update doi_registration doi
308
set objectFormat = doc.doctype
309
from identifier id, xml_documents doc
310
where doi.guid = id.guid
311
and id.docid = doc.docid;
312
313
-- update the objectFormat from xml_revisions
314
update doi_registration doi
315
set objectFormat = rev.doctype
316
from identifier id, xml_revisions rev
317
where doi.guid = id.guid
318
and id.docid = rev.docid;
319
320
--update resourceType
321
update doi_registration
322
set resourceType = 'Dataset/METADATA';
323
324
update doi_registration
325
set resourceType = 'Dataset/DATA'
326
where objectFormat = 'BIN';
327
328
-- update the objectFormat from SM table (will be subset)
329
update doi_registration doi
330
set objectFormat = sm.object_format
331
from systemMetadata sm
332
where doi.guid = sm.guid;
333
--16938
334
335
--update revision history from SM
336
update doi_registration doi
337
set obsoletes = sm.obsoletes,
338
obsoletedBy = sm.obsoleted_by
339
from systemMetadata sm
340
where doi.guid = sm.guid;
341
342
--update resourceMap pointer TBD
343
-- NOTE: some of these maps might not actually exist
344
update doi_registration doi
345
set resourceMapId = 'resourceMap_'||id.docid||'.'||id.rev
346
from identifier id
347
where doi.guid = id.guid
348
and id.docid = rev.docid
349
and objectFormat like 'eml%';
350
351
--update the resourcemapid for described data
352
CREATE TABLE ecogrid_docids AS
353
354
select distinct id.docid, id.rev, id.guid, substring(xpi.nodedata from 'ecogrid://knb/(.*)$') as data_docid
355
from xml_path_index xpi, identifier id, xml_documents xmld
356
where xpi.docid = xmld.docid
357
and xmld.docid = id.docid
358
and xmld.rev = id.rev
359
and xpi.path like 'dataset/%/physical/distribution/online/url'
360
and xpi.nodedata like 'ecogrid%';
361
362
-- TODO: include revisions for described data
363
364
-- NOTE: some of thee maps might not actually exist
365
update doi_registration doi
366
set resourceMapId = 'resourceMap_'||eco.docid||'.'||eco.rev
367
from identifier id, ecogrid_docids eco
368
where doi.guid = id.guid
369
and id.docid||'.'||id.rev = eco.data_docid;
370
371
--update the resource map location
372
update doi_registration
373
set resourceMapLocation = 'https://cn.dataone.org/cn/v1/resolve/' || resourceMapId
374
where resourceMapId is not null;
375
376
--update creator usinig LDAP lookup
377 7296 leinfelder
CREATE TABLE ecoinfo_dn (dn text, givenName text, sn text);
378
COPY ecoinfo_dn FROM '/tmp/ecoinfo_dn.csv' WITH CSV HEADER;
379 7290 leinfelder
380 7296 leinfelder
update doi_registration
381
set creator = givenName|| ' ' || sn
382
from ecoinfo_dn
383
where creator = dn;
384
385
update doi_registration
386
set publisher = givenName|| ' ' || sn
387
from ecoinfo_dn
388
where publisher = dn;
389
390
select creator, count(*) as cnt
391
from doi_registration
392
where creator like 'uid=%'
393
group by creator
394
order by cnt desc;
395
396 7290 leinfelder
-- copy to the external file
397
COPY
398
(select ezid_account,
399
guid as dc_identifier,
400
url as datacite_url,
401
title as dc_title,
402
creator as dc_creator,
403
publisher as dc_publisher,
404
pub_date as datacite_publicationYear,
405
resourceType,
406
objectFormat,
407
obsoletedBy,
408
obsoletes,
409
resourceMapId,
410
resourceMapLocation,
411
access
412
from doi_registration
413 7296 leinfelder
order by dc_identifier)
414 7189 leinfelder
TO '/tmp/doi_registration.csv'
415
WITH CSV HEADER;
416 7288 leinfelder
--164548
417 7189 leinfelder
418
--drop table doi_registration;
419 7290 leinfelder
--drop table ecogrid_docids;
420 7296 leinfelder
--drop table ecoinfo_dn;