Project

General

Profile

1 7190 leinfelder
-- use default values initially
2 7191 leinfelder
CREATE TABLE doi_registration AS
3
4 7189 leinfelder
select
5
text('testing') as ezid_account,
6
doc.doctype,
7
id.guid,
8 7284 leinfelder
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url,
9 7190 leinfelder
text('Unknown') as title,
10 7191 leinfelder
text(null) as creator,
11
text(null) as publisher,
12 7190 leinfelder
to_char(doc.date_created, 'YYYY') as pub_date
13 7189 leinfelder
from identifier id, xml_documents doc
14
where guid like 'doi%'
15 7191 leinfelder
and id.docid = doc.docid
16
and id.rev = doc.rev
17 7189 leinfelder
18 7191 leinfelder
UNION ALL
19
20
select
21
text('testing') as ezid_account,
22
doc.doctype,
23
id.guid,
24 7284 leinfelder
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url,
25 7191 leinfelder
text('Unknown') as title,
26
text(null) as creator,
27
text(null) as publisher,
28
to_char(doc.date_created, 'YYYY') as pub_date
29
from identifier id, xml_revisions doc
30
where guid like 'doi%'
31
and id.docid = doc.docid
32
and id.rev = doc.rev;
33
34 7190 leinfelder
--update defaults
35
update doi_registration
36
set title = 'Data file'
37
where doctype = 'BIN';
38
39
update doi_registration
40
set title = 'Legacy EML file'
41
where doctype like '%eml%2.0.0beta%';
42
43
-- update title using node information
44 7189 leinfelder
update doi_registration doi
45
set title = child.nodedata
46
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
47
where doi.guid = id.guid
48
and id.docid = docs.docid
49
and id.rev = docs.rev
50
and docs.rootnodeid = nodes.rootnodeid
51
and nodes.nodeid = child.parentnodeid
52
and nodes.parentnodeid = parent.nodeid
53
and nodes.nodename = 'title'
54
and parent.nodename = 'dataset';
55
56 7191 leinfelder
-- update pubDate using nodes
57 7190 leinfelder
update doi_registration doi
58
set pub_date = child.nodedata
59
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
60
where doi.guid = id.guid
61
and id.docid = docs.docid
62
and id.rev = docs.rev
63
and docs.rootnodeid = nodes.rootnodeid
64
and nodes.nodeid = child.parentnodeid
65
and nodes.parentnodeid = parent.nodeid
66
and nodes.nodename = 'pubDate'
67
and parent.nodename = 'dataset';
68
69 7191 leinfelder
-- update publisher using nodes
70 7190 leinfelder
update doi_registration doi
71
set publisher = child.nodedata
72
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
73
where doi.guid = id.guid
74
and id.docid = docs.docid
75
and id.rev = docs.rev
76
and docs.rootnodeid = nodes.rootnodeid
77
and nodes.nodeid = child.parentnodeid
78
and nodes.parentnodeid = parent.nodeid
79
and nodes.nodename = 'publisher'
80 7305 leinfelder
and parent.nodename = 'dataset'
81
and child.nodetype = 'TEXT';
82 7190 leinfelder
83 7191 leinfelder
-- update organization using nodes
84
update doi_registration doi
85
set creator = child.nodedata
86
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
87
where doi.guid = id.guid
88
and id.docid = docs.docid
89
and id.rev = docs.rev
90
and docs.rootnodeid = nodes.rootnodeid
91
and nodes.nodeid = child.parentnodeid
92
and nodes.parentnodeid = parent.nodeid
93
and nodes.nodename = 'organizationName'
94
and parent.nodename = 'creator'
95
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
96
97 7189 leinfelder
-- update creator using nodes
98
-- creator/individualName/surName
99
-- creator/individualName/givenName
100
update doi_registration doi
101 7191 leinfelder
set creator = child.nodedata
102 7189 leinfelder
from identifier id,
103
xml_documents docs,
104
xml_nodes nodes,
105
xml_nodes parent,
106
xml_nodes child,
107 7191 leinfelder
xml_nodes grandparent
108 7189 leinfelder
where doi.guid = id.guid
109
and id.docid = docs.docid
110
and id.rev = docs.rev
111
and docs.rootnodeid = nodes.rootnodeid
112
and nodes.nodeid = child.parentnodeid
113
and nodes.parentnodeid = parent.nodeid
114
and parent.parentnodeid = grandparent.nodeid
115
and nodes.nodename = 'surName'
116
and parent.nodename = 'individualName'
117
and grandparent.nodename = 'creator'
118 7191 leinfelder
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
119 7189 leinfelder
120 7191 leinfelder
-- add the first name if we have it
121
update doi_registration doi
122
set creator = child.nodedata ||' '|| creator
123
from identifier id,
124
xml_documents docs,
125
xml_nodes nodes,
126
xml_nodes parent,
127
xml_nodes child,
128
xml_nodes grandparent
129
where doi.guid = id.guid
130
and id.docid = docs.docid
131
and id.rev = docs.rev
132
and docs.rootnodeid = nodes.rootnodeid
133
and nodes.nodeid = child.parentnodeid
134
and nodes.parentnodeid = parent.nodeid
135
and parent.parentnodeid = grandparent.nodeid
136
and nodes.nodename = 'givenName'
137
and parent.nodename = 'individualName'
138
and grandparent.nodename = 'creator'
139
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
140 7189 leinfelder
141
-- update title using revisions
142
update doi_registration doi
143
set title = child.nodedata
144
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
145
where doi.guid = id.guid
146
and id.docid = docs.docid
147
and id.rev = docs.rev
148
and docs.rootnodeid = nodes.rootnodeid
149
and nodes.nodeid = child.parentnodeid
150
and nodes.parentnodeid = parent.nodeid
151
and nodes.nodename = 'title'
152
and parent.nodename = 'dataset';
153
154 7190 leinfelder
-- update pubDate using revisions
155
update doi_registration doi
156
set pub_date = child.nodedata
157
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
158
where doi.guid = id.guid
159
and id.docid = docs.docid
160
and id.rev = docs.rev
161
and docs.rootnodeid = nodes.rootnodeid
162
and nodes.nodeid = child.parentnodeid
163
and nodes.parentnodeid = parent.nodeid
164
and nodes.nodename = 'pubDate'
165
and parent.nodename = 'dataset';
166 7189 leinfelder
167 7190 leinfelder
-- update publisher using revisions
168
update doi_registration doi
169
set publisher = child.nodedata
170
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
171
where doi.guid = id.guid
172
and id.docid = docs.docid
173
and id.rev = docs.rev
174
and docs.rootnodeid = nodes.rootnodeid
175
and nodes.nodeid = child.parentnodeid
176
and nodes.parentnodeid = parent.nodeid
177
and nodes.nodename = 'publisher'
178 7305 leinfelder
and parent.nodename = 'dataset'
179
and child.nodetype = 'TEXT';
180 7190 leinfelder
181 7191 leinfelder
-- update organization using nodes
182
update doi_registration doi
183
set creator = child.nodedata
184
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
185
where doi.guid = id.guid
186
and id.docid = docs.docid
187
and id.rev = docs.rev
188
and docs.rootnodeid = nodes.rootnodeid
189
and nodes.nodeid = child.parentnodeid
190
and nodes.parentnodeid = parent.nodeid
191
and nodes.nodename = 'organizationName'
192
and parent.nodename = 'creator'
193
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
194 7190 leinfelder
195 7189 leinfelder
-- update creator using revisions
196
update doi_registration doi
197 7191 leinfelder
set creator = child.nodedata
198 7189 leinfelder
from identifier id,
199
xml_revisions docs,
200
xml_nodes_revisions nodes,
201
xml_nodes_revisions parent,
202
xml_nodes_revisions child,
203 7191 leinfelder
xml_nodes_revisions grandparent
204 7189 leinfelder
where doi.guid = id.guid
205
and id.docid = docs.docid
206
and id.rev = docs.rev
207
and docs.rootnodeid = nodes.rootnodeid
208
and nodes.nodeid = child.parentnodeid
209
and nodes.parentnodeid = parent.nodeid
210
and parent.parentnodeid = grandparent.nodeid
211
and nodes.nodename = 'surName'
212
and parent.nodename = 'individualName'
213
and grandparent.nodename = 'creator'
214 7191 leinfelder
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
215 7189 leinfelder
216 7191 leinfelder
-- add the first name if we have it
217
update doi_registration doi
218
set creator = child.nodedata ||' '|| creator
219
from identifier id,
220
xml_revisions docs,
221
xml_nodes_revisions nodes,
222
xml_nodes_revisions parent,
223
xml_nodes_revisions child,
224
xml_nodes_revisions grandparent
225
where doi.guid = id.guid
226
and id.docid = docs.docid
227
and id.rev = docs.rev
228
and docs.rootnodeid = nodes.rootnodeid
229
and nodes.nodeid = child.parentnodeid
230
and nodes.parentnodeid = parent.nodeid
231
and parent.parentnodeid = grandparent.nodeid
232
and nodes.nodename = 'givenName'
233
and parent.nodename = 'individualName'
234
and grandparent.nodename = 'creator'
235
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
236
237
-- use xml_documents for defaults that are still missing
238
update doi_registration doi
239
set creator = doc.user_owner
240
from xml_documents doc, identifier id
241
where doi.guid = id.guid
242
and id.docid = doc.docid
243
and id.rev = doc.rev
244
and doi.creator is null;
245
246
update doi_registration doi
247
set creator = doc.user_owner
248
from xml_revisions doc, identifier id
249
where doi.guid = id.guid
250
and id.docid = doc.docid
251
and id.rev = doc.rev
252
and doi.creator is null;
253
254
-- set publisher
255
update doi_registration
256
set publisher = creator
257
where publisher is null or publisher = '';
258
259 7189 leinfelder
-- clean up
260
update doi_registration
261
set title = replace(title, E'\n', ' ');
262
263 7190 leinfelder
update doi_registration
264 7193 leinfelder
set title = regexp_replace(title, E'\\s+', ' ', 'g');
265 7190 leinfelder
266
update doi_registration
267 7193 leinfelder
set title = regexp_replace(title, E'^\\s+', '', 'g');
268 7190 leinfelder
269 7288 leinfelder
update doi_registration
270
set ezid_account = 'KNB'
271
where guid like 'doi:10.5063%';
272
--16988
273
274
update doi_registration
275
set ezid_account = 'PISCO'
276
where guid like 'doi:10.6085%';
277
--98078
278
279
update doi_registration
280
set ezid_account = 'LTER'
281
where guid like 'doi:10.6073%';
282
--49482
283
284 7290 leinfelder
/**
285
 * Additional modifications, July 5th, 2012
286
 */
287
ALTER TABLE doi_registration
288
ADD COLUMN resourceType text,
289
ADD COLUMN objectFormat text,
290
ADD COLUMN obsoletedBy text,
291
ADD COLUMN obsoletes text,
292
ADD COLUMN resourceMapId text,
293
ADD COLUMN resourceMapLocation text,
294
ADD COLUMN access text;
295
296
-- update access values
297
update doi_registration
298
set access = 'protected';
299
300
update doi_registration doi
301
set access = 'public'
302
from xml_access a
303
where doi.guid = a.guid
304
and a.principal_name = 'public'
305
and a.permission >= '4'
306
and a.perm_type = 'allow';
307
308
-- update the objectFormat from xml_documents
309
update doi_registration doi
310
set objectFormat = doc.doctype
311
from identifier id, xml_documents doc
312
where doi.guid = id.guid
313
and id.docid = doc.docid;
314
315
-- update the objectFormat from xml_revisions
316
update doi_registration doi
317
set objectFormat = rev.doctype
318
from identifier id, xml_revisions rev
319
where doi.guid = id.guid
320
and id.docid = rev.docid;
321
322
--update resourceType
323
update doi_registration
324
set resourceType = 'Dataset/METADATA';
325
326
update doi_registration
327
set resourceType = 'Dataset/DATA'
328
where objectFormat = 'BIN';
329
330
-- update the objectFormat from SM table (will be subset)
331
update doi_registration doi
332
set objectFormat = sm.object_format
333
from systemMetadata sm
334
where doi.guid = sm.guid;
335
--16938
336
337
--update revision history from SM
338
update doi_registration doi
339
set obsoletes = sm.obsoletes,
340
obsoletedBy = sm.obsoleted_by
341
from systemMetadata sm
342
where doi.guid = sm.guid;
343
344 7303 leinfelder
/** use plain old revision history **/
345
-- update obsoletedby
346
update doi_registration doi
347
set obsoletedBy = newer.guid
348
from identifier id, identifier newer
349
where doi.guid = id.guid
350
and id.docid = newer.docid
351
and newer.rev = (select min(next.rev) from identifier next where next.docid = id.docid and next.rev > id.rev);
352
353
-- update the obsolets
354
update doi_registration doi
355
set obsoletes = older.guid
356
from identifier id, identifier older
357
where doi.guid = id.guid
358
and id.docid = older.docid
359
and older.rev = (select max(prev.rev) from identifier prev where prev.docid = id.docid and prev.rev < id.rev);
360
361
/**
362
select doi.guid, older.guid as obsoletes
363
from doi_registration doi, identifier id, identifier older
364
where doi.guid = id.guid
365
and id.docid = older.docid
366
and older.rev = (select max(prev.rev) from identifier prev where prev.docid = id.docid and prev.rev < id.rev)
367
and doi.guid like 'doi:10.5063/AA/ABS.4%';
368
**/
369
370
371
-- update resourceMap pointer
372 7290 leinfelder
-- NOTE: some of these maps might not actually exist
373
update doi_registration doi
374
set resourceMapId = 'resourceMap_'||id.docid||'.'||id.rev
375
from identifier id
376
where doi.guid = id.guid
377
and id.docid = rev.docid
378
and objectFormat like 'eml%';
379
380
--update the resourcemapid for described data
381
CREATE TABLE ecogrid_docids AS
382
select distinct id.docid, id.rev, id.guid, substring(xpi.nodedata from 'ecogrid://knb/(.*)$') as data_docid
383
from xml_path_index xpi, identifier id, xml_documents xmld
384
where xpi.docid = xmld.docid
385
and xmld.docid = id.docid
386
and xmld.rev = id.rev
387
and xpi.path like 'dataset/%/physical/distribution/online/url'
388 7303 leinfelder
and xpi.nodedata like 'ecogrid%'
389
and id.guid in (select guid from doi_registration);
390 7290 leinfelder
391 7303 leinfelder
-- include revisions for described data
392
INSERT INTO ecogrid_docids
393
select distinct id.docid, id.rev, id.guid, substring(child.nodedata from 'ecogrid://knb/(.*)$') as data_docid
394
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
395
where id.docid = docs.docid
396
and id.rev = docs.rev
397
and docs.rootnodeid = nodes.rootnodeid
398
and nodes.nodeid = child.parentnodeid
399
and nodes.parentnodeid = parent.nodeid
400
and nodes.nodename = 'url'
401
and parent.nodename = 'online'
402
and child.nodedata like 'ecogrid%'
403
and child.nodetype = 'TEXT'
404
and id.guid in (select guid from doi_registration);
405 7290 leinfelder
406 7321 leinfelder
-- Set the resource map for the data files (NOTE: some of thee maps might not actually exist on the system depending on how successful the ORE generation was)
407 7290 leinfelder
update doi_registration doi
408
set resourceMapId = 'resourceMap_'||eco.docid||'.'||eco.rev
409
from identifier id, ecogrid_docids eco
410
where doi.guid = id.guid
411
and id.docid||'.'||id.rev = eco.data_docid;
412
413 7321 leinfelder
-- set the resource map id for the metadata file that did most of the packaging work!
414
update doi_registration doi
415
set resourceMapId = 'resourceMap_'||eco.docid||'.'||eco.rev
416
from ecogrid_docids eco
417
where doi.guid = eco.guid;
418
419 7303 leinfelder
select count(*)
420
from doi_registration
421
where resourceMapId is null;
422
423 7290 leinfelder
--update the resource map location
424
update doi_registration
425
set resourceMapLocation = 'https://cn.dataone.org/cn/v1/resolve/' || resourceMapId
426
where resourceMapId is not null;
427
428 7306 leinfelder
-- fix null values (should not have to have this now, but it will not hurt)
429
update doi_registration doi
430
set creator = user_owner
431
from identifier id, xml_documents docs
432
where doi.guid = id.guid
433
and id.docid = docs.docid
434
and id.rev= docs.rev
435
and creator is null;
436
437
update doi_registration doi
438
set publisher = user_owner
439
from identifier id, xml_documents docs
440
where doi.guid = id.guid
441
and id.docid = docs.docid
442
and id.rev= docs.rev
443
and publisher is null;
444
445
update doi_registration doi
446
set creator = user_owner
447
from identifier id, xml_revisions docs
448
where doi.guid = id.guid
449
and id.docid = docs.docid
450
and id.rev= docs.rev
451
and creator is null;
452
453
update doi_registration doi
454
set publisher = user_owner
455
from identifier id, xml_revisions docs
456
where doi.guid = id.guid
457
and id.docid = docs.docid
458
and id.rev= docs.rev
459
and publisher is null;
460
461 7290 leinfelder
--update creator usinig LDAP lookup
462 7296 leinfelder
CREATE TABLE ecoinfo_dn (dn text, givenName text, sn text);
463 7320 leinfelder
--TRUNCATE TABLE ecoinfo_dn;
464 7296 leinfelder
COPY ecoinfo_dn FROM '/tmp/ecoinfo_dn.csv' WITH CSV HEADER;
465 7290 leinfelder
466 7296 leinfelder
update doi_registration
467 7306 leinfelder
set creator =
468
case when givenName is null then
469
	sn
470
else
471
	givenName|| ' ' || sn
472
end
473 7296 leinfelder
from ecoinfo_dn
474 7320 leinfelder
where trim(both from lower(creator)) = lower(dn);
475 7296 leinfelder
476
update doi_registration
477 7306 leinfelder
set publisher =
478
case when givenName is null then
479
	sn
480
else
481
	givenName|| ' ' || sn
482
end
483 7296 leinfelder
from ecoinfo_dn
484 7320 leinfelder
where trim(both from lower(publisher)) = lower(dn);
485 7296 leinfelder
486
select creator, count(*) as cnt
487
from doi_registration
488
where creator like 'uid=%'
489
group by creator
490
order by cnt desc;
491
492 7320 leinfelder
-- update the data files with owner/publisher from EML entry
493
update doi_registration doi
494
set creator = meta.creator,
495
publisher = meta.publisher
496
from doi_registration meta, identifier id, ecogrid_docids eco
497
where doi.guid = id.guid
498
and id.docid||'.'||id.rev = eco.data_docid
499
and eco.guid = meta.guid;
500
501 7290 leinfelder
-- copy to the external file
502
COPY
503
(select ezid_account,
504
guid as dc_identifier,
505
url as datacite_url,
506
title as dc_title,
507
creator as dc_creator,
508
publisher as dc_publisher,
509
pub_date as datacite_publicationYear,
510
resourceType,
511
objectFormat,
512
obsoletedBy,
513
obsoletes,
514
resourceMapId,
515
resourceMapLocation,
516
access
517
from doi_registration
518 7296 leinfelder
order by dc_identifier)
519 7189 leinfelder
TO '/tmp/doi_registration.csv'
520
WITH CSV HEADER;
521 7288 leinfelder
--164548
522 7189 leinfelder
523
--drop table doi_registration;
524 7290 leinfelder
--drop table ecogrid_docids;
525 7296 leinfelder
--drop table ecoinfo_dn;