Project

General

Profile

1
-- use default values initially
2
CREATE TABLE doi_registration AS
3

    
4
select 
5
text('testing') as ezid_account,
6
doc.doctype,
7
id.guid, 
8
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url, 
9
text('Unknown') as title, 
10
text(null) as creator,
11
text(null) as publisher,
12
to_char(doc.date_created, 'YYYY') as pub_date
13
from identifier id, xml_documents doc
14
where guid like 'doi%'
15
and id.docid = doc.docid
16
and id.rev = doc.rev
17

    
18
UNION ALL
19

    
20
select 
21
text('testing') as ezid_account,
22
doc.doctype,
23
id.guid, 
24
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url, 
25
text('Unknown') as title, 
26
text(null) as creator,
27
text(null) as publisher,
28
to_char(doc.date_created, 'YYYY') as pub_date
29
from identifier id, xml_revisions doc
30
where guid like 'doi%'
31
and id.docid = doc.docid
32
and id.rev = doc.rev;
33

    
34
--update defaults
35
update doi_registration
36
set title = 'Data file'
37
where doctype = 'BIN';
38

    
39
update doi_registration
40
set title = 'Legacy EML file'
41
where doctype like '%eml%2.0.0beta%';
42

    
43
-- update title using node information
44
update doi_registration doi
45
set title = child.nodedata
46
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
47
where doi.guid = id.guid
48
and id.docid = docs.docid
49
and id.rev = docs.rev
50
and docs.rootnodeid = nodes.rootnodeid
51
and nodes.nodeid = child.parentnodeid
52
and nodes.parentnodeid = parent.nodeid
53
and nodes.nodename = 'title'
54
and parent.nodename = 'dataset';
55

    
56
-- update pubDate using nodes
57
update doi_registration doi
58
set pub_date = child.nodedata
59
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
60
where doi.guid = id.guid
61
and id.docid = docs.docid
62
and id.rev = docs.rev
63
and docs.rootnodeid = nodes.rootnodeid
64
and nodes.nodeid = child.parentnodeid
65
and nodes.parentnodeid = parent.nodeid
66
and nodes.nodename = 'pubDate'
67
and parent.nodename = 'dataset';
68

    
69
-- update publisher using nodes
70
update doi_registration doi
71
set publisher = child.nodedata
72
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
73
where doi.guid = id.guid
74
and id.docid = docs.docid
75
and id.rev = docs.rev
76
and docs.rootnodeid = nodes.rootnodeid
77
and nodes.nodeid = child.parentnodeid
78
and nodes.parentnodeid = parent.nodeid
79
and nodes.nodename = 'publisher'
80
and parent.nodename = 'dataset'
81
and child.nodetype = 'TEXT';
82

    
83
-- update organization using nodes
84
update doi_registration doi
85
set creator = child.nodedata
86
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
87
where doi.guid = id.guid
88
and id.docid = docs.docid
89
and id.rev = docs.rev
90
and docs.rootnodeid = nodes.rootnodeid
91
and nodes.nodeid = child.parentnodeid
92
and nodes.parentnodeid = parent.nodeid
93
and nodes.nodename = 'organizationName'
94
and parent.nodename = 'creator'
95
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
96

    
97
-- update creator using nodes
98
-- creator/individualName/surName
99
-- creator/individualName/givenName
100
update doi_registration doi
101
set creator = child.nodedata
102
from identifier id, 
103
xml_documents docs, 
104
xml_nodes nodes, 
105
xml_nodes parent, 
106
xml_nodes child, 
107
xml_nodes grandparent
108
where doi.guid = id.guid
109
and id.docid = docs.docid
110
and id.rev = docs.rev
111
and docs.rootnodeid = nodes.rootnodeid
112
and nodes.nodeid = child.parentnodeid
113
and nodes.parentnodeid = parent.nodeid
114
and parent.parentnodeid = grandparent.nodeid
115
and nodes.nodename = 'surName'
116
and parent.nodename = 'individualName'
117
and grandparent.nodename = 'creator'
118
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
119

    
120
-- add the first name if we have it
121
update doi_registration doi
122
set creator = child.nodedata ||' '|| creator
123
from identifier id, 
124
xml_documents docs, 
125
xml_nodes nodes, 
126
xml_nodes parent, 
127
xml_nodes child, 
128
xml_nodes grandparent
129
where doi.guid = id.guid
130
and id.docid = docs.docid
131
and id.rev = docs.rev
132
and docs.rootnodeid = nodes.rootnodeid
133
and nodes.nodeid = child.parentnodeid
134
and nodes.parentnodeid = parent.nodeid
135
and parent.parentnodeid = grandparent.nodeid
136
and nodes.nodename = 'givenName'
137
and parent.nodename = 'individualName'
138
and grandparent.nodename = 'creator'
139
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
140

    
141
-- update title using revisions
142
update doi_registration doi
143
set title = child.nodedata
144
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
145
where doi.guid = id.guid
146
and id.docid = docs.docid
147
and id.rev = docs.rev
148
and docs.rootnodeid = nodes.rootnodeid
149
and nodes.nodeid = child.parentnodeid
150
and nodes.parentnodeid = parent.nodeid
151
and nodes.nodename = 'title'
152
and parent.nodename = 'dataset';
153

    
154
-- update pubDate using revisions
155
update doi_registration doi
156
set pub_date = child.nodedata
157
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
158
where doi.guid = id.guid
159
and id.docid = docs.docid
160
and id.rev = docs.rev
161
and docs.rootnodeid = nodes.rootnodeid
162
and nodes.nodeid = child.parentnodeid
163
and nodes.parentnodeid = parent.nodeid
164
and nodes.nodename = 'pubDate'
165
and parent.nodename = 'dataset';
166

    
167
-- update publisher using revisions
168
update doi_registration doi
169
set publisher = child.nodedata
170
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
171
where doi.guid = id.guid
172
and id.docid = docs.docid
173
and id.rev = docs.rev
174
and docs.rootnodeid = nodes.rootnodeid
175
and nodes.nodeid = child.parentnodeid
176
and nodes.parentnodeid = parent.nodeid
177
and nodes.nodename = 'publisher'
178
and parent.nodename = 'dataset'
179
and child.nodetype = 'TEXT';
180

    
181
-- update organization using nodes
182
update doi_registration doi
183
set creator = child.nodedata
184
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
185
where doi.guid = id.guid
186
and id.docid = docs.docid
187
and id.rev = docs.rev
188
and docs.rootnodeid = nodes.rootnodeid
189
and nodes.nodeid = child.parentnodeid
190
and nodes.parentnodeid = parent.nodeid
191
and nodes.nodename = 'organizationName'
192
and parent.nodename = 'creator'
193
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
194

    
195
-- update creator using revisions
196
update doi_registration doi
197
set creator = child.nodedata
198
from identifier id, 
199
xml_revisions docs, 
200
xml_nodes_revisions nodes, 
201
xml_nodes_revisions parent, 
202
xml_nodes_revisions child, 
203
xml_nodes_revisions grandparent
204
where doi.guid = id.guid
205
and id.docid = docs.docid
206
and id.rev = docs.rev
207
and docs.rootnodeid = nodes.rootnodeid
208
and nodes.nodeid = child.parentnodeid
209
and nodes.parentnodeid = parent.nodeid
210
and parent.parentnodeid = grandparent.nodeid
211
and nodes.nodename = 'surName'
212
and parent.nodename = 'individualName'
213
and grandparent.nodename = 'creator'
214
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
215

    
216
-- add the first name if we have it
217
update doi_registration doi
218
set creator = child.nodedata ||' '|| creator
219
from identifier id, 
220
xml_revisions docs, 
221
xml_nodes_revisions nodes, 
222
xml_nodes_revisions parent, 
223
xml_nodes_revisions child, 
224
xml_nodes_revisions grandparent
225
where doi.guid = id.guid
226
and id.docid = docs.docid
227
and id.rev = docs.rev
228
and docs.rootnodeid = nodes.rootnodeid
229
and nodes.nodeid = child.parentnodeid
230
and nodes.parentnodeid = parent.nodeid
231
and parent.parentnodeid = grandparent.nodeid
232
and nodes.nodename = 'givenName'
233
and parent.nodename = 'individualName'
234
and grandparent.nodename = 'creator'
235
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
236

    
237
-- use xml_documents for defaults that are still missing 
238
update doi_registration doi
239
set creator = doc.user_owner
240
from xml_documents doc, identifier id
241
where doi.guid = id.guid
242
and id.docid = doc.docid
243
and id.rev = doc.rev
244
and doi.creator is null;
245

    
246
update doi_registration doi
247
set creator = doc.user_owner
248
from xml_revisions doc, identifier id
249
where doi.guid = id.guid
250
and id.docid = doc.docid
251
and id.rev = doc.rev
252
and doi.creator is null;
253

    
254
-- set publisher
255
update doi_registration
256
set publisher = creator
257
where publisher is null or publisher = '';
258

    
259
-- clean up
260
update doi_registration
261
set title = replace(title, E'\n', ' ');
262

    
263
update doi_registration
264
set title = regexp_replace(title, E'\\s+', ' ', 'g');
265

    
266
update doi_registration
267
set title = regexp_replace(title, E'^\\s+', '', 'g');
268

    
269
update doi_registration
270
set ezid_account = 'KNB'
271
where guid like 'doi:10.5063%';
272
--16988
273

    
274
update doi_registration
275
set ezid_account = 'PISCO'
276
where guid like 'doi:10.6085%';
277
--98078
278

    
279
update doi_registration
280
set ezid_account = 'LTER'
281
where guid like 'doi:10.6073%';
282
--49482
283

    
284
/**
285
 * Additional modifications, July 5th, 2012
286
 */
287
ALTER TABLE doi_registration
288
ADD COLUMN resourceType text,
289
ADD COLUMN objectFormat text,
290
ADD COLUMN obsoletedBy text,
291
ADD COLUMN obsoletes text,
292
ADD COLUMN resourceMapId text,
293
ADD COLUMN resourceMapLocation text,
294
ADD COLUMN access text;
295

    
296
-- update access values
297
update doi_registration
298
set access = 'protected';
299

    
300
update doi_registration doi
301
set access = 'public'
302
from xml_access a
303
where doi.guid = a.guid
304
and a.principal_name = 'public'
305
and a.permission >= '4'
306
and a.perm_type = 'allow';
307

    
308
-- update the objectFormat from xml_documents
309
update doi_registration doi
310
set objectFormat = doc.doctype
311
from identifier id, xml_documents doc
312
where doi.guid = id.guid
313
and id.docid = doc.docid;
314

    
315
-- update the objectFormat from xml_revisions
316
update doi_registration doi
317
set objectFormat = rev.doctype
318
from identifier id, xml_revisions rev
319
where doi.guid = id.guid
320
and id.docid = rev.docid;
321

    
322
--update resourceType
323
update doi_registration
324
set resourceType = 'Dataset/METADATA';
325

    
326
update doi_registration
327
set resourceType = 'Dataset/DATA'
328
where objectFormat = 'BIN';
329

    
330
-- update the objectFormat from SM table (will be subset)
331
update doi_registration doi
332
set objectFormat = sm.object_format
333
from systemMetadata sm
334
where doi.guid = sm.guid;
335
--16938
336

    
337
--update revision history from SM
338
update doi_registration doi
339
set obsoletes = sm.obsoletes,
340
obsoletedBy = sm.obsoleted_by
341
from systemMetadata sm
342
where doi.guid = sm.guid;
343

    
344
/** use plain old revision history **/
345
-- update obsoletedby
346
update doi_registration doi
347
set obsoletedBy = newer.guid
348
from identifier id, identifier newer
349
where doi.guid = id.guid
350
and id.docid = newer.docid
351
and newer.rev = (select min(next.rev) from identifier next where next.docid = id.docid and next.rev > id.rev);
352

    
353
-- update the obsolets
354
update doi_registration doi
355
set obsoletes = older.guid
356
from identifier id, identifier older
357
where doi.guid = id.guid
358
and id.docid = older.docid
359
and older.rev = (select max(prev.rev) from identifier prev where prev.docid = id.docid and prev.rev < id.rev);
360

    
361
/**
362
select doi.guid, older.guid as obsoletes 
363
from doi_registration doi, identifier id, identifier older
364
where doi.guid = id.guid
365
and id.docid = older.docid
366
and older.rev = (select max(prev.rev) from identifier prev where prev.docid = id.docid and prev.rev < id.rev)
367
and doi.guid like 'doi:10.5063/AA/ABS.4%';
368
**/
369

    
370

    
371
-- update resourceMap pointer
372
-- NOTE: some of these maps might not actually exist
373
update doi_registration doi
374
set resourceMapId = 'resourceMap_'||id.docid||'.'||id.rev
375
from identifier id
376
where doi.guid = id.guid
377
and id.docid = rev.docid
378
and objectFormat like 'eml%';
379

    
380
--update the resourcemapid for described data
381
CREATE TABLE ecogrid_docids AS
382
select distinct id.docid, id.rev, id.guid, substring(xpi.nodedata from 'ecogrid://knb/(.*)$') as data_docid
383
from xml_path_index xpi, identifier id, xml_documents xmld
384
where xpi.docid = xmld.docid 
385
and xmld.docid = id.docid
386
and xmld.rev = id.rev
387
and xpi.path like 'dataset/%/physical/distribution/online/url' 
388
and xpi.nodedata like 'ecogrid%'
389
and id.guid in (select guid from doi_registration);
390

    
391
-- include revisions for described data
392
INSERT INTO ecogrid_docids
393
select distinct id.docid, id.rev, id.guid, substring(child.nodedata from 'ecogrid://knb/(.*)$') as data_docid
394
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
395
where id.docid = docs.docid
396
and id.rev = docs.rev
397
and docs.rootnodeid = nodes.rootnodeid
398
and nodes.nodeid = child.parentnodeid
399
and nodes.parentnodeid = parent.nodeid
400
and nodes.nodename = 'url'
401
and parent.nodename = 'online'
402
and child.nodedata like 'ecogrid%'
403
and child.nodetype = 'TEXT'
404
and id.guid in (select guid from doi_registration);
405

    
406
-- Set the resource map for the data files (NOTE: some of thee maps might not actually exist on the system depending on how successful the ORE generation was)
407
update doi_registration doi
408
set resourceMapId = 'resourceMap_'||eco.docid||'.'||eco.rev
409
from identifier id, ecogrid_docids eco
410
where doi.guid = id.guid
411
and id.docid||'.'||id.rev = eco.data_docid;
412

    
413
-- set the resource map id for the metadata file that did most of the packaging work!
414
update doi_registration doi
415
set resourceMapId = 'resourceMap_'||eco.docid||'.'||eco.rev
416
from ecogrid_docids eco
417
where doi.guid = eco.guid;
418

    
419
select count(*)
420
from doi_registration
421
where resourceMapId is null;
422

    
423
--update the resource map location
424
update doi_registration
425
set resourceMapLocation = 'https://cn.dataone.org/cn/v1/resolve/' || resourceMapId
426
where resourceMapId is not null;
427

    
428
-- fix null values (should not have to have this now, but it will not hurt)
429
update doi_registration doi
430
set creator = user_owner
431
from identifier id, xml_documents docs
432
where doi.guid = id.guid
433
and id.docid = docs.docid
434
and id.rev= docs.rev
435
and creator is null;
436

    
437
update doi_registration doi
438
set publisher = user_owner
439
from identifier id, xml_documents docs
440
where doi.guid = id.guid
441
and id.docid = docs.docid
442
and id.rev= docs.rev
443
and publisher is null;
444

    
445
update doi_registration doi
446
set creator = user_owner
447
from identifier id, xml_revisions docs
448
where doi.guid = id.guid
449
and id.docid = docs.docid
450
and id.rev= docs.rev
451
and creator is null;
452

    
453
update doi_registration doi
454
set publisher = user_owner
455
from identifier id, xml_revisions docs
456
where doi.guid = id.guid
457
and id.docid = docs.docid
458
and id.rev= docs.rev
459
and publisher is null;
460

    
461
--update creator usinig LDAP lookup
462
CREATE TABLE ecoinfo_dn (dn text, givenName text, sn text);
463
--TRUNCATE TABLE ecoinfo_dn;
464
COPY ecoinfo_dn FROM '/tmp/ecoinfo_dn.csv' WITH CSV HEADER;
465

    
466
update doi_registration
467
set creator = 
468
case when givenName is null then
469
	sn
470
else 
471
	givenName|| ' ' || sn
472
end
473
from ecoinfo_dn
474
where trim(both from lower(creator)) = lower(dn);
475

    
476
update doi_registration
477
set publisher = 
478
case when givenName is null then
479
	sn
480
else 
481
	givenName|| ' ' || sn
482
end
483
from ecoinfo_dn
484
where trim(both from lower(publisher)) = lower(dn);
485

    
486
select creator, count(*) as cnt
487
from doi_registration
488
where creator like 'uid=%'
489
group by creator
490
order by cnt desc;
491

    
492
-- update the data files with owner/publisher from EML entry
493
update doi_registration doi
494
set creator = meta.creator,
495
publisher = meta.publisher
496
from doi_registration meta, identifier id, ecogrid_docids eco
497
where doi.guid = id.guid
498
and id.docid||'.'||id.rev = eco.data_docid
499
and eco.guid = meta.guid;
500

    
501
-- copy to the external file
502
COPY 
503
(select ezid_account, 
504
guid as dc_identifier, 
505
url as datacite_url, 
506
title as dc_title, 
507
creator as dc_creator, 
508
publisher as dc_publisher, 
509
pub_date as datacite_publicationYear,
510
resourceType,
511
objectFormat,
512
obsoletedBy,
513
obsoletes,
514
resourceMapId,
515
resourceMapLocation,
516
access
517
from doi_registration 
518
order by dc_identifier) 
519
TO '/tmp/doi_registration.csv'
520
WITH CSV HEADER;
521
--164548
522

    
523
--drop table doi_registration;
524
--drop table ecogrid_docids;
525
--drop table ecoinfo_dn;
526

    
(10-10/63)