Project

General

Profile

metacat / src / doi_registration.sql @ 8556

1
-- use default values initially
2
CREATE TABLE doi_registration AS
3

    
4
select 
5
text('testing') as ezid_account,
6
doc.doctype,
7
id.guid, 
8
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url, 
9
text('Unknown') as title, 
10
text(null) as creator,
11
text(null) as publisher,
12
to_char(doc.date_created, 'YYYY') as pub_date
13
from identifier id, xml_documents doc
14
where guid like 'doi%'
15
and id.docid = doc.docid
16
and id.rev = doc.rev
17

    
18
UNION ALL
19

    
20
select 
21
text('testing') as ezid_account,
22
doc.doctype,
23
id.guid, 
24
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url, 
25
text('Unknown') as title, 
26
text(null) as creator,
27
text(null) as publisher,
28
to_char(doc.date_created, 'YYYY') as pub_date
29
from identifier id, xml_revisions doc
30
where guid like 'doi%'
31
and id.docid = doc.docid
32
and id.rev = doc.rev;
33

    
34
--update defaults
35
update doi_registration
36
set title = 'Data file'
37
where doctype = 'BIN';
38

    
39
update doi_registration
40
set title = 'Legacy EML file'
41
where doctype like '%eml%2.0.0beta%';
42

    
43
-- update title using node information
44
update doi_registration doi
45
set title = child.nodedata
46
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
47
where doi.guid = id.guid
48
and id.docid = docs.docid
49
and id.rev = docs.rev
50
and docs.rootnodeid = nodes.rootnodeid
51
and nodes.nodeid = child.parentnodeid
52
and nodes.parentnodeid = parent.nodeid
53
and nodes.nodename = 'title'
54
and parent.nodename = 'dataset';
55

    
56
-- update pubDate using nodes
57
update doi_registration doi
58
set pub_date = child.nodedata
59
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
60
where doi.guid = id.guid
61
and id.docid = docs.docid
62
and id.rev = docs.rev
63
and docs.rootnodeid = nodes.rootnodeid
64
and nodes.nodeid = child.parentnodeid
65
and nodes.parentnodeid = parent.nodeid
66
and nodes.nodename = 'pubDate'
67
and parent.nodename = 'dataset';
68

    
69
-- update publisher using nodes
70
update doi_registration doi
71
set publisher = child.nodedata
72
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
73
where doi.guid = id.guid
74
and id.docid = docs.docid
75
and id.rev = docs.rev
76
and docs.rootnodeid = nodes.rootnodeid
77
and nodes.nodeid = child.parentnodeid
78
and nodes.parentnodeid = parent.nodeid
79
and nodes.nodename = 'organizationName'
80
and parent.nodename = 'publisher'
81
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
82

    
83
-- update publisher surName
84
update doi_registration doi
85
set publisher = child.nodedata
86
from identifier id, 
87
xml_documents docs, 
88
xml_nodes nodes, 
89
xml_nodes parent, 
90
xml_nodes child, 
91
xml_nodes grandparent
92
where doi.guid = id.guid
93
and id.docid = docs.docid
94
and id.rev = docs.rev
95
and docs.rootnodeid = nodes.rootnodeid
96
and nodes.nodeid = child.parentnodeid
97
and nodes.parentnodeid = parent.nodeid
98
and parent.parentnodeid = grandparent.nodeid
99
and nodes.nodename = 'surName'
100
and parent.nodename = 'individualName'
101
and grandparent.nodename = 'publisher'
102
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
103

    
104
-- add the first name if we have it
105
update doi_registration doi
106
set publisher = publisher ||', '|| child.nodedata
107
from identifier id, 
108
xml_documents docs, 
109
xml_nodes nodes, 
110
xml_nodes parent, 
111
xml_nodes child, 
112
xml_nodes grandparent
113
where doi.guid = id.guid
114
and id.docid = docs.docid
115
and id.rev = docs.rev
116
and docs.rootnodeid = nodes.rootnodeid
117
and nodes.nodeid = child.parentnodeid
118
and nodes.parentnodeid = parent.nodeid
119
and parent.parentnodeid = grandparent.nodeid
120
and nodes.nodename = 'givenName'
121
and parent.nodename = 'individualName'
122
and grandparent.nodename = 'publisher'
123
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
124

    
125
-- update organization using nodes
126
update doi_registration doi
127
set creator = child.nodedata
128
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
129
where doi.guid = id.guid
130
and id.docid = docs.docid
131
and id.rev = docs.rev
132
and docs.rootnodeid = nodes.rootnodeid
133
and nodes.nodeid = child.parentnodeid
134
and nodes.parentnodeid = parent.nodeid
135
and nodes.nodename = 'organizationName'
136
and parent.nodename = 'creator'
137
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
138

    
139
-- update creator using nodes
140
-- creator/individualName/surName
141
-- creator/individualName/givenName
142
update doi_registration doi
143
set creator = child.nodedata
144
from identifier id, 
145
xml_documents docs, 
146
xml_nodes nodes, 
147
xml_nodes parent, 
148
xml_nodes child, 
149
xml_nodes grandparent
150
where doi.guid = id.guid
151
and id.docid = docs.docid
152
and id.rev = docs.rev
153
and docs.rootnodeid = nodes.rootnodeid
154
and nodes.nodeid = child.parentnodeid
155
and nodes.parentnodeid = parent.nodeid
156
and parent.parentnodeid = grandparent.nodeid
157
and nodes.nodename = 'surName'
158
and parent.nodename = 'individualName'
159
and grandparent.nodename = 'creator'
160
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
161

    
162
-- add the first name if we have it
163
update doi_registration doi
164
set creator = creator ||', '|| child.nodedata
165
from identifier id, 
166
xml_documents docs, 
167
xml_nodes nodes, 
168
xml_nodes parent, 
169
xml_nodes child, 
170
xml_nodes grandparent
171
where doi.guid = id.guid
172
and id.docid = docs.docid
173
and id.rev = docs.rev
174
and docs.rootnodeid = nodes.rootnodeid
175
and nodes.nodeid = child.parentnodeid
176
and nodes.parentnodeid = parent.nodeid
177
and parent.parentnodeid = grandparent.nodeid
178
and nodes.nodename = 'givenName'
179
and parent.nodename = 'individualName'
180
and grandparent.nodename = 'creator'
181
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
182

    
183
-- update title using revisions
184
update doi_registration doi
185
set title = child.nodedata
186
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
187
where doi.guid = id.guid
188
and id.docid = docs.docid
189
and id.rev = docs.rev
190
and docs.rootnodeid = nodes.rootnodeid
191
and nodes.nodeid = child.parentnodeid
192
and nodes.parentnodeid = parent.nodeid
193
and nodes.nodename = 'title'
194
and parent.nodename = 'dataset';
195

    
196
-- update pubDate using revisions
197
update doi_registration doi
198
set pub_date = child.nodedata
199
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
200
where doi.guid = id.guid
201
and id.docid = docs.docid
202
and id.rev = docs.rev
203
and docs.rootnodeid = nodes.rootnodeid
204
and nodes.nodeid = child.parentnodeid
205
and nodes.parentnodeid = parent.nodeid
206
and nodes.nodename = 'pubDate'
207
and parent.nodename = 'dataset';
208

    
209
-- update publisher using revisions
210
update doi_registration doi
211
set publisher = child.nodedata
212
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
213
where doi.guid = id.guid
214
and id.docid = docs.docid
215
and id.rev = docs.rev
216
and docs.rootnodeid = nodes.rootnodeid
217
and nodes.nodeid = child.parentnodeid
218
and nodes.parentnodeid = parent.nodeid
219
and nodes.nodename = 'organizationName'
220
and parent.nodename = 'publisher'
221
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
222

    
223
-- use publisher individual name if we have it
224
update doi_registration doi
225
set publisher = child.nodedata
226
from identifier id, 
227
xml_revisions docs, 
228
xml_nodes_revisions nodes, 
229
xml_nodes_revisions parent, 
230
xml_nodes_revisions child, 
231
xml_nodes_revisions grandparent
232
where doi.guid = id.guid
233
and id.docid = docs.docid
234
and id.rev = docs.rev
235
and docs.rootnodeid = nodes.rootnodeid
236
and nodes.nodeid = child.parentnodeid
237
and nodes.parentnodeid = parent.nodeid
238
and parent.parentnodeid = grandparent.nodeid
239
and nodes.nodename = 'surName'
240
and parent.nodename = 'individualName'
241
and grandparent.nodename = 'publisher'
242
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
243

    
244
--include the first name (careful not to rerun this part)
245
update doi_registration doi
246
set publisher = publisher ||', '|| child.nodedata
247
from identifier id, 
248
xml_revisions docs, 
249
xml_nodes_revisions nodes, 
250
xml_nodes_revisions parent, 
251
xml_nodes_revisions child, 
252
xml_nodes_revisions grandparent
253
where doi.guid = id.guid
254
and id.docid = docs.docid
255
and id.rev = docs.rev
256
and docs.rootnodeid = nodes.rootnodeid
257
and nodes.nodeid = child.parentnodeid
258
and nodes.parentnodeid = parent.nodeid
259
and parent.parentnodeid = grandparent.nodeid
260
and nodes.nodename = 'givenName'
261
and parent.nodename = 'individualName'
262
and grandparent.nodename = 'publisher'
263
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
264

    
265
-- update organization using nodes
266
update doi_registration doi
267
set creator = child.nodedata
268
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
269
where doi.guid = id.guid
270
and id.docid = docs.docid
271
and id.rev = docs.rev
272
and docs.rootnodeid = nodes.rootnodeid
273
and nodes.nodeid = child.parentnodeid
274
and nodes.parentnodeid = parent.nodeid
275
and nodes.nodename = 'organizationName'
276
and parent.nodename = 'creator'
277
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
278

    
279
-- update creator using revisions
280
update doi_registration doi
281
set creator = child.nodedata
282
from identifier id, 
283
xml_revisions docs, 
284
xml_nodes_revisions nodes, 
285
xml_nodes_revisions parent, 
286
xml_nodes_revisions child, 
287
xml_nodes_revisions grandparent
288
where doi.guid = id.guid
289
and id.docid = docs.docid
290
and id.rev = docs.rev
291
and docs.rootnodeid = nodes.rootnodeid
292
and nodes.nodeid = child.parentnodeid
293
and nodes.parentnodeid = parent.nodeid
294
and parent.parentnodeid = grandparent.nodeid
295
and nodes.nodename = 'surName'
296
and parent.nodename = 'individualName'
297
and grandparent.nodename = 'creator'
298
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
299

    
300
-- add the first name if we have it
301
update doi_registration doi
302
set creator = creator ||', '|| child.nodedata
303
from identifier id, 
304
xml_revisions docs, 
305
xml_nodes_revisions nodes, 
306
xml_nodes_revisions parent, 
307
xml_nodes_revisions child, 
308
xml_nodes_revisions grandparent
309
where doi.guid = id.guid
310
and id.docid = docs.docid
311
and id.rev = docs.rev
312
and docs.rootnodeid = nodes.rootnodeid
313
and nodes.nodeid = child.parentnodeid
314
and nodes.parentnodeid = parent.nodeid
315
and parent.parentnodeid = grandparent.nodeid
316
and nodes.nodename = 'givenName'
317
and parent.nodename = 'individualName'
318
and grandparent.nodename = 'creator'
319
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
320

    
321
-- use xml_documents for defaults that are still missing 
322
update doi_registration doi
323
set creator = doc.user_owner
324
from xml_documents doc, identifier id
325
where doi.guid = id.guid
326
and id.docid = doc.docid
327
and id.rev = doc.rev
328
and doi.creator is null;
329

    
330
update doi_registration doi
331
set creator = doc.user_owner
332
from xml_revisions doc, identifier id
333
where doi.guid = id.guid
334
and id.docid = doc.docid
335
and id.rev = doc.rev
336
and doi.creator is null;
337

    
338
-- set publisher
339
update doi_registration
340
set publisher = creator
341
where publisher is null or publisher = '';
342

    
343
-- clean up
344
update doi_registration
345
set title = replace(title, E'\n', ' ');
346

    
347
update doi_registration
348
set title = regexp_replace(title, E'\\s+', ' ', 'g');
349

    
350
update doi_registration
351
set title = regexp_replace(title, E'^\\s+', '', 'g');
352

    
353
update doi_registration
354
set ezid_account = 'sb_nceas'
355
where guid like 'doi:10.5063%';
356
--16988
357

    
358
update doi_registration
359
set ezid_account = 'sb_pisco'
360
where guid like 'doi:10.6085%';
361
--98078
362

    
363
update doi_registration
364
set ezid_account = 'lternet'
365
where guid like 'doi:10.6073%';
366
--49482
367

    
368
/**
369
 * Additional modifications, July 5th, 2012
370
 */
371
ALTER TABLE doi_registration
372
ADD COLUMN resourceType text,
373
ADD COLUMN objectFormat text,
374
ADD COLUMN obsoletedBy text,
375
ADD COLUMN obsoletes text,
376
ADD COLUMN resourceMapId text,
377
ADD COLUMN resourceMapLocation text,
378
ADD COLUMN access text;
379

    
380
-- update access values
381
update doi_registration
382
set access = 'protected';
383

    
384
update doi_registration doi
385
set access = 'public'
386
from xml_access a
387
where doi.guid = a.guid
388
and a.principal_name = 'public'
389
and a.permission >= '4'
390
and a.perm_type = 'allow';
391

    
392
-- update the objectFormat from xml_documents
393
update doi_registration doi
394
set objectFormat = doc.doctype
395
from identifier id, xml_documents doc
396
where doi.guid = id.guid
397
and id.docid = doc.docid
398
and id.rev = doc.rev;
399

    
400
-- update the objectFormat from xml_revisions
401
update doi_registration doi
402
set objectFormat = rev.doctype
403
from identifier id, xml_revisions rev
404
where doi.guid = id.guid
405
and id.docid = rev.docid
406
and id.rev = rev.rev;
407

    
408
--update resourceType
409
update doi_registration
410
set resourceType = 'Dataset/metadata';
411

    
412
update doi_registration
413
set resourceType = 'Dataset/data'
414
where objectFormat = 'BIN';
415

    
416
-- update the data file titles
417
update doi_registration doi
418
set title = 'Data file - ' || doc.docname
419
from identifier id, xml_documents doc
420
where doi.guid = id.guid
421
and id.docid = doc.docid
422
and id.rev = doc.rev
423
and doi.resourceType = 'Dataset/data';
424

    
425
-- update the data file titles from revisions
426
update doi_registration doi
427
set title = 'Data file - ' || doc.docname
428
from identifier id, xml_revisions doc
429
where doi.guid = id.guid
430
and id.docid = doc.docid
431
and id.rev = doc.rev
432
and doi.resourceType = 'Dataset/data';
433

    
434
-- update the objectFormat from SM table (will be subset)
435
update doi_registration doi
436
set objectFormat = sm.object_format
437
from systemMetadata sm
438
where doi.guid = sm.guid;
439
--16938
440

    
441
--update revision history from SM
442
update doi_registration doi
443
set obsoletes = sm.obsoletes,
444
obsoletedBy = sm.obsoleted_by
445
from systemMetadata sm
446
where doi.guid = sm.guid;
447

    
448
/** use plain old revision history **/
449
-- update obsoletedby
450
update doi_registration doi
451
set obsoletedBy = newer.guid
452
from identifier id, identifier newer
453
where doi.guid = id.guid
454
and id.docid = newer.docid
455
and newer.rev = (select min(next.rev) from identifier next where next.docid = id.docid and next.rev > id.rev);
456

    
457
-- update the obsolets
458
update doi_registration doi
459
set obsoletes = older.guid
460
from identifier id, identifier older
461
where doi.guid = id.guid
462
and id.docid = older.docid
463
and older.rev = (select max(prev.rev) from identifier prev where prev.docid = id.docid and prev.rev < id.rev);
464

    
465
/**
466
select doi.guid, older.guid as obsoletes 
467
from doi_registration doi, identifier id, identifier older
468
where doi.guid = id.guid
469
and id.docid = older.docid
470
and older.rev = (select max(prev.rev) from identifier prev where prev.docid = id.docid and prev.rev < id.rev)
471
and doi.guid like 'doi:10.5063/AA/ABS.4%';
472
**/
473

    
474

    
475
--update the resourcemapid for described data
476
CREATE TABLE ecogrid_docids AS
477
select distinct id.docid, id.rev, id.guid, substring(xpi.nodedata from 'ecogrid://knb/(.*)$') as data_docid
478
from xml_path_index xpi, identifier id, xml_documents xmld
479
where xpi.docid = xmld.docid 
480
and xmld.docid = id.docid
481
and xmld.rev = id.rev
482
and xpi.path like 'dataset/%/physical/distribution/online/url' 
483
and xpi.nodedata like 'ecogrid%'
484
and id.guid in (select guid from doi_registration);
485

    
486
-- include revisions for described data
487
INSERT INTO ecogrid_docids
488
select distinct id.docid, id.rev, id.guid, substring(child.nodedata from 'ecogrid://knb/(.*)$') as data_docid
489
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
490
where id.docid = docs.docid
491
and id.rev = docs.rev
492
and docs.rootnodeid = nodes.rootnodeid
493
and nodes.nodeid = child.parentnodeid
494
and nodes.parentnodeid = parent.nodeid
495
and nodes.nodename = 'url'
496
and parent.nodename = 'online'
497
and child.nodedata like 'ecogrid%'
498
and child.nodetype = 'TEXT'
499
and id.guid in (select guid from doi_registration);
500

    
501
-- Set the resource map for the data files (NOTE: some of thee maps might not actually exist on the system depending on how successful the ORE generation was)
502
update doi_registration doi
503
set resourceMapId = 'resourceMap_'||eco.docid||'.'||eco.rev
504
from identifier id, ecogrid_docids eco
505
where doi.guid = id.guid
506
and id.docid||'.'||id.rev = eco.data_docid;
507

    
508
-- set the resource map id for the metadata file that did most of the packaging work!
509
update doi_registration doi
510
set resourceMapId = 'resourceMap_'||eco.docid||'.'||eco.rev
511
from ecogrid_docids eco
512
where doi.guid = eco.guid;
513

    
514
select count(*)
515
from doi_registration
516
where resourceMapId is null;
517

    
518
--update the resource map location
519
update doi_registration
520
set resourceMapLocation = 'https://cn.dataone.org/cn/v1/resolve/' || resourceMapId
521
where resourceMapId is not null;
522

    
523
-- fix null values (should not have to have this now, but it will not hurt)
524
update doi_registration doi
525
set creator = user_owner
526
from identifier id, xml_documents docs
527
where doi.guid = id.guid
528
and id.docid = docs.docid
529
and id.rev= docs.rev
530
and (creator is null or trim(creator) = '');
531

    
532
update doi_registration doi
533
set publisher = user_owner
534
from identifier id, xml_documents docs
535
where doi.guid = id.guid
536
and id.docid = docs.docid
537
and id.rev= docs.rev
538
and (publisher is null or trim(publisher) = '');
539

    
540
update doi_registration doi
541
set creator = user_owner
542
from identifier id, xml_revisions docs
543
where doi.guid = id.guid
544
and id.docid = docs.docid
545
and id.rev= docs.rev
546
and (creator is null or trim(creator) = '');
547

    
548
update doi_registration doi
549
set publisher = user_owner
550
from identifier id, xml_revisions docs
551
where doi.guid = id.guid
552
and id.docid = docs.docid
553
and id.rev= docs.rev
554
and (publisher is null or trim(publisher) = '');
555

    
556
-- fix a previous mistake
557
update doi_registration
558
set publisher = null
559
where publisher = 'document';
560

    
561
--update creator usinig LDAP lookup
562
CREATE TABLE ecoinfo_dn (dn text, givenName text, sn text);
563
--TRUNCATE TABLE ecoinfo_dn;
564
COPY ecoinfo_dn FROM '/tmp/ecoinfo_dn.csv' WITH CSV HEADER;
565

    
566
update doi_registration
567
set creator = 
568
case when givenName is null then
569
        sn
570
else 
571
        sn || ', ' || givenName
572
end
573
from ecoinfo_dn
574
where trim(both from lower(creator)) = lower(dn);
575

    
576
update doi_registration
577
set publisher = 
578
case when givenName is null then
579
        sn
580
else 
581
        sn || ', ' || givenName
582
end
583
from ecoinfo_dn
584
where trim(both from lower(publisher)) = lower(dn);
585

    
586
select creator, count(*) as cnt
587
from doi_registration
588
where creator like 'uid=%'
589
group by creator
590
order by cnt desc;
591

    
592
-- update the data files with owner/publisher from EML entry
593
update doi_registration doi
594
set creator = meta.creator,
595
publisher = meta.publisher
596
from doi_registration meta, identifier id, ecogrid_docids eco
597
where doi.guid = id.guid
598
and id.docid||'.'||id.rev = eco.data_docid
599
and eco.guid = meta.guid;
600

    
601
-- clean up KBS entries that have empty user_owner
602
update doi_registration
603
set creator = 'Kellogg Biological Station'
604
where (creator is null or trim(creator) = '')
605
and guid like '%knb-lter-kbs%';
606

    
607
update doi_registration
608
set publisher = 'Kellogg Biological Station'
609
where (publisher is null or trim(publisher) = '')
610
and guid like '%knb-lter-kbs%';
611

    
612
-- more clean up
613
update doi_registration
614
set creator = trim(creator),
615
publisher = trim(publisher);
616

    
617
-- set the publisher to the source system if it is the same as the creator
618
update doi_registration
619
set publisher = 
620
case 
621
when guid like 'doi:10.5063%' then
622
        'Knowledge Network for Biocomplexity (KNB)'
623
when guid like 'doi:10.6085%' then
624
        'Partnership for Interdisciplinary Studies of Coastal Oceans (PISCO)'
625
when guid like 'doi:10.6073%' then
626
        'Long Term Ecological Research Network (LTER)'
627
else
628
        publisher
629
end
630
where publisher = creator;
631

    
632
-- entity references
633
update doi_registration
634
set publisher = regexp_replace(publisher, '&amp;', '&', 'g');
635
update doi_registration
636
set publisher = regexp_replace(publisher, '&apos;', E'\'', 'g');
637
--accented i (í)
638
update doi_registration
639
set publisher = regexp_replace(publisher, '&#237;', 'í', 'g')
640
where publisher like '%&#237;%';
641
update doi_registration
642
set creator = regexp_replace(creator, '&amp;', '&', 'g');
643
update doi_registration
644
set creator = regexp_replace(creator, '&apos;', E'\'', 'g');
645

    
646
--titles
647
update doi_registration
648
set title = regexp_replace(title, '&amp;', '&', 'g');
649
update doi_registration
650
set title = regexp_replace(title, '&mp;', '&', 'g')
651
where title like '%&mp;%';
652
update doi_registration
653
set title = regexp_replace(title, '&apos;', E'\'', 'g');
654
update doi_registration
655
set title = regexp_replace(title, '&quot;', E'\'', 'g');
656
update doi_registration
657
set title = regexp_replace(title, '&gt;', '>', 'g');
658
update doi_registration
659
set title = regexp_replace(title, '&lt;', '<', 'g');
660
update doi_registration
661
set title = regexp_replace(title, '&#237;', 'í', 'g')
662
where title like '%&#237;%';
663
update doi_registration
664
set title = regexp_replace(title, '&#195;&#173;', 'í', 'g')
665
where title like '%&#195;&#173;%';
666
update doi_registration
667
set title = regexp_replace(title, '&#195;&#177;', 'í', 'g')
668
where title like '%&#195;&#177;%';
669
update doi_registration
670
set title = regexp_replace(title, '&#243;', 'ó', 'g')
671
where title like '%&#243;%';
672
update doi_registration
673
set title = regexp_replace(title, '&#227;', 'ã', 'g')
674
where title like '%&#227;%';
675
update doi_registration
676
set title = regexp_replace(title, '&#231;', 'ç', 'g')
677
where title like '%&#231;%';
678
update doi_registration
679
set title = regexp_replace(title, '&#233;', 'é', 'g')
680
where title like '%&#233;%';
681
update doi_registration
682
set title = regexp_replace(title, '&#241;', 'ñ', 'g')
683
where title like '%&#241;%';
684
update doi_registration
685
set title = regexp_replace(title, '&#226;&#128;&#147;', '-', 'g')
686
where title like '%&#226;&#128;&#147;%';
687
update doi_registration
688
set title = regexp_replace(title, '&#226;&#128;&#153;', E'\'', 'g')
689
where title like '%&#226;&#128;&#153;%';
690
update doi_registration
691
set title = regexp_replace(title, '&#8211;', '/', 'g')
692
where title like '%&#8211;%';
693
update doi_registration
694
set title = regexp_replace(title, '&#195;&#161;', 'á', 'g')
695
where title like '%&#195;&#161;%';
696
update doi_registration
697
set title = regexp_replace(title, '&#195;&#179;', 'ó', 'g')
698
where title like '%&#195;&#179;%';
699
--&#26684;&#24335;&#31684;&#20363; (格式范例)
700
update doi_registration
701
set title = regexp_replace(title, '&#26684;&#24335;&#31684;&#20363;', '格式范例', 'g')
702
where title like '%&#26684;&#24335;&#31684;&#20363;%';
703

    
704
--directional quotes
705
update doi_registration
706
set title = regexp_replace(title, '&#226;&#128;&#157;', '"', 'g')
707
where title like '%&#226;&#128;&#157;%';
708
update doi_registration
709
set title = regexp_replace(title, '&#226;&#128;&#156;', '"', 'g')
710
where title like '%&#226;&#128;&#156;%';
711
--one off apostrophe
712
update doi_registration
713
set title = regexp_replace(title, '&#213;', E'\'', 'g')
714
where title like '%&#213;%';
715
update doi_registration
716
set title = regexp_replace(title, '&#150;', '-', 'g')
717
where title like '%&#150;%';
718

    
719
--after reading the complete metadata, I could interpret these as "greater than"
720
update doi_registration
721
set title = regexp_replace(title, '&t;', '>', 'g')
722
where title like '%&t;%'
723
and guid in ('doi:10.6073/AA/knb-lter-luq.15.1', 'doi:10.6073/AA/knb-lter-luq.39.1', 'doi:10.6073/AA/knb-lter-luq.37.1', 'doi:10.6073/AA/knb-lter-luq.40.1', 'doi:10.6073/AA/knb-lter-luq.18.1' );
724

    
725

    
726
--special characters
727
update doi_registration
728
set creator = regexp_replace(creator, '&#233;', 'é', 'g')
729
where creator like '%&#233;%';
730
update doi_registration
731
set creator = regexp_replace(creator, '&#231;', 'ç', 'g')
732
where creator like '%&#231;%';
733
update doi_registration
734
set creator = regexp_replace(creator, '&#241;', 'ñ', 'g')
735
where creator like '%&#241;%';
736
-- Guti&#195;&#169;rrez, Ralph, J.
737
-- Li&#195;&#169;bault,  Fr&#195;&#169;d&#195;&#169;ric
738
update doi_registration
739
set creator = regexp_replace(creator, '&#195;&#169;', 'é', 'g')
740
where creator like '%&#195;&#169;%';
741
-- Ram&#195;&#173;rez, Alonso
742
update doi_registration
743
set creator = regexp_replace(creator, '&#195;&#173;', 'í', 'g')
744
where creator like '%&#195;&#173;%';
745
-- Gonz&#195;&#161;lez, Grizelle
746
update doi_registration
747
set creator = regexp_replace(creator, '&#195;&#161;', 'á', 'g')
748
where creator like '%&#195;&#161;%';
749
--&#29579;,  (王, 名, 姓)
750
update doi_registration
751
set creator = regexp_replace(creator, '&#29579;', '', 'g')
752
where creator like '%&#29579;%';
753
--&#21517;
754
update doi_registration
755
set creator = regexp_replace(creator, '&#21517;', '', 'g')
756
where creator like '%&#21517;%';
757
--&#22995;
758
update doi_registration
759
set creator = regexp_replace(creator, '&#22995;', '', 'g')
760
where creator like '%&#22995;%';
761
-- These are left
762
-- Helmbrecht, S &#226;&#128;&#153;rai
763
-- lk&#195;&#177;lkl, Fgfggf
764

    
765
-- clean up
766
update doi_registration
767
set publisher = replace(publisher, E'\n', ' ');
768
update doi_registration
769
set publisher = regexp_replace(publisher, E'\\s+', ' ', 'g');
770
update doi_registration
771
set publisher = regexp_replace(publisher, E'^\\s+', '', 'g');
772

    
773
-- use only year for pub_date
774
update doi_registration
775
set pub_date = to_char(date(pub_date), 'YYYY')
776
where length(pub_date) > 4
777
and pub_date like '%-%-%';
778

    
779
--set the rest to upload date, xml_documents
780
update doi_registration doi
781
set pub_date = to_char(date_created, 'YYYY')
782
from xml_documents x, identifier id
783
where length(pub_date) != 4
784
and doi.guid = id.guid
785
and id.docid = x.docid
786
and id.rev = x.rev;
787
-- revisions
788
update doi_registration doi
789
set pub_date = to_char(date_created, 'YYYY')
790
from xml_revisions x, identifier id
791
where length(pub_date) != 4
792
and doi.guid = id.guid
793
and id.docid = x.docid
794
and id.rev = x.rev;
795

    
796
update doi_registration
797
set creator = replace(creator, E'\n', ' ');
798

    
799
-- some entries got in with blank spaces 
800
update doi_registration
801
set title = 'unknown'
802
where length(trim(title)) = 0;
803

    
804
update doi_registration
805
set creator = 'unknown'
806
where length(trim(creator)) = 0;
807

    
808
-- BES has very long publisher fields, so use this default for them all
809
update doi_registration
810
set publisher = 'Baltimore Ecosystem Study LTER'
811
where guid like '%knb-lter-bes%';
812

    
813
-- need URL identifiers for DataCite when they are not DOIs
814
update doi_registration
815
set obsoletedBy = 'https://cn.dataone.org/cn/v1/resolve/' || obsoletedBy 
816
where obsoletedBy not like 'doi%';
817

    
818
update doi_registration
819
set obsoletes = 'https://cn.dataone.org/cn/v1/resolve/' || obsoletes
820
where obsoletes not like 'doi%';
821

    
822
ALTER TABLE doi_registration
823
ADD COLUMN obsoletedByIdType text,
824
ADD COLUMN obsoletesIdType text,
825
ADD COLUMN resourceMapLocationIdType text;
826

    
827
update doi_registration
828
set obsoletedByIdType = 'DOI'
829
where obsoletedBy is not null
830
and obsoletedBy like 'doi%';
831

    
832
update doi_registration
833
set obsoletedByIdType = 'URL'
834
where obsoletedBy is not null
835
and obsoletedBy not like 'doi%';
836

    
837
update doi_registration
838
set obsoletesIdType = 'DOI'
839
where obsoletes is not null
840
and obsoletes like 'doi%';
841

    
842
update doi_registration
843
set obsoletesIdType = 'URL'
844
where obsoletes is not null
845
and obsoletes not like 'doi%';
846

    
847
update doi_registration
848
set resourceMapLocationIdType = 'URL'
849
where resourceMapLocation is not null;
850

    
851
-- copy to the external file
852
COPY 
853
(select ezid_account, 
854
guid as dc_identifier, 
855
url as datacite_url, 
856
title as dc_title, 
857
creator as dc_creator, 
858
publisher as dc_publisher, 
859
pub_date as datacite_publicationYear,
860
split_part(resourceType, '/', 1) as datacite_resourceTypeGeneral,
861
split_part(resourceType, '/', 2) as datacite_resourceType,
862
objectFormat as datacite_format,
863
obsoletedBy as datacite_relatedIdentifier_isPreviousVersionOf,
864
obsoletedByIdType as datacite_relatedIdentifier_isPreviousVersionOfType,
865
obsoletes as datacite_relatedIdentifier_isNewVersionOf,
866
obsoletesIdType as datacite_relatedIdentifier_isNewVersionOfType,
867
resourceMapLocation as datacite_relatedIdentifier_isPartOf,
868
resourceMapLocationIdType as datacite_relatedIdentifier_isPartOfType,
869
access as d1_read_access
870
from doi_registration 
871
where access = 'public'
872
order by dc_identifier) 
873
TO '/tmp/doi_registration.csv'
874
WITH CSV HEADER;
875
--164548
876

    
877
--drop table doi_registration;
878
--drop table ecogrid_docids;
879
--drop table ecoinfo_dn;
880