Project

General

Profile

metacat / src / doi_registration.sql @ 7303

1
-- use default values initially
2
CREATE TABLE doi_registration AS
3

    
4
select 
5
text('testing') as ezid_account,
6
doc.doctype,
7
id.guid, 
8
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url, 
9
text('Unknown') as title, 
10
text(null) as creator,
11
text(null) as publisher,
12
to_char(doc.date_created, 'YYYY') as pub_date
13
from identifier id, xml_documents doc
14
where guid like 'doi%'
15
and id.docid = doc.docid
16
and id.rev = doc.rev
17

    
18
UNION ALL
19

    
20
select 
21
text('testing') as ezid_account,
22
doc.doctype,
23
id.guid, 
24
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url, 
25
text('Unknown') as title, 
26
text(null) as creator,
27
text(null) as publisher,
28
to_char(doc.date_created, 'YYYY') as pub_date
29
from identifier id, xml_revisions doc
30
where guid like 'doi%'
31
and id.docid = doc.docid
32
and id.rev = doc.rev;
33

    
34
--update defaults
35
update doi_registration
36
set title = 'Data file'
37
where doctype = 'BIN';
38

    
39
update doi_registration
40
set title = 'Legacy EML file'
41
where doctype like '%eml%2.0.0beta%';
42

    
43
-- update title using node information
44
update doi_registration doi
45
set title = child.nodedata
46
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
47
where doi.guid = id.guid
48
and id.docid = docs.docid
49
and id.rev = docs.rev
50
and docs.rootnodeid = nodes.rootnodeid
51
and nodes.nodeid = child.parentnodeid
52
and nodes.parentnodeid = parent.nodeid
53
and nodes.nodename = 'title'
54
and parent.nodename = 'dataset';
55

    
56
-- update pubDate using nodes
57
update doi_registration doi
58
set pub_date = child.nodedata
59
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
60
where doi.guid = id.guid
61
and id.docid = docs.docid
62
and id.rev = docs.rev
63
and docs.rootnodeid = nodes.rootnodeid
64
and nodes.nodeid = child.parentnodeid
65
and nodes.parentnodeid = parent.nodeid
66
and nodes.nodename = 'pubDate'
67
and parent.nodename = 'dataset';
68

    
69
-- update publisher using nodes
70
update doi_registration doi
71
set publisher = child.nodedata
72
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
73
where doi.guid = id.guid
74
and id.docid = docs.docid
75
and id.rev = docs.rev
76
and docs.rootnodeid = nodes.rootnodeid
77
and nodes.nodeid = child.parentnodeid
78
and nodes.parentnodeid = parent.nodeid
79
and nodes.nodename = 'publisher'
80
and parent.nodename = 'dataset';
81

    
82
-- update organization using nodes
83
update doi_registration doi
84
set creator = child.nodedata
85
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
86
where doi.guid = id.guid
87
and id.docid = docs.docid
88
and id.rev = docs.rev
89
and docs.rootnodeid = nodes.rootnodeid
90
and nodes.nodeid = child.parentnodeid
91
and nodes.parentnodeid = parent.nodeid
92
and nodes.nodename = 'organizationName'
93
and parent.nodename = 'creator'
94
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
95

    
96
-- update creator using nodes
97
-- creator/individualName/surName
98
-- creator/individualName/givenName
99
update doi_registration doi
100
set creator = child.nodedata
101
from identifier id, 
102
xml_documents docs, 
103
xml_nodes nodes, 
104
xml_nodes parent, 
105
xml_nodes child, 
106
xml_nodes grandparent
107
where doi.guid = id.guid
108
and id.docid = docs.docid
109
and id.rev = docs.rev
110
and docs.rootnodeid = nodes.rootnodeid
111
and nodes.nodeid = child.parentnodeid
112
and nodes.parentnodeid = parent.nodeid
113
and parent.parentnodeid = grandparent.nodeid
114
and nodes.nodename = 'surName'
115
and parent.nodename = 'individualName'
116
and grandparent.nodename = 'creator'
117
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
118

    
119
-- add the first name if we have it
120
update doi_registration doi
121
set creator = child.nodedata ||' '|| creator
122
from identifier id, 
123
xml_documents docs, 
124
xml_nodes nodes, 
125
xml_nodes parent, 
126
xml_nodes child, 
127
xml_nodes grandparent
128
where doi.guid = id.guid
129
and id.docid = docs.docid
130
and id.rev = docs.rev
131
and docs.rootnodeid = nodes.rootnodeid
132
and nodes.nodeid = child.parentnodeid
133
and nodes.parentnodeid = parent.nodeid
134
and parent.parentnodeid = grandparent.nodeid
135
and nodes.nodename = 'givenName'
136
and parent.nodename = 'individualName'
137
and grandparent.nodename = 'creator'
138
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
139

    
140
-- update title using revisions
141
update doi_registration doi
142
set title = child.nodedata
143
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
144
where doi.guid = id.guid
145
and id.docid = docs.docid
146
and id.rev = docs.rev
147
and docs.rootnodeid = nodes.rootnodeid
148
and nodes.nodeid = child.parentnodeid
149
and nodes.parentnodeid = parent.nodeid
150
and nodes.nodename = 'title'
151
and parent.nodename = 'dataset';
152

    
153
-- update pubDate using revisions
154
update doi_registration doi
155
set pub_date = child.nodedata
156
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
157
where doi.guid = id.guid
158
and id.docid = docs.docid
159
and id.rev = docs.rev
160
and docs.rootnodeid = nodes.rootnodeid
161
and nodes.nodeid = child.parentnodeid
162
and nodes.parentnodeid = parent.nodeid
163
and nodes.nodename = 'pubDate'
164
and parent.nodename = 'dataset';
165

    
166
-- update publisher using revisions
167
update doi_registration doi
168
set publisher = child.nodedata
169
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
170
where doi.guid = id.guid
171
and id.docid = docs.docid
172
and id.rev = docs.rev
173
and docs.rootnodeid = nodes.rootnodeid
174
and nodes.nodeid = child.parentnodeid
175
and nodes.parentnodeid = parent.nodeid
176
and nodes.nodename = 'publisher'
177
and parent.nodename = 'dataset';
178

    
179
-- update organization using nodes
180
update doi_registration doi
181
set creator = child.nodedata
182
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
183
where doi.guid = id.guid
184
and id.docid = docs.docid
185
and id.rev = docs.rev
186
and docs.rootnodeid = nodes.rootnodeid
187
and nodes.nodeid = child.parentnodeid
188
and nodes.parentnodeid = parent.nodeid
189
and nodes.nodename = 'organizationName'
190
and parent.nodename = 'creator'
191
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
192

    
193
-- update creator using revisions
194
update doi_registration doi
195
set creator = child.nodedata
196
from identifier id, 
197
xml_revisions docs, 
198
xml_nodes_revisions nodes, 
199
xml_nodes_revisions parent, 
200
xml_nodes_revisions child, 
201
xml_nodes_revisions grandparent
202
where doi.guid = id.guid
203
and id.docid = docs.docid
204
and id.rev = docs.rev
205
and docs.rootnodeid = nodes.rootnodeid
206
and nodes.nodeid = child.parentnodeid
207
and nodes.parentnodeid = parent.nodeid
208
and parent.parentnodeid = grandparent.nodeid
209
and nodes.nodename = 'surName'
210
and parent.nodename = 'individualName'
211
and grandparent.nodename = 'creator'
212
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
213

    
214
-- add the first name if we have it
215
update doi_registration doi
216
set creator = child.nodedata ||' '|| creator
217
from identifier id, 
218
xml_revisions docs, 
219
xml_nodes_revisions nodes, 
220
xml_nodes_revisions parent, 
221
xml_nodes_revisions child, 
222
xml_nodes_revisions grandparent
223
where doi.guid = id.guid
224
and id.docid = docs.docid
225
and id.rev = docs.rev
226
and docs.rootnodeid = nodes.rootnodeid
227
and nodes.nodeid = child.parentnodeid
228
and nodes.parentnodeid = parent.nodeid
229
and parent.parentnodeid = grandparent.nodeid
230
and nodes.nodename = 'givenName'
231
and parent.nodename = 'individualName'
232
and grandparent.nodename = 'creator'
233
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
234

    
235
-- use xml_documents for defaults that are still missing 
236
update doi_registration doi
237
set creator = doc.user_owner
238
from xml_documents doc, identifier id
239
where doi.guid = id.guid
240
and id.docid = doc.docid
241
and id.rev = doc.rev
242
and doi.creator is null;
243

    
244
update doi_registration doi
245
set creator = doc.user_owner
246
from xml_revisions doc, identifier id
247
where doi.guid = id.guid
248
and id.docid = doc.docid
249
and id.rev = doc.rev
250
and doi.creator is null;
251

    
252
-- set publisher
253
update doi_registration
254
set publisher = creator
255
where publisher is null or publisher = '';
256

    
257
-- clean up
258
update doi_registration
259
set title = replace(title, E'\n', ' ');
260

    
261
update doi_registration
262
set title = regexp_replace(title, E'\\s+', ' ', 'g');
263

    
264
update doi_registration
265
set title = regexp_replace(title, E'^\\s+', '', 'g');
266

    
267
update doi_registration
268
set ezid_account = 'KNB'
269
where guid like 'doi:10.5063%';
270
--16988
271

    
272
update doi_registration
273
set ezid_account = 'PISCO'
274
where guid like 'doi:10.6085%';
275
--98078
276

    
277
update doi_registration
278
set ezid_account = 'LTER'
279
where guid like 'doi:10.6073%';
280
--49482
281

    
282
/**
283
 * Additional modifications, July 5th, 2012
284
 */
285
ALTER TABLE doi_registration
286
ADD COLUMN resourceType text,
287
ADD COLUMN objectFormat text,
288
ADD COLUMN obsoletedBy text,
289
ADD COLUMN obsoletes text,
290
ADD COLUMN resourceMapId text,
291
ADD COLUMN resourceMapLocation text,
292
ADD COLUMN access text;
293

    
294
-- update access values
295
update doi_registration
296
set access = 'protected';
297

    
298
update doi_registration doi
299
set access = 'public'
300
from xml_access a
301
where doi.guid = a.guid
302
and a.principal_name = 'public'
303
and a.permission >= '4'
304
and a.perm_type = 'allow';
305

    
306
-- update the objectFormat from xml_documents
307
update doi_registration doi
308
set objectFormat = doc.doctype
309
from identifier id, xml_documents doc
310
where doi.guid = id.guid
311
and id.docid = doc.docid;
312

    
313
-- update the objectFormat from xml_revisions
314
update doi_registration doi
315
set objectFormat = rev.doctype
316
from identifier id, xml_revisions rev
317
where doi.guid = id.guid
318
and id.docid = rev.docid;
319

    
320
--update resourceType
321
update doi_registration
322
set resourceType = 'Dataset/METADATA';
323

    
324
update doi_registration
325
set resourceType = 'Dataset/DATA'
326
where objectFormat = 'BIN';
327

    
328
-- update the objectFormat from SM table (will be subset)
329
update doi_registration doi
330
set objectFormat = sm.object_format
331
from systemMetadata sm
332
where doi.guid = sm.guid;
333
--16938
334

    
335
--update revision history from SM
336
update doi_registration doi
337
set obsoletes = sm.obsoletes,
338
obsoletedBy = sm.obsoleted_by
339
from systemMetadata sm
340
where doi.guid = sm.guid;
341

    
342
/** use plain old revision history **/
343
-- update obsoletedby
344
update doi_registration doi
345
set obsoletedBy = newer.guid
346
from identifier id, identifier newer
347
where doi.guid = id.guid
348
and id.docid = newer.docid
349
and newer.rev = (select min(next.rev) from identifier next where next.docid = id.docid and next.rev > id.rev);
350

    
351
-- update the obsolets
352
update doi_registration doi
353
set obsoletes = older.guid
354
from identifier id, identifier older
355
where doi.guid = id.guid
356
and id.docid = older.docid
357
and older.rev = (select max(prev.rev) from identifier prev where prev.docid = id.docid and prev.rev < id.rev);
358

    
359
/**
360
select doi.guid, older.guid as obsoletes 
361
from doi_registration doi, identifier id, identifier older
362
where doi.guid = id.guid
363
and id.docid = older.docid
364
and older.rev = (select max(prev.rev) from identifier prev where prev.docid = id.docid and prev.rev < id.rev)
365
and doi.guid like 'doi:10.5063/AA/ABS.4%';
366
**/
367

    
368

    
369
-- update resourceMap pointer
370
-- NOTE: some of these maps might not actually exist
371
update doi_registration doi
372
set resourceMapId = 'resourceMap_'||id.docid||'.'||id.rev
373
from identifier id
374
where doi.guid = id.guid
375
and id.docid = rev.docid
376
and objectFormat like 'eml%';
377

    
378
--update the resourcemapid for described data
379
CREATE TABLE ecogrid_docids AS
380
select distinct id.docid, id.rev, id.guid, substring(xpi.nodedata from 'ecogrid://knb/(.*)$') as data_docid
381
from xml_path_index xpi, identifier id, xml_documents xmld
382
where xpi.docid = xmld.docid 
383
and xmld.docid = id.docid
384
and xmld.rev = id.rev
385
and xpi.path like 'dataset/%/physical/distribution/online/url' 
386
and xpi.nodedata like 'ecogrid%'
387
and id.guid in (select guid from doi_registration);
388

    
389
-- include revisions for described data
390
INSERT INTO ecogrid_docids
391
select distinct id.docid, id.rev, id.guid, substring(child.nodedata from 'ecogrid://knb/(.*)$') as data_docid
392
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
393
where id.docid = docs.docid
394
and id.rev = docs.rev
395
and docs.rootnodeid = nodes.rootnodeid
396
and nodes.nodeid = child.parentnodeid
397
and nodes.parentnodeid = parent.nodeid
398
and nodes.nodename = 'url'
399
and parent.nodename = 'online'
400
and child.nodedata like 'ecogrid%'
401
and child.nodetype = 'TEXT'
402
and id.guid in (select guid from doi_registration);
403

    
404
-- NOTE: some of thee maps might not actually exist
405
update doi_registration doi
406
set resourceMapId = 'resourceMap_'||eco.docid||'.'||eco.rev
407
from identifier id, ecogrid_docids eco
408
where doi.guid = id.guid
409
and id.docid||'.'||id.rev = eco.data_docid;
410

    
411
select count(*)
412
from doi_registration
413
where resourceMapId is null;
414

    
415
--update the resource map location
416
update doi_registration
417
set resourceMapLocation = 'https://cn.dataone.org/cn/v1/resolve/' || resourceMapId
418
where resourceMapId is not null;
419

    
420
--update creator usinig LDAP lookup
421
CREATE TABLE ecoinfo_dn (dn text, givenName text, sn text);
422
COPY ecoinfo_dn FROM '/tmp/ecoinfo_dn.csv' WITH CSV HEADER;
423

    
424
update doi_registration
425
set creator = givenName|| ' ' || sn
426
from ecoinfo_dn
427
where creator = dn;
428

    
429
update doi_registration
430
set publisher = givenName|| ' ' || sn
431
from ecoinfo_dn
432
where publisher = dn;
433

    
434
select creator, count(*) as cnt
435
from doi_registration
436
where creator like 'uid=%'
437
group by creator
438
order by cnt desc;
439

    
440
-- copy to the external file
441
COPY 
442
(select ezid_account, 
443
guid as dc_identifier, 
444
url as datacite_url, 
445
title as dc_title, 
446
creator as dc_creator, 
447
publisher as dc_publisher, 
448
pub_date as datacite_publicationYear,
449
resourceType,
450
objectFormat,
451
obsoletedBy,
452
obsoletes,
453
resourceMapId,
454
resourceMapLocation,
455
access
456
from doi_registration 
457
order by dc_identifier) 
458
TO '/tmp/doi_registration.csv'
459
WITH CSV HEADER;
460
--164548
461

    
462
--drop table doi_registration;
463
--drop table ecogrid_docids;
464
--drop table ecoinfo_dn;
465