Project

General

Profile

1
-- use default values initially
2
CREATE TABLE doi_registration AS
3

    
4
select 
5
text('testing') as ezid_account,
6
doc.doctype,
7
id.guid, 
8
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url, 
9
text('Unknown') as title, 
10
text(null) as creator,
11
text(null) as publisher,
12
to_char(doc.date_created, 'YYYY') as pub_date
13
from identifier id, xml_documents doc
14
where guid like 'doi%'
15
and id.docid = doc.docid
16
and id.rev = doc.rev
17

    
18
UNION ALL
19

    
20
select 
21
text('testing') as ezid_account,
22
doc.doctype,
23
id.guid, 
24
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url, 
25
text('Unknown') as title, 
26
text(null) as creator,
27
text(null) as publisher,
28
to_char(doc.date_created, 'YYYY') as pub_date
29
from identifier id, xml_revisions doc
30
where guid like 'doi%'
31
and id.docid = doc.docid
32
and id.rev = doc.rev;
33

    
34
--update defaults
35
update doi_registration
36
set title = 'Data file'
37
where doctype = 'BIN';
38

    
39
update doi_registration
40
set title = 'Legacy EML file'
41
where doctype like '%eml%2.0.0beta%';
42

    
43
-- update title using node information
44
update doi_registration doi
45
set title = child.nodedata
46
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
47
where doi.guid = id.guid
48
and id.docid = docs.docid
49
and id.rev = docs.rev
50
and docs.rootnodeid = nodes.rootnodeid
51
and nodes.nodeid = child.parentnodeid
52
and nodes.parentnodeid = parent.nodeid
53
and nodes.nodename = 'title'
54
and parent.nodename = 'dataset';
55

    
56
-- update pubDate using nodes
57
update doi_registration doi
58
set pub_date = child.nodedata
59
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
60
where doi.guid = id.guid
61
and id.docid = docs.docid
62
and id.rev = docs.rev
63
and docs.rootnodeid = nodes.rootnodeid
64
and nodes.nodeid = child.parentnodeid
65
and nodes.parentnodeid = parent.nodeid
66
and nodes.nodename = 'pubDate'
67
and parent.nodename = 'dataset';
68

    
69
-- update publisher using nodes
70
update doi_registration doi
71
set publisher = child.nodedata
72
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
73
where doi.guid = id.guid
74
and id.docid = docs.docid
75
and id.rev = docs.rev
76
and docs.rootnodeid = nodes.rootnodeid
77
and nodes.nodeid = child.parentnodeid
78
and nodes.parentnodeid = parent.nodeid
79
and nodes.nodename = 'publisher'
80
and parent.nodename = 'dataset'
81
and child.nodetype = 'TEXT';
82

    
83
-- update organization using nodes
84
update doi_registration doi
85
set creator = child.nodedata
86
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
87
where doi.guid = id.guid
88
and id.docid = docs.docid
89
and id.rev = docs.rev
90
and docs.rootnodeid = nodes.rootnodeid
91
and nodes.nodeid = child.parentnodeid
92
and nodes.parentnodeid = parent.nodeid
93
and nodes.nodename = 'organizationName'
94
and parent.nodename = 'creator'
95
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
96

    
97
-- update creator using nodes
98
-- creator/individualName/surName
99
-- creator/individualName/givenName
100
update doi_registration doi
101
set creator = child.nodedata
102
from identifier id, 
103
xml_documents docs, 
104
xml_nodes nodes, 
105
xml_nodes parent, 
106
xml_nodes child, 
107
xml_nodes grandparent
108
where doi.guid = id.guid
109
and id.docid = docs.docid
110
and id.rev = docs.rev
111
and docs.rootnodeid = nodes.rootnodeid
112
and nodes.nodeid = child.parentnodeid
113
and nodes.parentnodeid = parent.nodeid
114
and parent.parentnodeid = grandparent.nodeid
115
and nodes.nodename = 'surName'
116
and parent.nodename = 'individualName'
117
and grandparent.nodename = 'creator'
118
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
119

    
120
-- add the first name if we have it
121
update doi_registration doi
122
set creator = child.nodedata ||' '|| creator
123
from identifier id, 
124
xml_documents docs, 
125
xml_nodes nodes, 
126
xml_nodes parent, 
127
xml_nodes child, 
128
xml_nodes grandparent
129
where doi.guid = id.guid
130
and id.docid = docs.docid
131
and id.rev = docs.rev
132
and docs.rootnodeid = nodes.rootnodeid
133
and nodes.nodeid = child.parentnodeid
134
and nodes.parentnodeid = parent.nodeid
135
and parent.parentnodeid = grandparent.nodeid
136
and nodes.nodename = 'givenName'
137
and parent.nodename = 'individualName'
138
and grandparent.nodename = 'creator'
139
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
140

    
141
-- update title using revisions
142
update doi_registration doi
143
set title = child.nodedata
144
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
145
where doi.guid = id.guid
146
and id.docid = docs.docid
147
and id.rev = docs.rev
148
and docs.rootnodeid = nodes.rootnodeid
149
and nodes.nodeid = child.parentnodeid
150
and nodes.parentnodeid = parent.nodeid
151
and nodes.nodename = 'title'
152
and parent.nodename = 'dataset';
153

    
154
-- update pubDate using revisions
155
update doi_registration doi
156
set pub_date = child.nodedata
157
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
158
where doi.guid = id.guid
159
and id.docid = docs.docid
160
and id.rev = docs.rev
161
and docs.rootnodeid = nodes.rootnodeid
162
and nodes.nodeid = child.parentnodeid
163
and nodes.parentnodeid = parent.nodeid
164
and nodes.nodename = 'pubDate'
165
and parent.nodename = 'dataset';
166

    
167
-- update publisher using revisions
168
update doi_registration doi
169
set publisher = child.nodedata
170
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
171
where doi.guid = id.guid
172
and id.docid = docs.docid
173
and id.rev = docs.rev
174
and docs.rootnodeid = nodes.rootnodeid
175
and nodes.nodeid = child.parentnodeid
176
and nodes.parentnodeid = parent.nodeid
177
and nodes.nodename = 'publisher'
178
and parent.nodename = 'dataset'
179
and child.nodetype = 'TEXT';
180

    
181
-- update organization using nodes
182
update doi_registration doi
183
set creator = child.nodedata
184
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
185
where doi.guid = id.guid
186
and id.docid = docs.docid
187
and id.rev = docs.rev
188
and docs.rootnodeid = nodes.rootnodeid
189
and nodes.nodeid = child.parentnodeid
190
and nodes.parentnodeid = parent.nodeid
191
and nodes.nodename = 'organizationName'
192
and parent.nodename = 'creator'
193
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
194

    
195
-- update creator using revisions
196
update doi_registration doi
197
set creator = child.nodedata
198
from identifier id, 
199
xml_revisions docs, 
200
xml_nodes_revisions nodes, 
201
xml_nodes_revisions parent, 
202
xml_nodes_revisions child, 
203
xml_nodes_revisions grandparent
204
where doi.guid = id.guid
205
and id.docid = docs.docid
206
and id.rev = docs.rev
207
and docs.rootnodeid = nodes.rootnodeid
208
and nodes.nodeid = child.parentnodeid
209
and nodes.parentnodeid = parent.nodeid
210
and parent.parentnodeid = grandparent.nodeid
211
and nodes.nodename = 'surName'
212
and parent.nodename = 'individualName'
213
and grandparent.nodename = 'creator'
214
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
215

    
216
-- add the first name if we have it
217
update doi_registration doi
218
set creator = child.nodedata ||' '|| creator
219
from identifier id, 
220
xml_revisions docs, 
221
xml_nodes_revisions nodes, 
222
xml_nodes_revisions parent, 
223
xml_nodes_revisions child, 
224
xml_nodes_revisions grandparent
225
where doi.guid = id.guid
226
and id.docid = docs.docid
227
and id.rev = docs.rev
228
and docs.rootnodeid = nodes.rootnodeid
229
and nodes.nodeid = child.parentnodeid
230
and nodes.parentnodeid = parent.nodeid
231
and parent.parentnodeid = grandparent.nodeid
232
and nodes.nodename = 'givenName'
233
and parent.nodename = 'individualName'
234
and grandparent.nodename = 'creator'
235
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
236

    
237
-- use xml_documents for defaults that are still missing 
238
update doi_registration doi
239
set creator = doc.user_owner
240
from xml_documents doc, identifier id
241
where doi.guid = id.guid
242
and id.docid = doc.docid
243
and id.rev = doc.rev
244
and doi.creator is null;
245

    
246
update doi_registration doi
247
set creator = doc.user_owner
248
from xml_revisions doc, identifier id
249
where doi.guid = id.guid
250
and id.docid = doc.docid
251
and id.rev = doc.rev
252
and doi.creator is null;
253

    
254
-- set publisher
255
update doi_registration
256
set publisher = creator
257
where publisher is null or publisher = '';
258

    
259
-- clean up
260
update doi_registration
261
set title = replace(title, E'\n', ' ');
262

    
263
update doi_registration
264
set title = regexp_replace(title, E'\\s+', ' ', 'g');
265

    
266
update doi_registration
267
set title = regexp_replace(title, E'^\\s+', '', 'g');
268

    
269
update doi_registration
270
set ezid_account = 'KNB'
271
where guid like 'doi:10.5063%';
272
--16988
273

    
274
update doi_registration
275
set ezid_account = 'PISCO'
276
where guid like 'doi:10.6085%';
277
--98078
278

    
279
update doi_registration
280
set ezid_account = 'LTER'
281
where guid like 'doi:10.6073%';
282
--49482
283

    
284
/**
285
 * Additional modifications, July 5th, 2012
286
 */
287
ALTER TABLE doi_registration
288
ADD COLUMN resourceType text,
289
ADD COLUMN objectFormat text,
290
ADD COLUMN obsoletedBy text,
291
ADD COLUMN obsoletes text,
292
ADD COLUMN resourceMapId text,
293
ADD COLUMN resourceMapLocation text,
294
ADD COLUMN access text;
295

    
296
-- update access values
297
update doi_registration
298
set access = 'protected';
299

    
300
update doi_registration doi
301
set access = 'public'
302
from xml_access a
303
where doi.guid = a.guid
304
and a.principal_name = 'public'
305
and a.permission >= '4'
306
and a.perm_type = 'allow';
307

    
308
-- update the objectFormat from xml_documents
309
update doi_registration doi
310
set objectFormat = doc.doctype
311
from identifier id, xml_documents doc
312
where doi.guid = id.guid
313
and id.docid = doc.docid;
314

    
315
-- update the objectFormat from xml_revisions
316
update doi_registration doi
317
set objectFormat = rev.doctype
318
from identifier id, xml_revisions rev
319
where doi.guid = id.guid
320
and id.docid = rev.docid;
321

    
322
--update resourceType
323
update doi_registration
324
set resourceType = 'Dataset/METADATA';
325

    
326
update doi_registration
327
set resourceType = 'Dataset/DATA'
328
where objectFormat = 'BIN';
329

    
330
-- update the objectFormat from SM table (will be subset)
331
update doi_registration doi
332
set objectFormat = sm.object_format
333
from systemMetadata sm
334
where doi.guid = sm.guid;
335
--16938
336

    
337
--update revision history from SM
338
update doi_registration doi
339
set obsoletes = sm.obsoletes,
340
obsoletedBy = sm.obsoleted_by
341
from systemMetadata sm
342
where doi.guid = sm.guid;
343

    
344
/** use plain old revision history **/
345
-- update obsoletedby
346
update doi_registration doi
347
set obsoletedBy = newer.guid
348
from identifier id, identifier newer
349
where doi.guid = id.guid
350
and id.docid = newer.docid
351
and newer.rev = (select min(next.rev) from identifier next where next.docid = id.docid and next.rev > id.rev);
352

    
353
-- update the obsolets
354
update doi_registration doi
355
set obsoletes = older.guid
356
from identifier id, identifier older
357
where doi.guid = id.guid
358
and id.docid = older.docid
359
and older.rev = (select max(prev.rev) from identifier prev where prev.docid = id.docid and prev.rev < id.rev);
360

    
361
/**
362
select doi.guid, older.guid as obsoletes 
363
from doi_registration doi, identifier id, identifier older
364
where doi.guid = id.guid
365
and id.docid = older.docid
366
and older.rev = (select max(prev.rev) from identifier prev where prev.docid = id.docid and prev.rev < id.rev)
367
and doi.guid like 'doi:10.5063/AA/ABS.4%';
368
**/
369

    
370

    
371
-- update resourceMap pointer
372
-- NOTE: some of these maps might not actually exist
373
update doi_registration doi
374
set resourceMapId = 'resourceMap_'||id.docid||'.'||id.rev
375
from identifier id
376
where doi.guid = id.guid
377
and id.docid = rev.docid
378
and objectFormat like 'eml%';
379

    
380
--update the resourcemapid for described data
381
CREATE TABLE ecogrid_docids AS
382
select distinct id.docid, id.rev, id.guid, substring(xpi.nodedata from 'ecogrid://knb/(.*)$') as data_docid
383
from xml_path_index xpi, identifier id, xml_documents xmld
384
where xpi.docid = xmld.docid 
385
and xmld.docid = id.docid
386
and xmld.rev = id.rev
387
and xpi.path like 'dataset/%/physical/distribution/online/url' 
388
and xpi.nodedata like 'ecogrid%'
389
and id.guid in (select guid from doi_registration);
390

    
391
-- include revisions for described data
392
INSERT INTO ecogrid_docids
393
select distinct id.docid, id.rev, id.guid, substring(child.nodedata from 'ecogrid://knb/(.*)$') as data_docid
394
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
395
where id.docid = docs.docid
396
and id.rev = docs.rev
397
and docs.rootnodeid = nodes.rootnodeid
398
and nodes.nodeid = child.parentnodeid
399
and nodes.parentnodeid = parent.nodeid
400
and nodes.nodename = 'url'
401
and parent.nodename = 'online'
402
and child.nodedata like 'ecogrid%'
403
and child.nodetype = 'TEXT'
404
and id.guid in (select guid from doi_registration);
405

    
406
-- NOTE: some of thee maps might not actually exist
407
update doi_registration doi
408
set resourceMapId = 'resourceMap_'||eco.docid||'.'||eco.rev
409
from identifier id, ecogrid_docids eco
410
where doi.guid = id.guid
411
and id.docid||'.'||id.rev = eco.data_docid;
412

    
413
select count(*)
414
from doi_registration
415
where resourceMapId is null;
416

    
417
--update the resource map location
418
update doi_registration
419
set resourceMapLocation = 'https://cn.dataone.org/cn/v1/resolve/' || resourceMapId
420
where resourceMapId is not null;
421

    
422
--update creator usinig LDAP lookup
423
CREATE TABLE ecoinfo_dn (dn text, givenName text, sn text);
424
COPY ecoinfo_dn FROM '/tmp/ecoinfo_dn.csv' WITH CSV HEADER;
425

    
426
update doi_registration
427
set creator = givenName|| ' ' || sn
428
from ecoinfo_dn
429
where creator = dn;
430

    
431
update doi_registration
432
set publisher = givenName|| ' ' || sn
433
from ecoinfo_dn
434
where publisher = dn;
435

    
436
select creator, count(*) as cnt
437
from doi_registration
438
where creator like 'uid=%'
439
group by creator
440
order by cnt desc;
441

    
442
-- copy to the external file
443
COPY 
444
(select ezid_account, 
445
guid as dc_identifier, 
446
url as datacite_url, 
447
title as dc_title, 
448
creator as dc_creator, 
449
publisher as dc_publisher, 
450
pub_date as datacite_publicationYear,
451
resourceType,
452
objectFormat,
453
obsoletedBy,
454
obsoletes,
455
resourceMapId,
456
resourceMapLocation,
457
access
458
from doi_registration 
459
order by dc_identifier) 
460
TO '/tmp/doi_registration.csv'
461
WITH CSV HEADER;
462
--164548
463

    
464
--drop table doi_registration;
465
--drop table ecogrid_docids;
466
--drop table ecoinfo_dn;
467

    
(10-10/63)