Project

General

Profile

1
-- use default values initially
2
CREATE TABLE doi_registration AS
3

    
4
select 
5
text('testing') as ezid_account,
6
doc.doctype,
7
id.guid, 
8
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url, 
9
text('Unknown') as title, 
10
text(null) as creator,
11
text(null) as publisher,
12
to_char(doc.date_created, 'YYYY') as pub_date
13
from identifier id, xml_documents doc
14
where guid like 'doi%'
15
and id.docid = doc.docid
16
and id.rev = doc.rev
17

    
18
UNION ALL
19

    
20
select 
21
text('testing') as ezid_account,
22
doc.doctype,
23
id.guid, 
24
'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url, 
25
text('Unknown') as title, 
26
text(null) as creator,
27
text(null) as publisher,
28
to_char(doc.date_created, 'YYYY') as pub_date
29
from identifier id, xml_revisions doc
30
where guid like 'doi%'
31
and id.docid = doc.docid
32
and id.rev = doc.rev;
33

    
34
--update defaults
35
update doi_registration
36
set title = 'Data file'
37
where doctype = 'BIN';
38

    
39
update doi_registration
40
set title = 'Legacy EML file'
41
where doctype like '%eml%2.0.0beta%';
42

    
43
-- update title using node information
44
update doi_registration doi
45
set title = child.nodedata
46
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
47
where doi.guid = id.guid
48
and id.docid = docs.docid
49
and id.rev = docs.rev
50
and docs.rootnodeid = nodes.rootnodeid
51
and nodes.nodeid = child.parentnodeid
52
and nodes.parentnodeid = parent.nodeid
53
and nodes.nodename = 'title'
54
and parent.nodename = 'dataset';
55

    
56
-- update pubDate using nodes
57
update doi_registration doi
58
set pub_date = child.nodedata
59
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
60
where doi.guid = id.guid
61
and id.docid = docs.docid
62
and id.rev = docs.rev
63
and docs.rootnodeid = nodes.rootnodeid
64
and nodes.nodeid = child.parentnodeid
65
and nodes.parentnodeid = parent.nodeid
66
and nodes.nodename = 'pubDate'
67
and parent.nodename = 'dataset';
68

    
69
-- update publisher using nodes
70
update doi_registration doi
71
set publisher = child.nodedata
72
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
73
where doi.guid = id.guid
74
and id.docid = docs.docid
75
and id.rev = docs.rev
76
and docs.rootnodeid = nodes.rootnodeid
77
and nodes.nodeid = child.parentnodeid
78
and nodes.parentnodeid = parent.nodeid
79
and nodes.nodename = 'publisher'
80
and parent.nodename = 'dataset';
81

    
82
-- update organization using nodes
83
update doi_registration doi
84
set creator = child.nodedata
85
from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child
86
where doi.guid = id.guid
87
and id.docid = docs.docid
88
and id.rev = docs.rev
89
and docs.rootnodeid = nodes.rootnodeid
90
and nodes.nodeid = child.parentnodeid
91
and nodes.parentnodeid = parent.nodeid
92
and nodes.nodename = 'organizationName'
93
and parent.nodename = 'creator'
94
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
95

    
96
-- update creator using nodes
97
-- creator/individualName/surName
98
-- creator/individualName/givenName
99
update doi_registration doi
100
set creator = child.nodedata
101
from identifier id, 
102
xml_documents docs, 
103
xml_nodes nodes, 
104
xml_nodes parent, 
105
xml_nodes child, 
106
xml_nodes grandparent
107
where doi.guid = id.guid
108
and id.docid = docs.docid
109
and id.rev = docs.rev
110
and docs.rootnodeid = nodes.rootnodeid
111
and nodes.nodeid = child.parentnodeid
112
and nodes.parentnodeid = parent.nodeid
113
and parent.parentnodeid = grandparent.nodeid
114
and nodes.nodename = 'surName'
115
and parent.nodename = 'individualName'
116
and grandparent.nodename = 'creator'
117
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
118

    
119
-- add the first name if we have it
120
update doi_registration doi
121
set creator = child.nodedata ||' '|| creator
122
from identifier id, 
123
xml_documents docs, 
124
xml_nodes nodes, 
125
xml_nodes parent, 
126
xml_nodes child, 
127
xml_nodes grandparent
128
where doi.guid = id.guid
129
and id.docid = docs.docid
130
and id.rev = docs.rev
131
and docs.rootnodeid = nodes.rootnodeid
132
and nodes.nodeid = child.parentnodeid
133
and nodes.parentnodeid = parent.nodeid
134
and parent.parentnodeid = grandparent.nodeid
135
and nodes.nodename = 'givenName'
136
and parent.nodename = 'individualName'
137
and grandparent.nodename = 'creator'
138
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
139

    
140
-- update title using revisions
141
update doi_registration doi
142
set title = child.nodedata
143
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
144
where doi.guid = id.guid
145
and id.docid = docs.docid
146
and id.rev = docs.rev
147
and docs.rootnodeid = nodes.rootnodeid
148
and nodes.nodeid = child.parentnodeid
149
and nodes.parentnodeid = parent.nodeid
150
and nodes.nodename = 'title'
151
and parent.nodename = 'dataset';
152

    
153
-- update pubDate using revisions
154
update doi_registration doi
155
set pub_date = child.nodedata
156
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
157
where doi.guid = id.guid
158
and id.docid = docs.docid
159
and id.rev = docs.rev
160
and docs.rootnodeid = nodes.rootnodeid
161
and nodes.nodeid = child.parentnodeid
162
and nodes.parentnodeid = parent.nodeid
163
and nodes.nodename = 'pubDate'
164
and parent.nodename = 'dataset';
165

    
166
-- update publisher using revisions
167
update doi_registration doi
168
set publisher = child.nodedata
169
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
170
where doi.guid = id.guid
171
and id.docid = docs.docid
172
and id.rev = docs.rev
173
and docs.rootnodeid = nodes.rootnodeid
174
and nodes.nodeid = child.parentnodeid
175
and nodes.parentnodeid = parent.nodeid
176
and nodes.nodename = 'publisher'
177
and parent.nodename = 'dataset';
178

    
179
-- update organization using nodes
180
update doi_registration doi
181
set creator = child.nodedata
182
from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child
183
where doi.guid = id.guid
184
and id.docid = docs.docid
185
and id.rev = docs.rev
186
and docs.rootnodeid = nodes.rootnodeid
187
and nodes.nodeid = child.parentnodeid
188
and nodes.parentnodeid = parent.nodeid
189
and nodes.nodename = 'organizationName'
190
and parent.nodename = 'creator'
191
and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT');
192

    
193
-- update creator using revisions
194
update doi_registration doi
195
set creator = child.nodedata
196
from identifier id, 
197
xml_revisions docs, 
198
xml_nodes_revisions nodes, 
199
xml_nodes_revisions parent, 
200
xml_nodes_revisions child, 
201
xml_nodes_revisions grandparent
202
where doi.guid = id.guid
203
and id.docid = docs.docid
204
and id.rev = docs.rev
205
and docs.rootnodeid = nodes.rootnodeid
206
and nodes.nodeid = child.parentnodeid
207
and nodes.parentnodeid = parent.nodeid
208
and parent.parentnodeid = grandparent.nodeid
209
and nodes.nodename = 'surName'
210
and parent.nodename = 'individualName'
211
and grandparent.nodename = 'creator'
212
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
213

    
214
-- add the first name if we have it
215
update doi_registration doi
216
set creator = child.nodedata ||' '|| creator
217
from identifier id, 
218
xml_revisions docs, 
219
xml_nodes_revisions nodes, 
220
xml_nodes_revisions parent, 
221
xml_nodes_revisions child, 
222
xml_nodes_revisions grandparent
223
where doi.guid = id.guid
224
and id.docid = docs.docid
225
and id.rev = docs.rev
226
and docs.rootnodeid = nodes.rootnodeid
227
and nodes.nodeid = child.parentnodeid
228
and nodes.parentnodeid = parent.nodeid
229
and parent.parentnodeid = grandparent.nodeid
230
and nodes.nodename = 'givenName'
231
and parent.nodename = 'individualName'
232
and grandparent.nodename = 'creator'
233
and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT');
234

    
235
-- use xml_documents for defaults that are still missing 
236
update doi_registration doi
237
set creator = doc.user_owner
238
from xml_documents doc, identifier id
239
where doi.guid = id.guid
240
and id.docid = doc.docid
241
and id.rev = doc.rev
242
and doi.creator is null;
243

    
244
update doi_registration doi
245
set creator = doc.user_owner
246
from xml_revisions doc, identifier id
247
where doi.guid = id.guid
248
and id.docid = doc.docid
249
and id.rev = doc.rev
250
and doi.creator is null;
251

    
252
-- set publisher
253
update doi_registration
254
set publisher = creator
255
where publisher is null or publisher = '';
256

    
257
-- clean up
258
update doi_registration
259
set title = replace(title, E'\n', ' ');
260

    
261
update doi_registration
262
set title = regexp_replace(title, E'\\s+', ' ', 'g');
263

    
264
update doi_registration
265
set title = regexp_replace(title, E'^\\s+', '', 'g');
266

    
267
update doi_registration
268
set ezid_account = 'KNB'
269
where guid like 'doi:10.5063%';
270
--16988
271

    
272
update doi_registration
273
set ezid_account = 'PISCO'
274
where guid like 'doi:10.6085%';
275
--98078
276

    
277
update doi_registration
278
set ezid_account = 'LTER'
279
where guid like 'doi:10.6073%';
280
--49482
281

    
282
/**
283
 * Additional modifications, July 5th, 2012
284
 */
285
ALTER TABLE doi_registration
286
ADD COLUMN resourceType text,
287
ADD COLUMN objectFormat text,
288
ADD COLUMN obsoletedBy text,
289
ADD COLUMN obsoletes text,
290
ADD COLUMN resourceMapId text,
291
ADD COLUMN resourceMapLocation text,
292
ADD COLUMN access text;
293

    
294
-- update access values
295
update doi_registration
296
set access = 'protected';
297

    
298
update doi_registration doi
299
set access = 'public'
300
from xml_access a
301
where doi.guid = a.guid
302
and a.principal_name = 'public'
303
and a.permission >= '4'
304
and a.perm_type = 'allow';
305

    
306
-- update the objectFormat from xml_documents
307
update doi_registration doi
308
set objectFormat = doc.doctype
309
from identifier id, xml_documents doc
310
where doi.guid = id.guid
311
and id.docid = doc.docid;
312

    
313
-- update the objectFormat from xml_revisions
314
update doi_registration doi
315
set objectFormat = rev.doctype
316
from identifier id, xml_revisions rev
317
where doi.guid = id.guid
318
and id.docid = rev.docid;
319

    
320
--update resourceType
321
update doi_registration
322
set resourceType = 'Dataset/METADATA';
323

    
324
update doi_registration
325
set resourceType = 'Dataset/DATA'
326
where objectFormat = 'BIN';
327

    
328
-- update the objectFormat from SM table (will be subset)
329
update doi_registration doi
330
set objectFormat = sm.object_format
331
from systemMetadata sm
332
where doi.guid = sm.guid;
333
--16938
334

    
335
--update revision history from SM
336
update doi_registration doi
337
set obsoletes = sm.obsoletes,
338
obsoletedBy = sm.obsoleted_by
339
from systemMetadata sm
340
where doi.guid = sm.guid;
341

    
342
--update resourceMap pointer TBD
343
-- NOTE: some of these maps might not actually exist
344
update doi_registration doi
345
set resourceMapId = 'resourceMap_'||id.docid||'.'||id.rev
346
from identifier id
347
where doi.guid = id.guid
348
and id.docid = rev.docid
349
and objectFormat like 'eml%';
350

    
351
--update the resourcemapid for described data
352
CREATE TABLE ecogrid_docids AS
353

    
354
select distinct id.docid, id.rev, id.guid, substring(xpi.nodedata from 'ecogrid://knb/(.*)$') as data_docid
355
from xml_path_index xpi, identifier id, xml_documents xmld
356
where xpi.docid = xmld.docid 
357
and xmld.docid = id.docid
358
and xmld.rev = id.rev
359
and xpi.path like 'dataset/%/physical/distribution/online/url' 
360
and xpi.nodedata like 'ecogrid%';
361

    
362
-- TODO: include revisions for described data
363

    
364
-- NOTE: some of thee maps might not actually exist
365
update doi_registration doi
366
set resourceMapId = 'resourceMap_'||eco.docid||'.'||eco.rev
367
from identifier id, ecogrid_docids eco
368
where doi.guid = id.guid
369
and id.docid||'.'||id.rev = eco.data_docid;
370

    
371
--update the resource map location
372
update doi_registration
373
set resourceMapLocation = 'https://cn.dataone.org/cn/v1/resolve/' || resourceMapId
374
where resourceMapId is not null;
375

    
376
--update creator usinig LDAP lookup
377
CREATE TABLE ecoinfo_dn (dn text, givenName text, sn text);
378
COPY ecoinfo_dn FROM '/tmp/ecoinfo_dn.csv' WITH CSV HEADER;
379

    
380
update doi_registration
381
set creator = givenName|| ' ' || sn
382
from ecoinfo_dn
383
where creator = dn;
384

    
385
update doi_registration
386
set publisher = givenName|| ' ' || sn
387
from ecoinfo_dn
388
where publisher = dn;
389

    
390
select creator, count(*) as cnt
391
from doi_registration
392
where creator like 'uid=%'
393
group by creator
394
order by cnt desc;
395

    
396
-- copy to the external file
397
COPY 
398
(select ezid_account, 
399
guid as dc_identifier, 
400
url as datacite_url, 
401
title as dc_title, 
402
creator as dc_creator, 
403
publisher as dc_publisher, 
404
pub_date as datacite_publicationYear,
405
resourceType,
406
objectFormat,
407
obsoletedBy,
408
obsoletes,
409
resourceMapId,
410
resourceMapLocation,
411
access
412
from doi_registration 
413
order by dc_identifier) 
414
TO '/tmp/doi_registration.csv'
415
WITH CSV HEADER;
416
--164548
417

    
418
--drop table doi_registration;
419
--drop table ecogrid_docids;
420
--drop table ecoinfo_dn;
421

    
(10-10/63)