-- use default values initially CREATE TABLE doi_registration AS select text('testing') as ezid_account, doc.doctype, id.guid, 'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url, text('Unknown') as title, text(null) as creator, text(null) as publisher, to_char(doc.date_created, 'YYYY') as pub_date from identifier id, xml_documents doc where guid like 'doi%' and id.docid = doc.docid and id.rev = doc.rev UNION ALL select text('testing') as ezid_account, doc.doctype, id.guid, 'https://cn.dataone.org/cn/v1/resolve/' || regexp_replace(regexp_replace(id.guid, '/', '%2F', 'g'), ':', '%3A', 'g') as url, text('Unknown') as title, text(null) as creator, text(null) as publisher, to_char(doc.date_created, 'YYYY') as pub_date from identifier id, xml_revisions doc where guid like 'doi%' and id.docid = doc.docid and id.rev = doc.rev; --update defaults update doi_registration set title = 'Data file' where doctype = 'BIN'; update doi_registration set title = 'Legacy EML file' where doctype like '%eml%2.0.0beta%'; -- update title using node information update doi_registration doi set title = child.nodedata from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'title' and parent.nodename = 'dataset'; -- update pubDate using nodes update doi_registration doi set pub_date = child.nodedata from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'pubDate' and parent.nodename = 'dataset'; -- update publisher using nodes update doi_registration doi set publisher = child.nodedata from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'organizationName' and parent.nodename = 'publisher' and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT'); -- update publisher surName update doi_registration doi set publisher = child.nodedata from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child, xml_nodes grandparent where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and parent.parentnodeid = grandparent.nodeid and nodes.nodename = 'surName' and parent.nodename = 'individualName' and grandparent.nodename = 'publisher' and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT'); -- add the first name if we have it update doi_registration doi set publisher = child.nodedata ||' '|| creator from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child, xml_nodes grandparent where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and parent.parentnodeid = grandparent.nodeid and nodes.nodename = 'givenName' and parent.nodename = 'individualName' and grandparent.nodename = 'publisher' and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT'); -- update organization using nodes update doi_registration doi set creator = child.nodedata from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'organizationName' and parent.nodename = 'creator' and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT'); -- update creator using nodes -- creator/individualName/surName -- creator/individualName/givenName update doi_registration doi set creator = child.nodedata from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child, xml_nodes grandparent where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and parent.parentnodeid = grandparent.nodeid and nodes.nodename = 'surName' and parent.nodename = 'individualName' and grandparent.nodename = 'creator' and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT'); -- add the first name if we have it update doi_registration doi set creator = child.nodedata ||' '|| creator from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child, xml_nodes grandparent where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and parent.parentnodeid = grandparent.nodeid and nodes.nodename = 'givenName' and parent.nodename = 'individualName' and grandparent.nodename = 'creator' and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT'); -- update title using revisions update doi_registration doi set title = child.nodedata from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'title' and parent.nodename = 'dataset'; -- update pubDate using revisions update doi_registration doi set pub_date = child.nodedata from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'pubDate' and parent.nodename = 'dataset'; -- update publisher using revisions update doi_registration doi set publisher = child.nodedata from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'organizationName' and parent.nodename = 'publisher' and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT'); -- use publisher individual name if we have it update doi_registration doi set publisher = child.nodedata from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child, xml_nodes_revisions grandparent where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and parent.parentnodeid = grandparent.nodeid and nodes.nodename = 'surName' and parent.nodename = 'individualName' and grandparent.nodename = 'publisher' and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT'); --include the first name (careful not to rerun this part) update doi_registration doi set publisher = child.nodedata ||' '|| creator from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child, xml_nodes_revisions grandparent where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and parent.parentnodeid = grandparent.nodeid and nodes.nodename = 'givenName' and parent.nodename = 'individualName' and grandparent.nodename = 'publisher' and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT'); -- update organization using nodes update doi_registration doi set creator = child.nodedata from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'organizationName' and parent.nodename = 'creator' and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT'); -- update creator using revisions update doi_registration doi set creator = child.nodedata from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child, xml_nodes_revisions grandparent where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and parent.parentnodeid = grandparent.nodeid and nodes.nodename = 'surName' and parent.nodename = 'individualName' and grandparent.nodename = 'creator' and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT'); -- add the first name if we have it update doi_registration doi set creator = child.nodedata ||' '|| creator from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child, xml_nodes_revisions grandparent where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and parent.parentnodeid = grandparent.nodeid and nodes.nodename = 'givenName' and parent.nodename = 'individualName' and grandparent.nodename = 'creator' and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT'); -- use xml_documents for defaults that are still missing update doi_registration doi set creator = doc.user_owner from xml_documents doc, identifier id where doi.guid = id.guid and id.docid = doc.docid and id.rev = doc.rev and doi.creator is null; update doi_registration doi set creator = doc.user_owner from xml_revisions doc, identifier id where doi.guid = id.guid and id.docid = doc.docid and id.rev = doc.rev and doi.creator is null; -- set publisher update doi_registration set publisher = creator where publisher is null or publisher = ''; -- clean up update doi_registration set title = replace(title, E'\n', ' '); update doi_registration set title = regexp_replace(title, E'\\s+', ' ', 'g'); update doi_registration set title = regexp_replace(title, E'^\\s+', '', 'g'); update doi_registration set ezid_account = 'KNB' where guid like 'doi:10.5063%'; --16988 update doi_registration set ezid_account = 'PISCO' where guid like 'doi:10.6085%'; --98078 update doi_registration set ezid_account = 'LTER' where guid like 'doi:10.6073%'; --49482 /** * Additional modifications, July 5th, 2012 */ ALTER TABLE doi_registration ADD COLUMN resourceType text, ADD COLUMN objectFormat text, ADD COLUMN obsoletedBy text, ADD COLUMN obsoletes text, ADD COLUMN resourceMapId text, ADD COLUMN resourceMapLocation text, ADD COLUMN access text; -- update access values update doi_registration set access = 'protected'; update doi_registration doi set access = 'public' from xml_access a where doi.guid = a.guid and a.principal_name = 'public' and a.permission >= '4' and a.perm_type = 'allow'; -- update the objectFormat from xml_documents update doi_registration doi set objectFormat = doc.doctype from identifier id, xml_documents doc where doi.guid = id.guid and id.docid = doc.docid and id.rev = doc.rev; -- update the objectFormat from xml_revisions update doi_registration doi set objectFormat = rev.doctype from identifier id, xml_revisions rev where doi.guid = id.guid and id.docid = rev.docid and id.rev = rev.rev; --update resourceType update doi_registration set resourceType = 'Dataset/METADATA'; update doi_registration set resourceType = 'Dataset/DATA' where objectFormat = 'BIN'; -- update the objectFormat from SM table (will be subset) update doi_registration doi set objectFormat = sm.object_format from systemMetadata sm where doi.guid = sm.guid; --16938 --update revision history from SM update doi_registration doi set obsoletes = sm.obsoletes, obsoletedBy = sm.obsoleted_by from systemMetadata sm where doi.guid = sm.guid; /** use plain old revision history **/ -- update obsoletedby update doi_registration doi set obsoletedBy = newer.guid from identifier id, identifier newer where doi.guid = id.guid and id.docid = newer.docid and newer.rev = (select min(next.rev) from identifier next where next.docid = id.docid and next.rev > id.rev); -- update the obsolets update doi_registration doi set obsoletes = older.guid from identifier id, identifier older where doi.guid = id.guid and id.docid = older.docid and older.rev = (select max(prev.rev) from identifier prev where prev.docid = id.docid and prev.rev < id.rev); /** select doi.guid, older.guid as obsoletes from doi_registration doi, identifier id, identifier older where doi.guid = id.guid and id.docid = older.docid and older.rev = (select max(prev.rev) from identifier prev where prev.docid = id.docid and prev.rev < id.rev) and doi.guid like 'doi:10.5063/AA/ABS.4%'; **/ -- update resourceMap pointer -- NOTE: some of these maps might not actually exist update doi_registration doi set resourceMapId = 'resourceMap_'||id.docid||'.'||id.rev from identifier id where doi.guid = id.guid and objectFormat like 'eml%'; --update the resourcemapid for described data CREATE TABLE ecogrid_docids AS select distinct id.docid, id.rev, id.guid, substring(xpi.nodedata from 'ecogrid://knb/(.*)$') as data_docid from xml_path_index xpi, identifier id, xml_documents xmld where xpi.docid = xmld.docid and xmld.docid = id.docid and xmld.rev = id.rev and xpi.path like 'dataset/%/physical/distribution/online/url' and xpi.nodedata like 'ecogrid%' and id.guid in (select guid from doi_registration); -- include revisions for described data INSERT INTO ecogrid_docids select distinct id.docid, id.rev, id.guid, substring(child.nodedata from 'ecogrid://knb/(.*)$') as data_docid from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child where id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'url' and parent.nodename = 'online' and child.nodedata like 'ecogrid%' and child.nodetype = 'TEXT' and id.guid in (select guid from doi_registration); -- Set the resource map for the data files (NOTE: some of thee maps might not actually exist on the system depending on how successful the ORE generation was) update doi_registration doi set resourceMapId = 'resourceMap_'||eco.docid||'.'||eco.rev from identifier id, ecogrid_docids eco where doi.guid = id.guid and id.docid||'.'||id.rev = eco.data_docid; -- set the resource map id for the metadata file that did most of the packaging work! update doi_registration doi set resourceMapId = 'resourceMap_'||eco.docid||'.'||eco.rev from ecogrid_docids eco where doi.guid = eco.guid; select count(*) from doi_registration where resourceMapId is null; --update the resource map location update doi_registration set resourceMapLocation = 'https://cn.dataone.org/cn/v1/resolve/' || resourceMapId where resourceMapId is not null; -- fix null values (should not have to have this now, but it will not hurt) update doi_registration doi set creator = user_owner from identifier id, xml_documents docs where doi.guid = id.guid and id.docid = docs.docid and id.rev= docs.rev and (creator is null or trim(creator) = ''); update doi_registration doi set publisher = user_owner from identifier id, xml_documents docs where doi.guid = id.guid and id.docid = docs.docid and id.rev= docs.rev and (publisher is null or trim(publisher) = ''); update doi_registration doi set creator = user_owner from identifier id, xml_revisions docs where doi.guid = id.guid and id.docid = docs.docid and id.rev= docs.rev and (creator is null or trim(creator) = ''); update doi_registration doi set publisher = user_owner from identifier id, xml_revisions docs where doi.guid = id.guid and id.docid = docs.docid and id.rev= docs.rev and (publisher is null or trim(publisher) = ''); -- fix a previous mistake update doi_registration set publisher = null where publisher = 'document'; --update creator usinig LDAP lookup CREATE TABLE ecoinfo_dn (dn text, givenName text, sn text); --TRUNCATE TABLE ecoinfo_dn; COPY ecoinfo_dn FROM '/tmp/ecoinfo_dn.csv' WITH CSV HEADER; update doi_registration set creator = case when givenName is null then sn else givenName|| ' ' || sn end from ecoinfo_dn where trim(both from lower(creator)) = lower(dn); update doi_registration set publisher = case when givenName is null then sn else givenName|| ' ' || sn end from ecoinfo_dn where trim(both from lower(publisher)) = lower(dn); select creator, count(*) as cnt from doi_registration where creator like 'uid=%' group by creator order by cnt desc; -- update the data files with owner/publisher from EML entry update doi_registration doi set creator = meta.creator, publisher = meta.publisher from doi_registration meta, identifier id, ecogrid_docids eco where doi.guid = id.guid and id.docid||'.'||id.rev = eco.data_docid and eco.guid = meta.guid; -- clean up KBS entries that have empty user_owner update doi_registration set creator = 'Kellogg Biological Station' where (creator is null or trim(creator) = '') and guid like '%knb-lter-kbs%'; update doi_registration set publisher = 'Kellogg Biological Station' where (publisher is null or trim(publisher) = '') and guid like '%knb-lter-kbs%'; -- copy to the external file COPY (select ezid_account, guid as dc_identifier, url as datacite_url, title as dc_title, creator as dc_creator, publisher as dc_publisher, pub_date as datacite_publicationYear, resourceType, objectFormat, obsoletedBy, obsoletes, resourceMapId, resourceMapLocation, access from doi_registration order by dc_identifier) TO '/tmp/doi_registration.csv' WITH CSV HEADER; --164548 --drop table doi_registration; --drop table ecogrid_docids; --drop table ecoinfo_dn;