-- use default values initially CREATE TABLE doi_registration AS select text('testing') as ezid_account, doc.doctype, id.guid, 'https://cn-stage.dataone.org/cn/v1/resolve/' || id.guid as url, text('Unknown') as title, text(null) as creator, text(null) as publisher, to_char(doc.date_created, 'YYYY') as pub_date from identifier id, xml_documents doc where guid like 'doi%' and id.docid = doc.docid and id.rev = doc.rev UNION ALL select text('testing') as ezid_account, doc.doctype, id.guid, 'https://cn-stage.dataone.org/cn/v1/resolve/' || id.guid as url, text('Unknown') as title, text(null) as creator, text(null) as publisher, to_char(doc.date_created, 'YYYY') as pub_date from identifier id, xml_revisions doc where guid like 'doi%' and id.docid = doc.docid and id.rev = doc.rev; --update defaults update doi_registration set title = 'Data file' where doctype = 'BIN'; update doi_registration set title = 'Legacy EML file' where doctype like '%eml%2.0.0beta%'; -- update title using node information update doi_registration doi set title = child.nodedata from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'title' and parent.nodename = 'dataset'; -- update pubDate using nodes update doi_registration doi set pub_date = child.nodedata from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'pubDate' and parent.nodename = 'dataset'; -- update publisher using nodes update doi_registration doi set publisher = child.nodedata from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'publisher' and parent.nodename = 'dataset'; -- update organization using nodes update doi_registration doi set creator = child.nodedata from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'organizationName' and parent.nodename = 'creator' and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT'); -- update creator using nodes -- creator/individualName/surName -- creator/individualName/givenName update doi_registration doi set creator = child.nodedata from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child, xml_nodes grandparent where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and parent.parentnodeid = grandparent.nodeid and nodes.nodename = 'surName' and parent.nodename = 'individualName' and grandparent.nodename = 'creator' and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT'); -- add the first name if we have it update doi_registration doi set creator = child.nodedata ||' '|| creator from identifier id, xml_documents docs, xml_nodes nodes, xml_nodes parent, xml_nodes child, xml_nodes grandparent where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and parent.parentnodeid = grandparent.nodeid and nodes.nodename = 'givenName' and parent.nodename = 'individualName' and grandparent.nodename = 'creator' and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT'); -- update title using revisions update doi_registration doi set title = child.nodedata from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'title' and parent.nodename = 'dataset'; -- update pubDate using revisions update doi_registration doi set pub_date = child.nodedata from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'pubDate' and parent.nodename = 'dataset'; -- update publisher using revisions update doi_registration doi set publisher = child.nodedata from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'publisher' and parent.nodename = 'dataset'; -- update organization using nodes update doi_registration doi set creator = child.nodedata from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and nodes.nodename = 'organizationName' and parent.nodename = 'creator' and nodes.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = parent.nodeid and nodetype = 'ELEMENT'); -- update creator using revisions update doi_registration doi set creator = child.nodedata from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child, xml_nodes_revisions grandparent where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and parent.parentnodeid = grandparent.nodeid and nodes.nodename = 'surName' and parent.nodename = 'individualName' and grandparent.nodename = 'creator' and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT'); -- add the first name if we have it update doi_registration doi set creator = child.nodedata ||' '|| creator from identifier id, xml_revisions docs, xml_nodes_revisions nodes, xml_nodes_revisions parent, xml_nodes_revisions child, xml_nodes_revisions grandparent where doi.guid = id.guid and id.docid = docs.docid and id.rev = docs.rev and docs.rootnodeid = nodes.rootnodeid and nodes.nodeid = child.parentnodeid and nodes.parentnodeid = parent.nodeid and parent.parentnodeid = grandparent.nodeid and nodes.nodename = 'givenName' and parent.nodename = 'individualName' and grandparent.nodename = 'creator' and parent.nodeid = (select min(thisnode.nodeid) from xml_nodes_revisions thisnode where thisnode.parentnodeid = grandparent.nodeid and nodetype = 'ELEMENT'); -- use xml_documents for defaults that are still missing update doi_registration doi set creator = doc.user_owner from xml_documents doc, identifier id where doi.guid = id.guid and id.docid = doc.docid and id.rev = doc.rev and doi.creator is null; update doi_registration doi set creator = doc.user_owner from xml_revisions doc, identifier id where doi.guid = id.guid and id.docid = doc.docid and id.rev = doc.rev and doi.creator is null; -- set publisher update doi_registration set publisher = creator where publisher is null or publisher = ''; -- clean up update doi_registration set title = replace(title, E'\n', ' '); update doi_registration set title = regexp_replace(title, E'\\s+', ' '); update doi_registration set title = regexp_replace(title, E'^\\s+', ''); COPY (select ezid_account, guid as doi, url, title, creator, publisher, pub_date from doi_registration order by doi) TO '/tmp/doi_registration.csv' WITH CSV HEADER; --drop table doi_registration;