1 |
4722
|
walbridge
|
#!/usr/bin/env perl
|
2 |
|
|
# eml_get_objectnames.pl
|
3 |
|
|
# Given a list of EML docids and dataids, generate a list of objectName elements
|
4 |
|
|
# present in the documents mapped to each dataid, and generate UPDATE SQL statements.
|
5 |
|
|
#
|
6 |
|
|
# This information is useful because both Morpho and the Metacat Registry historically
|
7 |
|
|
# stored the dataid in the 'docname' field within xml_documents.
|
8 |
|
|
use Metacat;
|
9 |
|
|
use Data::Dumper;
|
10 |
|
|
use XML::LibXML;
|
11 |
|
|
use LWP::UserAgent;
|
12 |
|
|
use strict;
|
13 |
|
|
|
14 |
|
|
############################################################################
|
15 |
|
|
#
|
16 |
|
|
# MAIN program block
|
17 |
|
|
#
|
18 |
|
|
############################################################################
|
19 |
|
|
|
20 |
|
|
# Check that the correct number of parameters was passed on the command line.
if (@ARGV < 1) {
    die "Usage: %./eml_get_objectnames.pl <metacat_url>\n\n";
}

# Get the URL of the metacat server from the command line options.
# The list assignment matters: "my $url = @ARGV" evaluates @ARGV in scalar
# context and would store the argument count instead of the URL.
my ($url) = @ARGV;

# Open a metacat connection
my $metacat = openMetacatConnection($url);

# File-scoped error state; getEMLDoc() appends to @errorMessages.
my @errorMessages;
my $error;

# Requires an input CSV file containing two columns:
# the dataid and docid, which can be extracted from the database:
#   SELECT nodedata, docid FROM xml_nodes WHERE nodetype = 'TEXT' AND nodedata LIKE 'ecogrid%'
# and then trimming the ecogrid prefix from the dataid fields.
my $nameFile = 'docid-for-binaries.txt';

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode, and the handle is not a package global.
my $fileList;
unless (open($fileList, '<', $nameFile)) {
    print "file with docids required.\n";
    exit;
}

while (my $line = <$fileList>) {
    # Tolerate optional whitespace around the separating comma.
    my ($dataid, $docid) = split(/\s*,\s*/, trim($line));

    # Skip blank or malformed lines instead of querying Metacat with an
    # undefined or empty docid.
    next unless defined $dataid && length $dataid
             && defined $docid  && length $docid;

    # NOTE(review): assumes read() yields a false value when the request
    # fails -- confirm against the Metacat client module.
    my $response = $metacat->read($docid);
    unless ($response) {
        warn "No response from Metacat for docid '$docid'; skipping.\n";
        next;
    }
    my $metadata = $response->content();

    # Now parse the metadata document, grabbing the objectName for the
    # particular dataid. The eval keeps a single unparsable document from
    # aborting the whole batch.
    my $errorsBefore = scalar @errorMessages;
    my $objName;
    my $ok = eval {
        my $doc = getEMLDoc($metadata);
        $objName = getObjectName($doc, $dataid);
        1;
    };
    my $failure = $@;

    # Report any errors recorded while handling this document; they go to
    # STDERR so the generated SQL on STDOUT stays clean.
    for my $message (@errorMessages[$errorsBefore .. $#errorMessages]) {
        warn "$docid: $message\n";
    }
    unless ($ok) {
        warn "Skipping docid '$docid': $failure";
        next;
    }

    # Only emit an UPDATE when a real, differing object name was found.
    next unless defined $objName && $objName ne "";
    next if $objName eq $dataid || $objName =~ /deleteme/;

    # Strip the trailing numeric component (".<digits>") from the dataid.
    (my $id = $dataid) =~ s/\.[0-9]+$//;

    # The values come from document content, so double any single quote to
    # keep the generated SQL string literals well formed.
    my ($sqlName, $sqlId) = ($objName, $id);
    s/'/''/g for ($sqlName, $sqlId);

    print "UPDATE xml_documents SET docname = '$sqlName' WHERE docid = '$sqlId';\n";
}

close($fileList);

exit;
|
60 |
|
|
|
61 |
|
|
############################################################################
|
62 |
|
|
#
|
63 |
|
|
# SUBROUTINES
|
64 |
|
|
#
|
65 |
|
|
############################################################################
|
66 |
|
|
|
67 |
|
|
#
|
68 |
|
|
# Create a connection to the metacat server
|
69 |
|
|
#
|
70 |
|
|
sub openMetacatConnection {
    # Takes the Metacat server URL as its only argument, constructs a
    # Metacat client, points it at that URL via set_options(), and returns
    # the client. Dies if the client object could not be constructed.
    my ($url) = @_;

    my $metacat = Metacat->new()
        or die("Could not open connection to Metacat url: $url\n");

    $metacat->set_options( metacatUrl => $url );

    return $metacat;
}
|
81 |
|
|
|
82 |
|
|
#
|
83 |
|
|
# Retrieve EML documents and set up XML parser object
|
84 |
|
|
#
|
85 |
|
|
sub getEMLDoc {
    # Parse the XML text returned by Metacat and return the resulting
    # XML::LibXML document object.
    #
    # Argument: the raw response body (a string of XML).
    # Returns:  the parsed document. If the text cannot be parsed, an empty
    #           document is returned so that callers can still run XPath
    #           queries on it (they simply match nothing).
    # Side effects: appends a message to the file-level @errorMessages (and
    #           sets $error) when parsing fails, when the response looks
    #           like a Metacat <error> reply, or when the document looks
    #           like EML beta6.
    my $resultset = shift;

    my $parser = XML::LibXML->new();

    # parse_string() throws on malformed or empty input rather than
    # returning an empty value, so the failure has to be caught here for
    # the parse error to be recorded at all.
    my $doc = eval { $parser->parse_string($resultset) };
    unless (defined $doc) {
        my $reason = $@ || "unknown parser failure\n";
        $error = "Error in parsing the eml document";
        push(@errorMessages, $error);
        warn "$error: $reason";
        return XML::LibXML::Document->new();
    }

    # Serialize explicitly instead of relying on the document object being
    # implicitly stringified inside a pattern match.
    my $xml = $doc->toString();

    if ($xml =~ /<error/) {
        if ($xml =~ /public/) {
            $error ="Error in reading the eml document. Please check if you are logged in.";
            push(@errorMessages, $error);
        } else {
            $error ="Error in reading the eml document. Please check if you have access to read the document";
            push(@errorMessages, $error);
        }
    } else {
        my $findType = $doc->findnodes('//dataset/identifier');
        if ($findType->size() > 0) {
            # This is an eml beta6 document, which is no longer handled.
            push(@errorMessages, "EML2 beta6 support deprecated.");
        }
    }

    return $doc;
}
|
113 |
|
|
|
114 |
|
|
#
|
115 |
|
|
# Inspect an EML document for objectName elements, return the name if it differs from docid
|
116 |
|
|
#
|
117 |
|
|
sub getObjectName {
    # Look up the objectName of the physical entity whose online URL refers
    # to the given data identifier.
    #
    # Arguments: the parsed EML document, and the identifier to look for
    #            (the main loop passes the dataid here) as it appears in the
    #            URL once the "ecogrid://knb/" prefix is removed.
    # Returns:   the text of the matching objectName element, or undef when
    #            nothing matches. If several entities match, the last one
    #            in document order (within the entity-type order below)
    #            wins.
    my $doc = shift;
    my $docid = shift;

    # the five types of physical objects, though only dataTable and otherEntity appear to be used
    my @names = qw(dataTable otherEntity spatialRaster spatialVector storedProcedure);
    my $objectName;

    # Nothing to search without a document object.
    return $objectName unless ref $doc;

    foreach my $name (@names) {
        # Walk each <physical> element and resolve its objectName and URLs
        # relative to it. Pairing two independent node lists by a running
        # position breaks when the counter carries over from one entity
        # type to the next, or when a physical has zero or several URLs.
        foreach my $physical ($doc->findnodes("//$name/physical")) {
            my ($nameNode) = $physical->findnodes('objectName');
            next unless $nameNode;

            foreach my $urlNode ($physical->findnodes('distribution/online/url')) {
                my $dataid = $urlNode->textContent();
                # Ignore whitespace surrounding the URL text.
                $dataid =~ s/^\s+|\s+$//g;
                $dataid =~ s/ecogrid:\/\/knb\///;

                # if the URL's identifier is the one requested, use this objectName
                if ($dataid eq $docid) {
                    $objectName = $nameNode->textContent();
                }
            }
        }
    }

    return $objectName;
}
|
147 |
|
|
|
148 |
|
|
#
|
149 |
|
|
# Remove whitespace from the start and end of the string
|
150 |
|
|
#
|
151 |
|
|
sub trim($)
{
    # Return a copy of the argument with leading and trailing whitespace
    # stripped; the caller's value is left untouched.
    my ($text) = @_;

    # Alias the copy to $_ so both substitutions apply to it in turn.
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}
|