1
|
#!/usr/bin/env perl
|
2
|
# eml_get_objectnames.pl
|
3
|
# Given a list of EML docids and dataids, generate a list of objectName elements
|
4
|
# present in the documents mapped to each dataid, and generate UPDATE SQL statements.
|
5
|
#
|
6
|
# This information is useful because both Morpho and the Metacat Registry historically
|
7
|
# stored the dataid in the 'docname' field within xml_documents.
|
8
|
use Metacat;
|
9
|
use Data::Dumper;
|
10
|
use XML::LibXML;
|
11
|
use LWP::UserAgent;
|
12
|
use strict;
|
13
|
|
14
|
############################################################################
|
15
|
#
|
16
|
# MAIN program block
|
17
|
#
|
18
|
############################################################################
|
19
|
|
20
|
# Check that the Metacat URL was passed on the command line.
if (@ARGV < 1) {
    die "Usage: %./eml_get_objectnames.pl <metacat_url>\n\n";
}

# Get the URL to the metacat server from the command line options.
# A list assignment is required: `my $url = @ARGV` would evaluate the array
# in scalar context and store the argument count instead of the URL.
my ($url) = @ARGV;

# Open a metacat connection
my $metacat = openMetacatConnection($url);

# File-level error state shared with getEMLDoc(), which appends to it.
my @errorMessages;
my $error;

# Requires an input CSV file containing two columns:
# the dataid and docid, which can be extracted from the database:
# SELECT nodedata, docid FROM xml_nodes WHERE nodetype = 'TEXT' AND nodedata LIKE 'ecogrid%'
# and then trimming the ecogrid prefix from the dataid fields.
my $nameFile = 'docid-for-binaries.txt';

# STDOUT carries the generated SQL, so a missing input file is reported on
# STDERR (with the OS reason) and ends the run with a non-zero status.
open(my $fileList, '<', $nameFile)
    or die "file with docids required ($nameFile): $!\n";

while (my $line = <$fileList>) {
    # Columns are: dataid, docid. Whitespace around the comma is dropped.
    my ($dataid, $docid) = split(/\s*,\s*/, trim($line));

    # Skip blank or malformed rows rather than asking Metacat for an
    # undefined docid.
    next unless defined $dataid && length $dataid;
    next unless defined $docid  && length $docid;

    # NOTE(review): assumes read() always returns an object with a
    # content() method, even on failure -- confirm against the Metacat client.
    my $response = $metacat->read($docid);
    my $metadata = $response->content();

    # Parse the metadata document; surface any messages getEMLDoc() recorded
    # for this document on STDERR so they are not silently lost.
    my $errorsBefore = scalar @errorMessages;
    my $doc = getEMLDoc($metadata);
    foreach my $message (@errorMessages[$errorsBefore .. $#errorMessages]) {
        print STDERR "$docid: $message\n";
    }

    # Grab the objectName recorded for this particular dataid.
    my $objName = getObjectName($doc, $dataid);
    next unless defined $objName && $objName ne "";
    next if $objName eq $dataid;
    next if $objName =~ /deleteme/;

    # The docid column holds the identifier without its trailing revision.
    my $id = $dataid;
    $id =~ s/\.[0-9]+$//;

    # Double any single quotes so the values stay valid SQL string literals.
    (my $sqlName = $objName) =~ s/'/''/g;
    (my $sqlId   = $id)      =~ s/'/''/g;

    print "UPDATE xml_documents SET docname = '$sqlName' WHERE docid = '$sqlId';\n";
}

close($fileList);

exit;
|
60
|
|
61
|
############################################################################
|
62
|
#
|
63
|
# SUBROUTINES
|
64
|
#
|
65
|
############################################################################
|
66
|
|
67
|
#
|
68
|
# Build a Metacat client object configured for the given server.
#   $url - passed to the client as its metacatUrl option
# Returns the configured client; dies when Metacat->new() yields a false value.
#
sub openMetacatConnection {
    my ($url) = @_;

    my $client = Metacat->new()
        or die("Could not open connection to Metacat url: $url\n");

    $client->set_options( metacatUrl => $url );
    return $client;
}
|
81
|
|
82
|
#
|
83
|
# Parse the text of an EML document and record any problems noticed in it.
#   $resultset - document text to hand to the XML parser
# Returns the parsed document in every branch below. Problems found by these
# checks are appended to the file-level @errorMessages (the first two kinds
# also overwrite the file-level $error).
# NOTE(review): parse_string() itself may throw on malformed input -- confirm.
#
sub getEMLDoc {
    my ($resultset) = @_;

    my $doc = XML::LibXML->new()->parse_string($resultset);

    # Nothing usable came back from the parser.
    if ($doc eq "") {
        $error ="Error in parsing the eml document";
        push(@errorMessages, $error);
        return $doc;
    }

    # The document text contains an <error element; pick the message based
    # on whether the text also mentions "public".
    if ($doc =~ /<error/) {
        if ($doc =~ /public/) {
            $error ="Error in reading the eml document. Please check if you are logged in.";
        }
        else {
            $error ="Error in reading the eml document. Please check if you have access to read the document";
        }
        push(@errorMessages, $error);
        return $doc;
    }

    # A //dataset/identifier node marks an eml beta6 document, which is
    # flagged as deprecated (this case does not touch $error).
    my $identifiers = $doc->findnodes('//dataset/identifier');
    push(@errorMessages, "EML2 beta6 support deprecated.")
        if $identifiers->size() > 0;

    return $doc;
}
|
113
|
|
114
|
#
|
115
|
# Inspect an EML document for the objectName of the entity whose online
# distribution URL refers to the given data id.
#   $doc   - parsed EML document (anything providing findnodes())
#   $docid - data id to look for, without the ecogrid://knb/ prefix
# Returns the objectName text of the matching entity, or undef when no
# entity's URL matches. If several entities match, the last one wins.
#
sub getObjectName {
    my $doc = shift;
    my $docid = shift;

    # the five types of physical objects, though only dataTable and otherEntity appear to be used
    my @names = qw(dataTable otherEntity spatialRaster spatialVector storedProcedure);
    my $objectName;

    foreach my $name (@names) {
        # Resolve objectName and url relative to each <physical> element.
        # Pairing two document-wide node lists by a running position breaks
        # as soon as a second entity type is present or an entity lacks one
        # of the two elements.
        foreach my $physical ($doc->findnodes("//$name/physical")) {
            my ($nameNode) = $physical->findnodes('objectName');
            next unless $nameNode;

            foreach my $urlNode ($physical->findnodes('distribution/online/url')) {
                my $dataid = $urlNode->textContent();
                # Ignore whitespace around the URL text, then drop the
                # ecogrid prefix to obtain the bare data id.
                $dataid =~ s/^\s+//;
                $dataid =~ s/\s+$//;
                $dataid =~ s/ecogrid:\/\/knb\///;

                # This entity describes the requested data id.
                if ($dataid eq $docid) {
                    $objectName = $nameNode->textContent();
                }
            }
        }
    }
    return $objectName;
}
|
147
|
|
148
|
#
|
149
|
# Remove whitespace from the start and end of the string.
#   $string - text to trim; an undefined value is passed through unchanged
# Returns the trimmed copy. The caller's variable is not modified.
#
sub trim {
    my ($string) = @_;

    # Nothing to strip; avoid running substitutions on an undefined value.
    return $string unless defined $string;

    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}
|