Project

General

Profile

metacat / src / perl / eml_get_objectnames.pl @ 4722

1
#!/usr/bin/env perl
2
# eml_get_objectnames.pl
3
# Given a list of EML docids and dataids, generate a list of objectName elements
4
# present in the documents mapped to each dataid, and generate UPDATE SQL statements.
5
#
6
# This information is useful because both Morpho and the Metacat Registry historically 
7
# stored the dataid in the 'docname' field within xml_documents.
8
use Metacat;
9
use Data::Dumper;
10
use XML::LibXML;
11
use LWP::UserAgent;
12
use strict;
13

    
14
############################################################################
15
#
16
# MAIN program block
17
#
18
############################################################################
19

    
20
# check that the correct number of parameters are passed from the commandline
21
if (@ARGV < 1) {die "Usage: %./eml_get_objectnames.pl <metacat_url>\n\n";}
# Get the URL to the metacat server from the command line options.
# The assignment must be in list context: a scalar assignment from @ARGV
# would store the number of arguments instead of the URL itself.
my ($url) = @ARGV;

# Open a metacat connection
my $metacat = openMetacatConnection($url);

# File-scoped error state; getEMLDoc() records its diagnostics here.
my @errorMessages;
my $error;

# requires an input CSV file containing two columns:
# the dataid and docid, which can be extracted from the database:
#   SELECT nodedata, docid FROM xml_nodes WHERE nodetype = 'TEXT' AND nodedata LIKE 'ecogrid%'
# and then trimming the ecogrid prefix from the dataid fields.

my $nameFile = 'docid-for-binaries.txt';

# Three-argument open with a lexical handle. The failure message goes to
# STDERR with a non-zero exit status so that it cannot be mistaken for part
# of the SQL written to STDOUT.
my $fileList;
unless (open($fileList, '<', $nameFile)) {
    print STDERR "file with docids required.\n";
    exit 1;
}

LINE: while (my $line = <$fileList>) {
    # Each row is "dataid,docid"; whitespace around the comma is tolerated.
    my ($dataid, $docid) = split(/\s*,\s*/, trim($line));

    # Skip blank rows and rows that lack either column.
    next LINE unless defined $dataid && $dataid ne '';
    next LINE unless defined $docid  && $docid  ne '';

    my $response = $metacat->read($docid);
    # NOTE(review): read() is assumed to return a false / non-object value
    # when the request fails -- confirm against Metacat.pm.
    unless (ref $response) {
        warn "Could not read document $docid; skipping.\n";
        next LINE;
    }
    my $metadata = $response->content();

    # Now parse the metadata document, grabbing the objectName for the
    # particular dataid.  The eval keeps one unparsable document from
    # aborting the whole batch.
    my $objName = eval {
        my $doc = getEMLDoc($metadata);
        getObjectName($doc, $dataid);
    };
    if ($@) {
        warn "Could not extract objectName from document $docid: $@";
        next LINE;
    }

    # Only emit an UPDATE when a usable name was found that differs from
    # the dataid and is not a 'deleteme' placeholder.
    next LINE unless defined $objName && $objName ne "";
    next LINE if $objName eq $dataid || $objName =~ /deleteme/;

    # Strip the trailing revision number from the dataid.
    my $id = $dataid;
    $id =~ s/\.[0-9]+$//;

    # Double any single quotes so the values stay valid SQL string literals.
    (my $sqlName = $objName) =~ s/'/''/g;
    (my $sqlId   = $id)      =~ s/'/''/g;

    print "UPDATE xml_documents SET docname = '$sqlName' WHERE docid = '$sqlId';\n";
}

close($fileList);

exit;
60

    
61
############################################################################
62
#
63
# SUBROUTINES
64
#
65
############################################################################
66

    
67
#
68
# Create a connection to the metacat server
69
#
70
sub openMetacatConnection {
    # Build a Metacat client object and point it at the given server URL.
    #   $url - value stored as the client's 'metacatUrl' option
    # Returns the configured client; dies if the client cannot be created.
    my ($url) = @_;

    my $connection = Metacat->new()
        or die("Could not open connection to Metacat url: $url\n");

    $connection->set_options( metacatUrl => $url );
    return $connection;
}
81

    
82
#
83
# Retrieve EML documents and set up XML parser object
84
#
85
sub getEMLDoc {
    # Parse the text returned by Metacat into an XML::LibXML document.
    #   first argument - the XML text to parse
    # Returns the parsed document object in every case.  Problems are
    # reported through the file-level $error / @errorMessages variables
    # rather than through the return value.
    my ($xmlText) = @_;

    my $doc = XML::LibXML->new()->parse_string($xmlText);

    # The document object is compared and matched as a string below.
    # NOTE(review): this relies on the document stringifying to its XML
    # text -- confirm for the installed XML::LibXML version.  The parser
    # normally raises an exception on malformed input, so this first
    # branch may never be reached.
    if ($doc eq "") {
        $error = "Error in parsing the eml document";
        push(@errorMessages, $error);
        return $doc;
    }

    # An <error element in the text means Metacat refused the read; the
    # word 'public' selects the "not logged in" wording.
    if ($doc =~ /<error/) {
        $error = ($doc =~ /public/)
            ? "Error in reading the eml document. Please check if you are logged in."
            : "Error in reading the eml document. Please check if you have access to read the document";
        push(@errorMessages, $error);
        return $doc;
    }

    # A dataset/identifier element marks an EML beta6 document, which is
    # only noted as deprecated; the document is still returned.
    if ($doc->findnodes('//dataset/identifier')->size() > 0) {
        push(@errorMessages, "EML2 beta6 support deprecated.");
    }

    return $doc;
}
113

    
114
#
115
# Inspect an EML document for objectName elements, return the name if it differs from docid
116
#
117
sub getObjectName {
    # Find the objectName of the physical entity whose online URL points
    # at the given data id.
    #   $doc   - parsed EML document (must support findnodes)
    #   $docid - data id to look for, without the 'ecogrid://knb/' prefix
    # Returns the objectName text, or undef when no entity matches.  When
    # several entities match, the last one in document order wins.
    my $doc = shift;
    my $docid = shift;

    my $objectName;
    return $objectName unless ref $doc && defined $docid;

    # the five types of physical objects, though only dataTable and otherEntity appear to be used
    my @names = qw(dataTable otherEntity spatialRaster spatialVector storedProcedure);

    foreach my $name (@names) {
        # Look up objectName and url inside the same <physical> element.
        # Pairing them by position across two separate node lists goes
        # wrong when the position counter carries over between entity
        # types, or when a physical has no url or more than one.
        foreach my $physical ($doc->findnodes("//$name/physical")) {
            my ($nameNode) = $physical->findnodes('./objectName');
            next unless $nameNode;

            foreach my $urlNode ($physical->findnodes('./distribution/online/url')) {
                my $dataid = $urlNode->textContent();
                $dataid =~ s/ecogrid:\/\/knb\///;
                # if the url's data id is the one requested, take this objectName
                if ($dataid eq $docid) {
                    $objectName = $nameNode->textContent();
                }
            }
        }
    }
    return $objectName;
}
147

    
148
#
149
# Remove whitespace from the start and end of the string
150
#
151
sub trim($)
{
  # Return a copy of the argument with leading and trailing whitespace
  # removed; the caller's value is not modified.
  my ($text) = @_;
  for ($text) {
    s/^\s+//;   # leading whitespace
    s/\s+$//;   # trailing whitespace, including a final newline
  }
  return $text;
}