Project

General

Profile

1 3722 jones
#!/usr/bin/perl
2
# This script queries a metacat database to locate EML documents, and for each
# document determines if there are references to data objects.  If so, those
# references are parsed and a list is generated with metadata about each object,
# such as whether the object was directly accessible or not, how many records
# were present, and the size of the data files.
8
9
use Metacat;
10
use XML::DOM;
11
use LWP::UserAgent;
12
use Cache::FileCache;
13
use strict;
14
15
############################################################################
#
# MAIN program block
#
# Takes exactly one command-line argument, the URL of the metacat server.
# Queries that server for EML documents, collects the entity URLs they
# reference, and retrieves/caches each entity while recording metadata
# about it.
#
############################################################################

# Check that the correct number of parameters was passed on the command line
if (@ARGV != 1) {
    die "Usage: %./cache_eml_data.pl <metacat_url> \n\n";
}

# Get the URL to the metacat server from the command line options
my ($url) = @ARGV;

# Initialize the data cache
my $cacheDir = "/var/metacat/cache";
my $cache = initializeCache($cacheDir);

# Open a metacat connection
my $metacat = openMetacatConnection($url);

# Get a list of EML documents ("%" is passed through as the search value)
#my $queryTerm = "%Jones%";
my $queryTerm = "%";
my $result = executeQuery($metacat, $queryTerm);

# Extract an array of all of the entity URLs for each EML document
my $listRef = extractEntityUrlList($result);

# Retrieve the entities, save them in the cache, and record metadata.
# cacheEntities() prints each record as it is produced.
my $entityMetadata = cacheEntities($cache, $listRef);

# Print out the results
#printNestedArray($entityMetadata);

exit(0);
48
49
############################################################################
#
# SUBROUTINES
#
############################################################################
54
55
#
# Create a connection to the metacat server
#
# Parameters:
#   $url - URL of the metacat server, stored on the client object as the
#          "metacatUrl" option
#
# Returns the configured Metacat client object.
# Dies if Metacat->new() returns a false value.
#
sub openMetacatConnection {
    my $url = shift;

    my $metacat = Metacat->new();
    if (!$metacat) {
        die("Could not open connection to Metacat url: $url\n");
    }

    $metacat->set_options( metacatUrl => $url );

    return $metacat;
}
69
70
#
# Execute a metacat query and return the XML resultset
#
# Parameters:
#   $metacat   - client object providing squery($query) and getMessage()
#   $queryTerm - value searched for in "surName" (searchmode "contains",
#                case-insensitive); it is inserted into the query XML as-is,
#                so callers must not pass XML metacharacters unescaped
#
# Returns the message string obtained from $metacat->getMessage() after
# the query has been submitted.
#
# Dies (non-zero exit status, message on STDERR) when the message is
# undefined or empty, which is treated as an error or timeout.
#
sub executeQuery {
    my $metacat = shift;
    my $queryTerm = shift;

    # Pathquery restricted to the EML dataset doctypes; asks for the
    # distribution URL and the entity name of each data table.
    my $query = "<?xml version=\"1.0\" ?> <pathquery version=\"1.2\">  <querytitle>Untitled-Search-2</querytitle>  <returndoctype>-//ecoinformatics.org//eml-dataset-2.0.0beta6//EN</returndoctype> <returndoctype>-//NCEAS//eml-dataset-2.0//EN</returndoctype>  <returndoctype>eml://ecoinformatics.org/eml-2.0.0</returndoctype>  <returndoctype>eml://ecoinformatics.org/eml-2.0.1</returndoctype><returnfield>dataTable/physical/distribution/online/url</returnfield><returnfield>dataTable/entityName</returnfield><querygroup operator=\"UNION\"><queryterm searchmode=\"contains\" casesensitive=\"false\"><value>$queryTerm</value><pathexpr>surName</pathexpr></queryterm></querygroup></pathquery>";

    my $code = $metacat->squery($query);

    my $result = $metacat->getMessage();

    if (!defined $result || $result eq "") {
        # Report the failure on STDERR and terminate with a failing status;
        # a bare exit() would have signalled success to the calling shell.
        my $codeText = defined $code ? $code : "undef";
        die "Error or timeout from metacat... (squery returned: $codeText)\n";
    }

    return $result;
}
92
93
#
# Extract the docid and entity urls for each document in the list
#
# Parameters:
#   $resultset - XML string containing "docid" elements; the "param"
#                elements are looked up under each docid's parent element
#
# Returns a reference to an array of [docid, url] pairs, one pair for every
# "param" element whose "name" attribute is
# "dataTable/physical/distribution/online/url" and whose content is
# non-empty.  Params without a name attribute or without content are
# skipped rather than aborting the whole run.
#
sub extractEntityUrlList {
    my $resultset = shift;

    my $parser = XML::DOM::Parser->new();
    my $doc = $parser->parse($resultset);
    my $docidNodes = $doc->getElementsByTagName("docid");
    my $numberDocids = $docidNodes->getLength;
    my @urlList;

    # Loop through each of the documents in the resultset
    for (my $i = 0; $i < $numberDocids; $i++) {
        my $docidNode = $docidNodes->item($i);
        my $docid = _firstChildText($docidNode);

        # The params are searched for beneath the docid's parent element
        my $documentNode = $docidNode->getParentNode();
        my $paramNodes = $documentNode->getElementsByTagName("param");
        my $numberParams = $paramNodes->getLength;

        # Loop through each of the "param" elements for this document
        for (my $j = 0; $j < $numberParams; $j++) {
            my $paramNode = $paramNodes->item($j);

            # getAttribute yields "" when the attribute is absent, so a
            # param without a name simply fails the comparison below.
            my $paramName = $paramNode->getAttribute("name");
            next unless $paramName eq "dataTable/physical/distribution/online/url";

            my $disturl = _firstChildText($paramNode);
            next if $disturl eq "";

            push(@urlList, [$docid, $disturl]);
        }
    }

    # Only plain strings were copied out, so the DOM tree can be released.
    $doc->dispose();

    return \@urlList;
}

#
# Return the trimmed value of an element's first child node, or "" when
# the element has no children or the child carries no value.
#
sub _firstChildText {
    my $element = shift;

    my $child = $element->getFirstChild();
    return "" unless defined $child;

    my $value = $child->getNodeValue();
    return "" unless defined $value;

    return trimwhitespace($value);
}
131
132
133
#
# Remove whitespace from the start and end of the string
#
# Parameters:
#   $string - the text to trim
#
# Returns a trimmed copy of the string; interior whitespace is left
# untouched.  An undefined argument is returned unchanged (undef).
#
sub trimwhitespace($)
{
    my $string = shift;

    # Nothing to trim; also avoids substituting on an undefined value
    return $string unless defined $string;

    $string =~ s/\A\s+//;
    $string =~ s/\s+\z//;
    return $string;
}
143
144
#
# Print out a nested array of arrays
#
# Parameters:
#   $listRef - reference to an array whose elements are array references
#
# Each inner array is handed to printArray() in order.
#
sub printNestedArray {
    my $listRef = shift;

    foreach my $innerArray (@{$listRef}) {
        printArray($innerArray);
    }

    return;
}
155
156
#
# Print an array of scalars of arbitrary length, separating values with commas
#
# Parameters:
#   $innerArray - reference to the array of values to print
#
# Writes the values on one line to the currently selected output handle,
# separated by "," and terminated by a newline.  An empty array produces
# no output at all (not even the newline).
#
sub printArray {
    my $innerArray = shift;

    return unless @{$innerArray};

    # Undefined elements are printed as empty fields
    my @fields = map { defined $_ ? $_ : "" } @{$innerArray};
    print join(",", @fields), "\n";

    return;
}
168
169
#
# For each entity in the list, try to cache the entity after downloading it
# and return information about the size of each entity
#
# Parameters:
#   $cache   - cache object providing get($key) and set($key, $data, $expiry)
#   $listRef - reference to an array of [packageId, entityUrl] pairs
#
# Returns a reference to an array of [packageId, entityUrl, result] records,
# where result is:
#   the length of the entity content, when it was found in the cache or
#       downloaded successfully,
#   -1  when the download failed,
#   -2  for "ecogrid:" URIs, which are not retrieved.
# Each record is also printed with printArray() as soon as it is produced.
#
sub cacheEntities {
    my $cache = shift;
    my $listRef = shift;

    my @entityMetadata;

    # Create a user agent object for downloading from URLs
    my $ua = LWP::UserAgent->new;
    $ua->agent("Metacat Harvester 1.0 ");

    # Loop through all of the entity URLs
    foreach my $pair (@{$listRef}) {
        my ($packageId, $entityUrl) = @{$pair};

        # Either a negative status code or the length of the entity.  The
        # status is tracked separately from the content so that the content
        # itself is never interpreted as a number.
        my $result;

        if ($entityUrl =~ /^ecogrid:/) {
            #print "Need to process Ecogrid uri: ", $entityUrl, "\n";
            $result = -2;
        } else {
            # For regular URLs, check if its in the cache already, and use
            # it if it is.  If not, download it and save it to the cache
            my $entity = $cache->get( $entityUrl );
            if ( defined $entity ) {
                $result = length($entity);
            } else {
                $entity = downloadEntity($ua, $entityUrl);

                # downloadEntity signals failure by returning -1; compare as
                # a string so content that merely starts with "-1" (for
                # example a data row "-1.5,3") is not mistaken for an error.
                if ($entity eq "-1") {
                    $result = -1;
                    #print("Error on download for $entityUrl\n");
                } else {
                    # write the data to cache, using URL as key
                    $cache->set( $entityUrl, $entity, "never" );
                    $result = length($entity);
                }
            }
        }

        # Record metadata about this entity
        my $info = [$packageId, $entityUrl, $result];
        printArray($info);
        push(@entityMetadata, $info);
    }

    return \@entityMetadata;
}
220
221
#
# Download a single entity from a given URL and return it, or return -1 on error
#
# Parameters:
#   $ua  - user agent object used to issue the GET request
#   $url - URL of the entity to retrieve
#
# Returns the response content when the request succeeded, otherwise -1.
# On failure the URL and the response status line are written to STDERR so
# that they do not mix with data written to STDOUT.
#
sub downloadEntity {
    my $ua = shift;
    my $url = shift;

    # Issue a GET request and get a response back
    my $res = $ua->get($url);

    # Check the outcome of the response
    if ($res->is_success) {
        return $res->content;
    }

    print STDERR "Download failed for $url: ", $res->status_line, "\n";
    return -1;
}
242
243
#
# Create a new cache to be used for storing downloaded entities
#
# Parameters:
#   $cacheDir - directory passed to the cache as its cache root
#
# Returns the Cache::FileCache object.
#
sub initializeCache {
    my $cacheDir = shift;

    # Direct class-method call; the indirect "new Class" form is ambiguous
    # to the parser and discouraged.
    my $cache = Cache::FileCache->new();
    $cache->set_cache_root($cacheDir);

    return $cache;
}