Project

General

Profile

1 3722 jones
#!/usr/bin/perl
2
3
# This script queries a metacat database to locate EML documents, and for each
4
# document determines if there are references to data objects.  If so, those
5
# references are parsed and a list is generated with metadata about each object,
6
# such as whether the object was directly accessible or not, how many records
7
# were present, and the size of the data files.
8
9
use Metacat;
10
use XML::DOM;
11
use LWP::UserAgent;
12
use Cache::FileCache;
13
use strict;
14
15
############################################################################
16
#
17
# MAIN program block
18
#
19
############################################################################
20
21
# Check that the correct number of parameters was passed on the command line;
# the single expected argument is the URL of the metacat server.
die "Usage: %./cache_eml_data.pl <metacat_url> \n\n" unless scalar(@ARGV) == 1;
my $url = $ARGV[0];

# Initialize the data cache rooted at a fixed directory
my $cacheDir = "/var/metacat/cache";
my $cache = initializeCache($cacheDir);

# Open a metacat connection
my $metacat = openMetacatConnection($url);

# Get a list of EML documents.  The term is matched against surName by
# executeQuery; a narrower alternative that was used previously:
#my $queryTerm = "%Jones%";
my $queryTerm = "%";
my $result = executeQuery($metacat, $queryTerm);

# Extract an array of [docid, entity URL] pairs from the query result
my $listRef = extractEntityUrlList($result);

# Retrieve the entities, save them in the cache, and record metadata
# (cacheEntities prints each record as it goes)
my $entityMetadata = cacheEntities($cache, $listRef);

# Optionally print out the collected results in one pass
#printNestedArray($entityMetadata);

exit(0);
49
50
############################################################################
51
#
52
# SUBROUTINES
53
#
54
############################################################################
55
56
#
57
# Create a connection to the metacat server
58
#
59
sub openMetacatConnection {
    # Build a Metacat client object and point it at the given server.
    #   $url - metacat server URL, stored on the client as the "metacatUrl"
    #          option
    # Returns the configured client object.  Dies when the Metacat
    # constructor yields a false value.
    my ($url) = @_;

    my $metacat = Metacat->new()
        or die("Could not open connection to Metacat url: $url\n");

    $metacat->set_options( metacatUrl => $url );
    return $metacat;
}
70
71
#
72
# Execute a metacat query and return the XML resultset
73
#
74
sub executeQuery {
    # Run a pathquery against metacat and return the XML resultset text.
    #   $metacat   - client object providing squery() and getMessage()
    #   $queryTerm - value matched (searchmode "contains", case-insensitive)
    #                against the surName path; it is interpolated into the
    #                query XML as-is, so callers must pass XML-safe text
    # Returns the message text reported by the client after the query.
    # Dies when that message is undefined or empty, so a failed query ends
    # the script with a non-zero status and the diagnostic on STDERR, in the
    # same way openMetacatConnection reports its failures.
    my $metacat   = shift;
    my $queryTerm = shift;

    # The pieces below concatenate to exactly the single-line query document
    # that was previously written as one long string (spacing included).
    my $query = join('',
        '<?xml version="1.0" ?> ',
        '<pathquery version="1.2">  ',
        '<querytitle>Untitled-Search-2</querytitle>  ',
        '<returndoctype>-//ecoinformatics.org//eml-dataset-2.0.0beta6//EN</returndoctype> ',
        '<returndoctype>-//NCEAS//eml-dataset-2.0//EN</returndoctype>  ',
        '<returndoctype>eml://ecoinformatics.org/eml-2.0.0</returndoctype>  ',
        '<returndoctype>eml://ecoinformatics.org/eml-2.0.1</returndoctype>',
        '<returnfield>dataTable/physical/distribution/online/url</returnfield>',
        '<returnfield>dataTable/entityName</returnfield>',
        '<querygroup operator="UNION">',
        '<queryterm searchmode="contains" casesensitive="false">',
        "<value>$queryTerm</value>",
        '<pathexpr>surName</pathexpr>',
        '</queryterm></querygroup></pathquery>',
    );

    my $code   = $metacat->squery($query);
    my $result = $metacat->getMessage();

    # An empty (or missing) message is treated as a failed or timed-out query.
    if (!defined $result || $result eq "") {
        my $status = defined $code ? $code : 'undef';
        die("Error or timeout from metacat (squery returned: $status)\n");
    }

    return $result;
}
93
94
#
95
# Extract the docid and entity urls for each document in the list
96
#
97
sub extractEntityUrlList {
    # Parse a query resultset and collect the data entity URLs it lists.
    #   $resultset - XML text as returned by executeQuery
    # Returns a reference to an array of [docid, url] pairs, one pair for
    # every "param" element named dataTable/physical/distribution/online/url
    # found under the parent element of each "docid" element.  Both values
    # are whitespace-trimmed.  A docid element with no text yields an empty
    # docid; a matching param with no text is skipped because it carries no
    # URL.  Parse errors raised by XML::DOM propagate to the caller.
    my $resultset = shift;

    my $urlParamName = "dataTable/physical/distribution/online/url";

    my $parser     = XML::DOM::Parser->new;
    my $doc        = $parser->parse($resultset);
    my $docidNodes = $doc->getElementsByTagName("docid");
    my $docidCount = $docidNodes->getLength;
    my @urlList;

    # Loop through each of the documents in the resultset
    for (my $i = 0; $i < $docidCount; $i++) {
        my $docidNode = $docidNodes->item($i);

        # An empty <docid/> has no text child; do not dereference undef.
        my $docidText = $docidNode->getFirstChild();
        my $docid = defined $docidText
            ? trimwhitespace($docidText->getNodeValue())
            : "";

        # The params of interest live under the element enclosing the docid.
        my $documentNode = $docidNode->getParentNode();
        my $paramNodes   = $documentNode->getElementsByTagName("param");
        my $paramCount   = $paramNodes->getLength;

        # Loop through each of the "param" elements for this document
        for (my $j = 0; $j < $paramCount; $j++) {
            my $paramNode = $paramNodes->item($j);

            # getAttribute yields an empty string when the attribute is
            # absent, so a param without a name is simply not a match.
            next unless $paramNode->getAttribute("name") eq $urlParamName;

            my $valueNode = $paramNode->getFirstChild();
            next unless defined $valueNode;

            push(@urlList,
                [$docid, trimwhitespace($valueNode->getNodeValue())]);
        }
    }

    # Only plain strings were copied out of the tree, so it can be released;
    # XML::DOM needs an explicit dispose to break its circular references.
    $doc->dispose;

    return \@urlList;
}
132
133
134
#
135
# Remove whitespace from the start and end of the string
136
#
137
sub trimwhitespace {
    # Remove whitespace from the start and end of a string.
    #   $string - the text to trim (the caller's variable is not modified)
    # Returns the trimmed copy.  An undefined argument is returned as-is
    # without running the substitutions on it.
    # No prototype is declared: callers in this file invoke the sub before
    # its definition, so a prototype would not be applied to those calls.
    my $string = shift;
    return $string unless defined $string;

    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}
144
145
#
146
# Print out a nested array of arrays
147
#
148
sub printNestedArray {
    # Print every inner array of an array-of-arrays, in order, by handing
    # each one to printArray.
    #   $listRef - reference to an array whose elements are array references
    my ($listRef) = @_;

    foreach my $rowIndex (0 .. $#{$listRef}) {
        printArray($listRef->[$rowIndex]);
    }
}
156
157
#
158
# Print an array of scalars of arbitrary length, separating values with commas
159
#
160
sub printArray {
    # Print the scalars of one array on a single line of standard output,
    # separated by commas and terminated by a newline.  An empty array
    # prints nothing at all (not even the newline).
    #   $innerArray - reference to the array of scalars to print
    my ($innerArray) = @_;

    my @values = @{$innerArray};
    print join(",", @values), "\n" if @values;
}
169
170
#
171
# For each entity in the list, try to cache the entity after downloading it
172
# and return information about the size of each entity
173
#
174
sub cacheEntities {
    # For each entity in the list, determine its size, downloading and
    # caching it when necessary, and report the outcome.
    #   $cache   - cache object providing get(), set() and remove()
    #   $listRef - reference to an array of [packageId, entityUrl] pairs
    # Returns a reference to an array of [packageId, entityUrl, size]
    # records; each record is also printed through printArray as it is
    # produced.  The size is the byte length of the entity, or a code:
    #   -1  the entity could not be obtained (download error, or the local
    #       file for an ecogrid URL could not be examined)
    #   -2  the content looks like an HTML page rather than data
    #
    # The outcome is tracked in a single $entitySize variable.  Downloaded
    # content is never compared numerically, so a data file whose text
    # starts with "-1" or another negative number is not mistaken for an
    # error code.
    my $cache   = shift;
    my $listRef = shift;

    my @entityMetadata;

    # Directory holding the files behind ecogrid URLs, and the URL prefix
    # that precedes the file identifier.
    my $dataDir       = '/var/metacat/data/';
    my $ecogridPrefix = "ecogrid://knb/";

    # Create a user agent object for downloading from URLs
    my $ua = LWP::UserAgent->new;
    $ua->agent("Metacat Harvester 1.0 ");
    $ua->timeout(600);

    # Loop through all of the entity URLs
    for my $pair (@{$listRef}) {
        my ($packageId, $entityUrl) = @{$pair};
        my $entitySize;

        if ($entityUrl =~ /^ecogrid:/) {
            # Ecogrid entities are not downloaded; the size comes from the
            # file of the same identifier in the local data directory.
            # NOTE(review): the identifier is taken from document metadata
            # and is not checked for ".." path segments -- confirm whether
            # that needs restricting.
            my $entityId = length($entityUrl) > length($ecogridPrefix)
                ? substr($entityUrl, length($ecogridPrefix))
                : '';
            if ($entityId ne '') {
                # Element 7 of stat's return list is the size in bytes;
                # stat returns an empty list when the file is unavailable.
                my $size = (stat($dataDir . $entityId))[7];
                $entitySize = defined $size ? $size : -1;
            } else {
                # No identifier after the prefix: without this guard the
                # data directory itself would be examined.
                $entitySize = -1;
            }
        } else {
            # For regular URLs, check if it is in the cache already, and
            # use it if it is.  If not, download it and save it to the
            # cache, but only if it is not an HTML file (the test for
            # <html> is simplistic).
            my $entity = $cache->get( $entityUrl );
            if ( defined $entity ) {
                if ($entity =~ /<html>/) {
                    # Cached HTML is evicted so it is not reused.
                    $entitySize = -2;
                    $cache->remove( $entityUrl );
                } else {
                    $entitySize = length($entity);
                }
            } else {
                $entity = downloadEntity($ua, $entityUrl);
                if ($entity eq '-1') {
                    # downloadEntity signals failure with -1; compare as a
                    # string so content is not coerced to a number.
                    $entitySize = -1;
                } elsif ($entity =~ /<html>/) {
                    $entitySize = -2;
                } else {
                    # write the data to cache, using URL as key
                    $cache->set( $entityUrl, $entity, "never" );
                    $entitySize = length($entity);
                }
            }
        }

        # Record metadata about this entity
        my $info = [$packageId, $entityUrl, $entitySize];
        printArray($info);
        push(@entityMetadata, $info);
    }
    return \@entityMetadata;
}
239
240
#
241
# Download a single entity from a given URL and return it, or return -1 on error
242
#
243
sub downloadEntity {
    # Fetch a single entity with an HTTP GET request.
    #   $ua  - user agent object whose request() method performs the fetch
    #   $url - address of the entity
    # Returns the response content when the response reports success,
    # otherwise -1.
    my ($ua, $url) = @_;

    # Build the GET request and hand it straight to the user agent
    my $res = $ua->request(HTTP::Request->new(GET => $url));

    # Any unsuccessful response is reported as -1; the status line is
    # deliberately not printed.
    return $res->is_success ? $res->content : -1;
}
261
262
#
263
# Create a new cache to be used for storing downloaded entities
264
#
265
sub initializeCache {
    # Create the file cache used for storing downloaded entities.
    #   $cacheDir - directory to set as the root of the cache
    # Returns the Cache::FileCache object, constructed with default options
    # and then re-rooted at $cacheDir.
    my ($cacheDir) = @_;

    my $cache = Cache::FileCache->new();
    $cache->set_cache_root($cacheDir);

    return $cache;
}