Project

General

Profile

« Previous | Next » 

Revision 3722

Initial script for downloading and caching data entities associated
with an EML document that is stored in Metacat.

View differences:

src/perl/cache_eml_data.pl
1
#!/usr/bin/perl
2

  
3
# This script queries a metacat database to locate EML documents, and for each 
4
# document determines if there are references to data objects.  If so, those
5
# references are parsed and a list is generated with metadata about each object,
6
# such as whether the object was directly accessible or not, how many records 
7
# were present, and the size of the data files.
8

  
9
use Metacat;
10
use XML::DOM;
11
use LWP::UserAgent;
12
use Cache::FileCache;
13
use strict;
14

  
15
############################################################################
16
#
17
# MAIN program block
18
#
19
############################################################################
20

  
21
# The script takes exactly one command-line argument: the URL of the
# metacat server to query.
die "Usage: %./cache_eml_data.pl <metacat_url> \n\n" unless scalar(@ARGV) == 1;
my $url = $ARGV[0];

# Set up the file cache that holds the downloaded entities
my $cacheDir = "/tmp/metacat/cache";
my $cache = initializeCache($cacheDir);

# Connect to metacat and fetch the result set describing the EML documents
my $metacat = openMetacatConnection($url);
my $result = executeQuery($metacat);

# Collect a [docid, url] pair for every entity URL found in the result set
my $listRef = extractEntityUrlList($result);

# Fetch each entity (or reuse its cached copy) and record metadata about it;
# cacheEntities prints one record per entity as it goes
my $entityMetadata = cacheEntities($cache, $listRef);

# Uncomment to print the collected metadata again after processing
#printNestedArray($entityMetadata);

exit(0);
46

  
47
############################################################################
48
#
49
# SUBROUTINES
50
#
51
############################################################################
52

  
53
#
54
# Create a connection to the metacat server
55
#
56
sub openMetacatConnection {
    # Build a Metacat client object and configure it with the server URL.
    #
    # Arguments:
    #   $url - URL of the metacat server; passed to set_options() as the
    #          "metacatUrl" option
    # Returns the configured Metacat object.
    # Dies if Metacat->new() returns a false value.
    my ($url) = @_;

    my $metacat = Metacat->new();
    die("Could not open connection to Metacat url: $url\n") unless $metacat;

    $metacat->set_options( metacatUrl => $url );
    return $metacat;
}
67

  
68
#
69
# Execute a metacat query and return the XML resultset
70
#
71
sub executeQuery {
    # Send a pathquery to metacat and return the text of the response.
    #
    # Arguments:
    #   $metacat - client object providing squery() and getMessage()
    #   $query   - optional pathquery XML string; when omitted, the built-in
    #              query below is used
    # Returns the message reported by $metacat->getMessage() after the query.
    # Dies when that message is undefined or empty, so that the script ends
    # with a non-zero exit status and the diagnostics go to STDERR.
    my ($metacat, $query) = @_;

    # Built-in query: asks for the distribution URL and entity name of the
    # data tables in EML documents whose surName contains "Jones".
    if (!defined $query) {
        $query = "<?xml version=\"1.0\" ?> <pathquery version=\"1.2\">  <querytitle>Untitled-Search-2</querytitle>  <returndoctype>-//ecoinformatics.org//eml-dataset-2.0.0beta6//EN</returndoctype> <returndoctype>-//NCEAS//eml-dataset-2.0//EN</returndoctype>  <returndoctype>eml://ecoinformatics.org/eml-2.0.0</returndoctype>  <returndoctype>eml://ecoinformatics.org/eml-2.0.1</returndoctype><returnfield>dataTable/physical/distribution/online/url</returnfield><returnfield>dataTable/entityName</returnfield><querygroup operator=\"UNION\"><queryterm searchmode=\"contains\" casesensitive=\"false\"><value>%Jones%</value><pathexpr>surName</pathexpr></queryterm></querygroup></pathquery>";
    }

    # The return value of squery() is only used for the error report; the
    # response text itself is read back through getMessage().
    my $code = $metacat->squery($query);
    my $result = $metacat->getMessage();

    if (!defined $result || $result eq "") {
        my $codeText = defined $code ? $code : "(undefined)";
        die("Error or timeout from metacat... (squery returned: $codeText)\n");
    }

    return $result;
}
89

  
90
#
91
# Extract the docid and entity urls for each document in the list
92
#
93
sub extractEntityUrlList {
    # Parse a metacat result set and collect the entity URLs it contains.
    #
    # Arguments:
    #   $resultset - XML text of the result set
    # Returns a reference to an array of [docid, url] pairs, one pair for
    # every "param" element named
    # "dataTable/physical/distribution/online/url" found under the parent
    # element of a "docid" element.
    #
    # Documents with an empty "docid" element and "param" elements without
    # any content are skipped instead of aborting the whole run.
    my $resultset = shift;

    my $parser = XML::DOM::Parser->new();
    my $doc = $parser->parse($resultset);
    my $nodes = $doc->getElementsByTagName("docid");
    my $numberNodes = $nodes->getLength;
    my @urlList;

    # Loop through each of the documents in the resultset
    for (my $i = 0; $i < $numberNodes; $i++) {
        my $docidNode = $nodes->item($i);

        # An empty <docid/> has no child node to read the identifier from
        my $docidText = $docidNode->getFirstChild();
        next unless defined $docidText;
        my $docid = trimwhitespace($docidText->getNodeValue());

        # The "param" elements are looked up under the docid's parent
        my $documentNode = $docidNode->getParentNode();
        my $params = $documentNode->getElementsByTagName("param");
        my $numberParams = $params->getLength;

        # Loop through each of the "param" elements for this document
        for (my $j = 0; $j < $numberParams; $j++) {
            my $param = $params->item($j);

            # getAttribute() yields an empty string when the attribute is
            # missing, so a "param" without a name is simply not a match
            my $paramname = $param->getAttribute("name");
            next unless $paramname eq "dataTable/physical/distribution/online/url";

            my $urlText = $param->getFirstChild();
            next unless defined $urlText;

            push(@urlList, [$docid, trimwhitespace($urlText->getNodeValue())]);
        }
    }

    # Only plain strings are returned, so the DOM tree can be released;
    # XML::DOM documents must be disposed explicitly to free their memory
    $doc->dispose();

    return \@urlList;
}
128

  
129

  
130
#
131
# Remove whitespace from the start and end of the string
132
#
133
sub trimwhitespace {
    # Remove whitespace from the start and end of a string.
    #
    # Arguments:
    #   $string - the text to trim; may be undefined
    # Returns the trimmed copy; the caller's variable is not modified.
    # An undefined argument is returned unchanged.
    #
    # No prototype is declared: this sub is defined after the code that
    # calls it, so a prototype would not be applied to those calls anyway.
    my $string = shift;
    return $string unless defined $string;

    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}
140

  
141
#
142
# Print out a nested array of arrays
143
#
144
sub printNestedArray {
    # Print every inner array of an array of arrays.
    #
    # Arguments:
    #   $listRef - reference to an array whose elements are array references
    # Each inner array is handed to printArray() in order.
    my ($listRef) = @_;

    foreach my $index (0 .. $#{$listRef}) {
        printArray($listRef->[$index]);
    }
}
152

  
153
#
154
# Print an array of scalars of arbitrary length, separating values with commas
155
#
156
sub printArray {
    # Print the elements of an array on one line of standard output.
    #
    # Arguments:
    #   $innerArray - reference to an array of scalars of any length
    # The values are separated by commas and the line ends with a newline.
    # An empty array produces no output at all, not even the newline.
    my $innerArray = shift;

    return unless @{$innerArray};

    print join(",", @{$innerArray}) . "\n";
    return;
}
165

  
166
#
167
# For each entity in the list, try to cache the entity after downloading it
168
# and return information about the size of each entity
169
#
170
sub cacheEntities {
    # Download and cache every entity in the list and record its outcome.
    #
    # Arguments:
    #   $cache   - cache object providing get() and set(), keyed by URL
    #   $listRef - reference to an array of [packageId, entityUrl] pairs
    # Returns a reference to an array of [packageId, entityUrl, status]
    # records, where status is
    #   -2  for "ecogrid:" URLs, which are not downloaded,
    #   -1  when the download failed,
    #   otherwise the length of the entity data.
    # Each record is also printed through printArray() as it is produced.
    my $cache = shift;
    my $listRef = shift;

    my @entityMetadata;

    # Create a user agent object for downloading from URLs
    my $ua = LWP::UserAgent->new;
    $ua->agent("Metacat Harvester 1.0 ");

    # Loop through all of the entity URLs
    foreach my $pair (@{$listRef}) {
        my ($packageId, $entityUrl) = @{$pair};

        # The outcome is tracked in one variable that is separate from the
        # entity data, so that an error code can neither be lost nor be
        # confused with downloaded content that happens to look numeric.
        my $status;

        if ($entityUrl =~ /^ecogrid:/) {
            #print "Need to process Ecogrid uri: ", $entityUrl, "\n";
            $status = -2;
        } else {
            # For regular URLs, use the cached copy when there is one;
            # otherwise download the entity and save it to the cache
            my $entity = $cache->get( $entityUrl );
            if ( !defined $entity ) {
                $entity = downloadEntity($ua, $entityUrl);

                # downloadEntity signals failure by returning -1.  The
                # string comparison keeps content such as "-1.0" from being
                # mistaken for that marker; a body of exactly "-1" remains
                # indistinguishable from a failure.
                if (!defined $entity || $entity eq "-1") {
                    #print("Error on download for $entityUrl\n");
                    $status = -1;
                } else {
                    # write the data to cache, using URL as key
                    $cache->set( $entityUrl, $entity, "never" );
                }
            }
            $status = length($entity) unless defined $status;
        }

        # Record metadata about this entity
        my $info = [$packageId, $entityUrl, $status];
        printArray($info);
        push(@entityMetadata, $info);
    }
    return \@entityMetadata;
}
217

  
218
#
219
# Download a single entity from a given URL and return it, or return -1 on error
220
#
221
sub downloadEntity {
    # Fetch a single entity over HTTP.
    #
    # Arguments:
    #   $ua  - user agent object whose request() method performs the fetch
    #   $url - URL of the entity to download
    # Returns the content of the response on success, or -1 on failure.
    #
    # The failure report goes to STDERR so that it does not get mixed into
    # the comma-separated records that are printed on STDOUT.
    my ($ua, $url) = @_;

    # Create a GET request and pass it to the user agent
    my $req = HTTP::Request->new(GET => $url);
    my $res = $ua->request($req);

    # Check the outcome of the response
    if ($res->is_success) {
        return $res->content;
    }

    print STDERR "Error on download for $url: ", $res->status_line, "\n";
    return -1;
}
239

  
240
# 
241
# Create a new cache to be used for storing downloaded entities
242
#
243
sub initializeCache {
    # Create the file cache used for storing downloaded entities.
    #
    # Arguments:
    #   $cacheDir - directory handed to the cache as its cache root
    # Returns the new Cache::FileCache object.
    my $cacheDir = shift;

    # Direct class method call; the indirect form "new Cache::FileCache"
    # depends on how the parser resolves the bareword and is best avoided.
    my $cache = Cache::FileCache->new();
    $cache->set_cache_root($cacheDir);

    return $cache;
}
251

  
0 252

  

Also available in: Unified diff