Revision 3762
Added by Matt Jones almost 17 years ago
src/perl/cache_eml_data.pl | ||
---|---|---|
33 | 33 |
# Get a list of EML documents |
34 | 34 |
#my $queryTerm = "%Jones%"; |
35 | 35 |
my $queryTerm = "%"; |
36 |
|
|
36 | 37 |
my $result = executeQuery($metacat, $queryTerm); |
37 | 38 |
|
38 | 39 |
# Extract an array of all of the entity URLs for each EML document |
... | ... | |
179 | 180 |
# Create a user agent object for downloading from URLs |
180 | 181 |
my $ua = LWP::UserAgent->new; |
181 | 182 |
$ua->agent("Metacat Harvester 1.0 "); |
183 |
$ua->timeout(600); |
|
182 | 184 |
|
183 | 185 |
# Loop through all of the entity URLs |
184 | 186 |
for (my $i = 0; $i <= $#{$listRef}; $i++) { |
... | ... | |
188 | 190 |
my $entityUrl = $$listRef[$i][1]; |
189 | 191 |
if ($entityUrl =~ /^ecogrid:/) { |
190 | 192 |
#print "Need to process Ecogrid uri: ", $entityUrl, "\n"; |
191 |
$entity = -2; |
|
192 |
$entitySize = 0; |
|
193 |
my $dataDir = '/var/metacat/data/'; |
|
194 |
my $pos = length("ecogrid://knb/"); |
|
195 |
my $entityId = substr($entityUrl, $pos); |
|
196 |
#print "Looking for Ecogrid file: ", $dataDir . $entityId, "\n"; |
|
197 |
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,$atime, |
|
198 |
$mtime,$ctime,$blksize,$blocks) = stat($dataDir . $entityId); |
|
199 |
#print "Got Ecogrid size: ", $size, "\n"; |
|
200 |
$entity = 1; |
|
201 |
$entitySize = $size; |
|
193 | 202 |
} else { |
194 | 203 |
# For regular URLs, check if its in the cache already, and use |
195 |
# it if it is. If not, download it and save it to the cache |
|
204 |
# it if it is. If not, download it and save it to the cache, but |
|
205 |
# only if its not an HTML file (test for <html> is simplistic) |
|
196 | 206 |
my $entity = $cache->get( $entityUrl ); |
197 | 207 |
if ( defined $entity ) { |
198 |
$entitySize = length($entity); |
|
208 |
if ($entity =~ /<html>/) { |
|
209 |
$entity = -2; |
|
210 |
$entitySize = -2; |
|
211 |
$cache->remove( $entityUrl ); |
|
212 |
} else { |
|
213 |
$entitySize = length($entity); |
|
214 |
} |
|
199 | 215 |
} else { |
200 | 216 |
$entity = downloadEntity($ua, $entityUrl); |
201 | 217 |
if ($entity == -1) { |
202 |
$entitySize = 0;
|
|
218 |
$entitySize = -1;
|
|
203 | 219 |
#print("Error on download for $entityUrl\n"); |
220 |
} elsif ($entity =~ /<html>/) { |
|
221 |
$entity = -2; |
|
222 |
$entitySize = -2; |
|
204 | 223 |
} else { |
205 | 224 |
# write the data to cache, using URL as key |
206 | 225 |
$cache->set( $entityUrl, $entity, "never" ); |
... | ... | |
235 | 254 |
if ($res->is_success) { |
236 | 255 |
return $res->content; |
237 | 256 |
} else { |
238 |
print $res->status_line, "\n"; |
|
257 |
#print $res->status_line, "\n";
|
|
239 | 258 |
return -1; |
240 | 259 |
} |
241 | 260 |
} |
Also available in: Unified diff
Fixes to record sizes of ecogrid files, and to flag HTML files.