Project

General

Profile

« Previous | Next » 

Revision 3762

Added by Matt Jones about 16 years ago

Fixes to record sizes of ecogrid files, and to flag HTML files.

View differences:

cache_eml_data.pl
33 33
# Get a list of EML documents
34 34
#my $queryTerm = "%Jones%";
35 35
my $queryTerm = "%";
36

  
36 37
my $result = executeQuery($metacat, $queryTerm);
37 38

  
38 39
# Extract an array of all of the entity URLs for each EML document
......
179 180
    # Create a user agent object for downloading from URLs
180 181
    my $ua = LWP::UserAgent->new;
181 182
    $ua->agent("Metacat Harvester 1.0 ");
183
    $ua->timeout(600);
182 184

  
183 185
    # Loop through all of the entity URLs
184 186
    for (my $i = 0; $i <= $#{$listRef}; $i++) {
......
188 190
        my $entityUrl = $$listRef[$i][1];
189 191
        if ($entityUrl =~ /^ecogrid:/) {
190 192
            #print "Need to process Ecogrid uri: ", $entityUrl, "\n";
191
            $entity = -2;
192
            $entitySize = 0;
193
            my $dataDir = '/var/metacat/data/';
194
            my $pos = length("ecogrid://knb/");
195
            my $entityId = substr($entityUrl, $pos);
196
            #print "Looking for Ecogrid file: ", $dataDir . $entityId, "\n";
197
            my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,$atime,
198
                $mtime,$ctime,$blksize,$blocks) = stat($dataDir . $entityId);
199
            #print "Got Ecogrid size: ", $size, "\n";
200
            $entity = 1;
201
            $entitySize = $size;
193 202
        } else {
194 203
            # For regular URLs, check if its in the cache already, and use
195
            # it if it is.  If not, download it and save it to the cache
204
            # it if it is.  If not, download it and save it to the cache, but
205
            # only if its not an HTML file (test for <html> is simplistic)
196 206
            my $entity = $cache->get( $entityUrl );
197 207
            if ( defined $entity ) {
198
                $entitySize = length($entity);
208
                if ($entity =~ /<html>/) {
209
                    $entity = -2;
210
                    $entitySize = -2;
211
                    $cache->remove( $entityUrl );
212
                } else {
213
                    $entitySize = length($entity);
214
                }
199 215
            } else {
200 216
                $entity = downloadEntity($ua, $entityUrl);
201 217
                if ($entity == -1) {
202
                    $entitySize = 0;
218
                    $entitySize = -1;
203 219
                    #print("Error on download for $entityUrl\n");
220
                } elsif ($entity =~ /<html>/) {
221
                    $entity = -2;
222
                    $entitySize = -2;
204 223
                } else {
205 224
                    # write the data to cache, using URL as key
206 225
                    $cache->set( $entityUrl, $entity, "never" );
......
235 254
    if ($res->is_success) {
236 255
        return $res->content;
237 256
    } else {
238
        print $res->status_line, "\n";
257
        #print $res->status_line, "\n";
239 258
        return -1;
240 259
    }
241 260
}

Also available in: Unified diff