Project

General

Profile

« Previous | Next » 

Revision 5020

Added by Duane Costa over 15 years ago

Bug 3835: Design and implement OAI-PMH compliant harvest subsystem. Develop harvester component of the OAI-PMH harvester/provider pair.

View differences:

lib/oaipmh/runHarvester.bat
1
@echo off
rem Launches the Metacat OAI-PMH harvester.
rem Requires METACAT_HOME to point at the root of the Metacat source tree.
rem All command-line arguments are passed through to OaipmhHarvester.
if not defined METACAT_HOME (
  echo METACAT_HOME is not set.
  exit /b 1
)
rem The set "NAME=value" form keeps paths containing spaces intact.
set "METACAT_CLASSES=%METACAT_HOME%\build\classes"
set "METACAT_LIB=%METACAT_HOME%\lib"
set "LOG4J_PATH=%METACAT_HOME%\build\war\WEB-INF\log4j.properties"
set "JDBC=%METACAT_HOME%\build\war\lib\jdbc.jar;%METACAT_LIB%\postgresql-8.0-312.jdbc3.jar"
set "LIB_JARS=%METACAT_LIB%\xercesImpl.jar;%METACAT_LIB%\utilities.jar;%METACAT_LIB%\log4j-1.2.12.jar;%METACAT_LIB%\xalan.jar"
set "CLASSPATH=%METACAT_CLASSES%;%JDBC%;%LIB_JARS%"
java "-Dlog4j.configuration=%LOG4J_PATH%" edu.ucsb.nceas.metacat.oaipmh.harvester.OaipmhHarvester %*
src/edu/ucsb/nceas/metacat/oaipmh/harvester/ListIdentifiers.java
1
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2009 University of New Mexico and the 
4
 *                  Regents of the University of California
5
 *
6
 *   '$Author: costa $'
7
 *     '$Date: 2009-07-27 17:47:44 -0400 (Mon, 27 Jul 2009) $'
8
 * '$Revision: 4999 $'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23
 * 
24
 * Additional Copyright 2006 OCLC, Online Computer Library Center
25
 * Licensed under the Apache License, Version 2.0 (the "License");
26
 * you may not use this file except in compliance with the License.
27
 * You may obtain a copy of the License at
28
 *
29
 * http://www.apache.org/licenses/LICENSE-2.0
30
 *
31
 * Unless required by applicable law or agreed to in writing, software
32
 * distributed under the License is distributed on an "AS IS" BASIS,
33
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
34
 * See the License for the specific language governing permissions and
35
 * limitations under the License.
36
 */
37

  
38
package edu.ucsb.nceas.metacat.oaipmh.harvester;
39

  
40
import java.io.IOException;
41
import java.io.UnsupportedEncodingException;
42
import java.net.URLEncoder;
43
import javax.xml.parsers.ParserConfigurationException;
44
import javax.xml.transform.TransformerException;
45
import org.xml.sax.SAXException;
46

  
47

  
48
/**
49
 * This class represents an ListIdentifiers response on either the server or on
50
 * the client.
51
 * 
52
 * @author Duane Costa, University of New Mexico, LTER Network Office
53
 * @author Jeffrey A. Young, OCLC Online Computer Library Center
54
 */
55
public class ListIdentifiers extends HarvesterVerb {
56

  
57
  
58
/* Constructors */
59
  
60
  /**
61
   * Mock object constructor (for unit testing purposes)
62
   */
63
  public ListIdentifiers() {
64
    super();
65
  }
66

  
67

  
68
  /**
69
   * Client-side ListIdentifiers verb constructor
70
   * 
71
   * @param baseURL                baseURL of the OAI-PMH provider to be queried
72
   * @param from                   the from date, e.g. "2000-01-01"
73
   * @param until                  the until date. e.g. "2009-12-31"
74
   * @param metadataPrefix         the metadata prefix, e.g. "oai_pmh"
75
   * @param setSpec                the set specifier
76
   * 
77
   * @exception MalformedURLException  the baseURL is bad
78
   * @exception SAXException           the xml response is bad
79
   * @exception IOException            an I/O error occurred
80
   */
81
  public ListIdentifiers(String baseURL, String from, String until, 
82
                         String metadataPrefix, String setSpec) 
83
          throws IOException, ParserConfigurationException,
84
      SAXException, TransformerException {
85
    super(getRequestURL(baseURL, from, until, metadataPrefix, setSpec));
86
  }
87

  
88

  
89
  /**
90
   * Client-side ListIdentifiers verb constructor (resumptionToken version)
91
   * 
92
   * @param baseURL                baseURL of the OAI-PMH provider to be queried
93
   * @param resumptionToken        the resumptionToken string, as returned by
94
   *                               the provider server
95
   * @throws IOException
96
   * @throws ParserConfigurationException
97
   * @throws SAXException
98
   * @throws TransformerException
99
   */
100
  public ListIdentifiers(String baseURL, String resumptionToken)
101
      throws IOException, ParserConfigurationException, SAXException,
102
      TransformerException {
103
    super(getRequestURL(baseURL, resumptionToken));
104
  }
105

  
106
  
107
/* Class methods */
108

  
109
  /**
110
   * Construct the query portion of the http request (non-resumptionToken 
111
   * version)
112
   * @param baseURL                baseURL of the OAI-PMH provider to be queried
113
   * @param from                   the from date, e.g. "2000-01-01"
114
   * @param until                  the until date. e.g. "2009-12-31"
115
   * @param metadataPrefix         the metadata prefix, e.g. "oai_pmh"
116
   * @param setSpec                the set specifier
117
   * 
118
   * @return a String containing the query portion of the http request
119
   */
120
  private static String getRequestURL(String baseURL, String from,
121
                                      String until, String metadataPrefix,
122
                                      String setSpec)
123
  {
124
    StringBuffer stringBuffer = new StringBuffer(baseURL);
125
    stringBuffer.append("?verb=ListIdentifiers");
126
    
127
    if (from != null) stringBuffer.append("&from=").append(from);
128
    if (until != null) stringBuffer.append("&until=").append(until);
129
    if (setSpec != null) stringBuffer.append("&set=").append(setSpec);
130
    stringBuffer.append("&metadataPrefix=").append(metadataPrefix);
131
    
132
    String requestURL = stringBuffer.toString();
133
    return requestURL;
134
  }
135

  
136

  
137
  /**
138
   * Construct the query portion of the http request (resumptionToken version)
139
   * 
140
   * @param baseURL                baseURL of the OAI-PMH provider to be queried
141
   * @param resumptionToken        the resumptionToken string, as returned by
142
   *                               the provider server
143
   * @return a String containing the query portion of the http request
144
   */
145
  private static String getRequestURL(String baseURL, String resumptionToken) 
146
          throws UnsupportedEncodingException 
147
  {
148
    StringBuffer stringBuffer = new StringBuffer(baseURL);
149
    
150
    stringBuffer.append("?verb=ListIdentifiers");
151
    stringBuffer.append("&resumptionToken=");
152
    stringBuffer.append(URLEncoder.encode(resumptionToken, "UTF-8"));
153
    
154
    String requestURL = stringBuffer.toString();
155
    return requestURL;
156
  }
157
  
158
  
159
  /* Instance methods */
160

  
161
  /**
162
   * Get the oai:resumptionToken from the response
163
   * 
164
   * @return the oai:resumptionToken value
165
   * @throws TransformerException
166
   * @throws NoSuchFieldException
167
   */
168
  public String getResumptionToken() 
169
          throws TransformerException, NoSuchFieldException 
170
  {
171
    String schemaLocation = getSchemaLocation();
172
    String resumptionToken = "";
173
    
174
    if (SCHEMA_LOCATION_V2_0.equals(schemaLocation)) {
175
      resumptionToken = getSingleString(
176
                    "/oai20:OAI-PMH/oai20:ListIdentifiers/oai20:resumptionToken"
177
                                       );
178
    } 
179
    else {
180
      throw new NoSuchFieldException(getSchemaLocation());
181
    }
182
    
183
    return resumptionToken;
184
  }
185
 
186
}
src/edu/ucsb/nceas/metacat/oaipmh/harvester/OaipmhHarvester.java
1
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2009 University of New Mexico and the 
4
 *                  Regents of the University of California
5
 *
6
 *   '$Author: costa $'
7
 *     '$Date: 2009-07-27 17:47:44 -0400 (Mon, 27 Jul 2009) $'
8
 * '$Revision: 4999 $'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23
 * 
24
 * Additional Copyright 2006 OCLC, Online Computer Library Center
25
 * Licensed under the Apache License, Version 2.0 (the "License");
26
 * you may not use this file except in compliance with the License.
27
 * You may obtain a copy of the License at
28
 *
29
 * http://www.apache.org/licenses/LICENSE-2.0
30
 *
31
 * Unless required by applicable law or agreed to in writing, software
32
 * distributed under the License is distributed on an "AS IS" BASIS,
33
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
34
 * See the License for the specific language governing permissions and
35
 * limitations under the License.
36
 */
37

  
38
package edu.ucsb.nceas.metacat.oaipmh.harvester;
39

  
40
import java.io.*;
41
import java.lang.NoSuchFieldException;
42
import java.sql.Connection;
43
import java.sql.DriverManager;
44
import java.sql.ResultSet;
45
import java.sql.SQLException;
46
import java.sql.SQLWarning;
47
import java.sql.Statement;
48
import java.util.HashMap;
49
import java.util.StringTokenizer;
50

  
51
import javax.xml.parsers.DocumentBuilder;
52
import javax.xml.parsers.DocumentBuilderFactory;
53
import javax.xml.parsers.ParserConfigurationException;
54
import javax.xml.transform.TransformerException;
55

  
56
import org.apache.log4j.BasicConfigurator;
57
import org.apache.log4j.Logger;
58
import org.w3c.dom.Document;
59
import org.w3c.dom.Element;
60
import org.w3c.dom.Node;
61
import org.w3c.dom.NodeList;
62
import org.w3c.dom.Text;
63
import org.xml.sax.InputSource;
64
import org.xml.sax.SAXException;
65

  
66
import edu.ucsb.nceas.metacat.client.InsufficientKarmaException;
67
import edu.ucsb.nceas.metacat.client.Metacat;
68
import edu.ucsb.nceas.metacat.client.MetacatException;
69
import edu.ucsb.nceas.metacat.client.MetacatFactory;
70
import edu.ucsb.nceas.metacat.client.MetacatInaccessibleException;
71
import edu.ucsb.nceas.metacat.service.PropertyService;
72
import edu.ucsb.nceas.metacat.shared.ServiceException;
73
import edu.ucsb.nceas.metacat.util.SystemUtil;
74
import edu.ucsb.nceas.utilities.PropertyNotFoundException;
75

  
76

  
77
/**
78
 * Main class for running the OAI-PMH Harvester program
79
 * 
80
 * @author dcosta
81
 *
82
 */
83
public class OaipmhHarvester {
84
  
85
  
86
  /* Class variables */
87

  
88
  private static final String METACAT_CONFIG_DIR = "../../build/war/WEB-INF";
89
  private static HashMap<String, String> metacatDatestamps = 
90
                                                  new HashMap<String, String>();
91
  private static HashMap<String, Integer> metacatRevisions = 
92
                                                 new HashMap<String, Integer>();
93
  private static Metacat metacatClient = null;
94
  private static String metacatURL = null;
95

  
96
  private static Logger logger = Logger.getLogger(OaipmhHarvester.class);
97
  static {
98
    BasicConfigurator.configure();
99
  }
100
  
101
  /*
102
   * Query string to determine the 'date_updated' value stored
103
   * in Metacat's 'xml_documents' table for a given docid value.
104
   */
105
  private static final String METACAT_QUERY =
106
                           "SELECT docid, rev, date_updated FROM xml_documents";
107

  
108

  
109
  /* Class methods */
110
  
111
  /**
112
   * Converts a Dryad identifier to a Metacat docid (scope + identifier)
113
   * 
114
   * @param dryadID  The Dryad identifier, e.g.
115
   *                 "oai:dryad-dev.nescent.org:10255/dryad.12"
116
   * @return  Metacat docid, e.g. "10255/dryad.12"
117
   */
118
  private static String docidFromDryadIdentifier(String dryadID) {
119
    String docid = null;
120
    String scopeAndIdentifier = null;
121
    String scope = null;
122
    String identifier = null;  
123
    StringTokenizer stringTokenizer = new StringTokenizer(dryadID, ":");
124
    
125
    String token = null;
126
    int tokenCount = stringTokenizer.countTokens();
127
    int i = 1;    
128
    while (stringTokenizer.hasMoreTokens()) {
129
      token = stringTokenizer.nextToken();
130
      if (i == tokenCount) { scopeAndIdentifier = token; }
131
      i++;
132
    }
133
    
134
    if (scopeAndIdentifier != null) {
135
      stringTokenizer = new StringTokenizer(scopeAndIdentifier, ".");
136
      
137
      tokenCount = stringTokenizer.countTokens();
138
      if (tokenCount == 2) {  
139
        i = 1;
140
        while (stringTokenizer.hasMoreTokens()) {
141
          token = stringTokenizer.nextToken();
142
          if (i == (tokenCount - 1)) { scope = token; }
143
          if (i == tokenCount) { identifier = token; }
144
          i++;
145
        }
146
      }
147
      else {
148
        logger.error("Error parsing Dryad identifier: " + dryadID);
149
      }
150
    }
151
    
152
    if (scope != null && identifier != null) {
153
      scope = scope.replace('/', '-'); // Metacat doesn't allow '/' in docid
154
      docid = scope + "." + identifier;
155
    }
156
    
157
    return docid;
158
  }
159
  
160
  
161
  /**
162
   * Converts an OAI-PMH identifier to a Metacat docid (scope + identifier)
163
   * 
164
   * @param   identifier    the OAI-PMH identifier
165
   * @return  docid         Metacat docid
166
   */
167
  private static String docidFromIdentifier(String identifier) {
168
    String docid = null;
169
    
170
    /*
171
     * Call the appropriate method to convert identifier to a Metacat docid.
172
     */
173
    if (identifier != null) {
174
      /*
175
       * Check for LSID syntax.
176
       */
177
      if (identifier.startsWith("urn:lsid:")) {
178
        docid = docidFromLSID(identifier);
179
      }
180
      /* Dryad identifier: http://hdl.handle.net/10255/dryad.66
181
       * Equivalent Metacat identifier: 10255-dryad.66.1
182
       */
183
      else if (identifier.contains("/dryad.")) {
184
        docid = docidFromDryadIdentifier(identifier);
185
      }
186
    }
187
    
188
    return docid;
189
  }
190
  
191
  
192
  /**
193
   * Converts an LSID identifier to a Metacat docid (scope + identifier)
194
   * 
195
   * @param lsidIdentifier  The LSID identifier, e.g.
196
   *                        "urn:lsid:knb.ecoinformatics.org:knb-lter-sgs:6"
197
   * @return  Metacat docid, e.g. "knb-lter-sgs.6"
198
   */
199
  private static String docidFromLSID(String lsidIdentifier) {
200
    String docid = null;
201
    String scope = null;
202
    String identifier = null;  
203
    StringTokenizer stringTokenizer = new StringTokenizer(lsidIdentifier, ":");
204
    
205
    int tokenCount = stringTokenizer.countTokens();
206
    int i = 1;    
207
    while (stringTokenizer.hasMoreTokens()) {
208
      String token = stringTokenizer.nextToken();
209
      if (i == (tokenCount - 1)) { scope = token; }
210
      if (i == tokenCount) { identifier = token; }
211
      i++;
212
    }
213
    
214
    if (scope != null && identifier != null) {
215
      docid = scope + "." + identifier;
216
    }
217
    
218
    return docid;
219
  }
220
  
221
  
222
  /**
223
   * Extracts the metadata content from the XML string returned by the GetRecord
224
   * verb.
225
   * 
226
   * @param getRecordString    The XML string returned by the GetRecord verb
227
   *                           operation.
228
   * @return  metadataString   The document string extracted from the GetRecord
229
   *                           XML string.
230
   */
231
  private static String extractMetadata(String getRecordString) {
232
    String metadataString = null;
233
    StringBuffer stringBuffer = 
234
               new StringBuffer("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
235
    
236
    /* The document string is everything between the <metadata> and </metadata>
237
     * tags.
238
     */
239
    int metadataStartIndex = getRecordString.indexOf("<metadata>");
240
    int metadataEndIndex = getRecordString.indexOf("</metadata>");
241

  
242
    if ((metadataStartIndex >= 0) &&
243
        (metadataEndIndex >= 0) &&
244
        (metadataStartIndex < metadataEndIndex)
245
       ) {
246
      int startPosition = metadataStartIndex + "<metadata>".length();
247
      int endPosition = metadataEndIndex;
248
      String docString = getRecordString.substring(startPosition, endPosition);
249
      stringBuffer.append(docString);
250
      stringBuffer.append("\n");
251
      metadataString = stringBuffer.toString();
252
    }
253
    
254
    return metadataString;
255
  }
256
  
257
  
258
  /**
259
   * Returns a connection to the database. Opens the connection if a connection
260
   * has not already been made previously.
261
   * 
262
   * @return  conn  the database Connection object
263
   */
264
  private static Connection getConnection() {
265
    Connection conn = null;
266
    String dbDriver = "";
267
    String defaultDB = null;
268
    String password = null;
269
    String user = null;
270
    SQLWarning warn;
271
    
272
    if (conn == null) {
273
        try {
274
          dbDriver = PropertyService.getProperty("database.driver");
275
          defaultDB = PropertyService.getProperty("database.connectionURI");
276
          password = PropertyService.getProperty("database.password");
277
          user = PropertyService.getProperty("database.user");
278
        } 
279
        catch (PropertyNotFoundException pnfe) {
280
          logger.error("Can't find database connection property " + pnfe);
281
          System.exit(1);
282
        }
283

  
284
      // Load the jdbc driver
285
      try {
286
        Class.forName(dbDriver);
287
      }
288
      catch (ClassNotFoundException e) {
289
        logger.error("Can't load driver " + e);
290
        System.exit(1);
291
      } 
292

  
293
      // Make the database connection
294
      try {
295
        conn = DriverManager.getConnection(defaultDB, user, password);
296

  
297
        // If a SQLWarning object is available, print its warning(s).
298
        // There may be multiple warnings chained.
299
        warn = conn.getWarnings();
300
      
301
        if (warn != null) {
302
          while (warn != null) {
303
            logger.warn("SQLState: " + warn.getSQLState());
304
            logger.warn("Message:  " + warn.getMessage());
305
            logger.warn("Vendor: " + warn.getErrorCode());
306
            warn = warn.getNextWarning();
307
          }
308
        }
309
      }
310
      catch (SQLException e) {
311
        logger.error("Database access failed " + e);
312
        System.exit(1);
313
      }
314
    }
315
    
316
    return conn;
317
  }
318

  
319

  
320
  /**
321
   * Parses command line options and packages them into a HashMap.
322
   *  
323
   * @param   args     array of command-line strings
324
   * @return  options  HashMap of option/value pairs
325
   */
326
  private static HashMap<String, String> getOptions(String[] args) {
327
    HashMap<String, String> options = new HashMap<String, String>();
328
    boolean foundDN = false;
329
    boolean foundPassword = false;
330
        
331
    for (int i=0; i<args.length; ++i) {
332
      if (args[i].charAt(0) != '-') {
333
        options.put("baseURL", args[i]);
334
      } 
335
      else if (i + 1 < args.length) {
336
        if (args[i].equals("-dn")) { foundDN = true; }
337
        if (args[i].equals("-password")) { foundPassword = true; }
338
        options.put(args[i], args[++i]);
339
      }
340
      else {
341
        throw new IllegalArgumentException();
342
      }
343
    }
344
    
345
    // Check for required command-line options "-dn" and "-password"
346
    if (!foundDN || !foundPassword) { throw new IllegalArgumentException(); }
347
    
348
    return options;
349
  }
350
  
351
  
352
  /**
353
   * Boolean to determine whether the content returned from the GetRecord verb
354
   * indicates a deleted document.
355
   * 
356
   * @param    getRecordString    the content returned by the GetRecord verb
357
   * @return   true if this is a deleted record, else false
358
   */
359
  private static boolean isDeletedRecord(String getRecordString) {
360
    boolean isDeleted = false;
361
    final String DELETED_FLAG_1 = "status=\"deleted\"";
362
    final String DELETED_FLAG_2 = "status='deleted'";
363
    
364
    if (getRecordString != null) {
365
      if ((getRecordString.contains(DELETED_FLAG_1) ||
366
           getRecordString.contains(DELETED_FLAG_2)
367
          ) &&
368
          !getRecordString.contains("<metadata>")
369
         ) {
370
        isDeleted = true;
371
      }
372
    }
373
    
374
    return isDeleted;
375
  }
376

  
377
  
378
  /**
379
   * Load datestamps for all Metacat documents. This will be used to determine
380
   * whether the document in the OAI-PMH repository is newer than the copy
381
   * in Metacat. If it is newer, the document should be harvested.
382
   */
383
  private static void loadMetacatCatalog() {
384
    try {
385
      Connection conn = getConnection();    
386

  
387
      if (conn != null) {
388
        Statement stmt = conn.createStatement();                          
389
        ResultSet rs = stmt.executeQuery(METACAT_QUERY);
390
        while (rs.next()) {
391
          String docid = rs.getString("docid");
392
          String dateUpdated = rs.getDate("date_updated").toString();
393
          int rev = rs.getInt("rev");
394
          Integer revInteger = new Integer(rev);
395
          metacatDatestamps.put(docid, dateUpdated);
396
          metacatRevisions.put(docid, revInteger);
397
        }
398
        stmt.close();   
399
        conn.close();
400
      }
401
    }
402
    catch(SQLException e) {
403
      metacatDatestamps = null;
404
      metacatRevisions = null;
405
      logger.error("SQLException: " + e.getMessage());
406
    }
407
  }
408
    
409
  
410
  /**
411
   * Loads OaipmhHarvester properties from a configuration file. These are
412
   * configuration values that are not specified on the command line, such
413
   * as the database connection values. They are typically stored in the
414
   * 'metacat.properties' file.
415
   * 
416
   * @param   metacatConfigDir   The metacat configuration directory.
417
   *                             Typically, the directory in which the
418
   *                             'metacat.properties' file is found.
419
   */
420
  private static void loadProperties(String metacatConfigDir) {   
421

  
422
    try {
423
        PropertyService.getTestInstance(metacatConfigDir);
424
    } 
425
    catch (ServiceException e) {
426
      logger.error("Error in loading properties: " + e.getMessage());
427
    }
428
  }
429
  
430
  
431
  /**
432
   * The main() method.
433
   * 
434
   * @param args    
435
   * 
436
   * Command line arguments:
437
   * 
438
   *  -dn distinguished_name    -- LDAP user name of the harvester account
439
   *  -password password        -- LDAP password of the harvester account
440
   *  <-metacatConfigdir dir>   -- Directory where metacat.properties file is
441
   *                               found.
442
   *  <-from date>              -- from date of the harvest documents
443
   *  <-until date>             -- until date of the harvest documents
444
   *  <-metadataPrefix prefix>  -- metadata prefix of the harvest documents,
445
   *                               e.g. 'oai_dc'
446
   *  <-setSpec setName>        -- set specification of the harvest documents
447
   *  baseURL                   -- base URL of the OAI-PMH data provider
448
   *
449
   *  Command options appearing inside angle brackets (<>) are optional.
450
   */
451
  public static void main(String[] args) {
452
    try {	    
453
      HashMap<String, String> options = getOptions(args);
454
      String baseURL = options.get("baseURL");
455
      String dn = options.get("-dn");                 // LDAP distinguished name
456
      String password = options.get("-password");     // LDAP password
457
      String from = (String) options.get("-from");
458
      String until = (String) options.get("-until");
459
      String metadataPrefix = (String) options.get("-metadataPrefix");
460
      String metacatConfigDir = (String) options.get("-metacatConfigDir");
461
      String setSpec = (String) options.get("-setSpec");
462
      
463
      /* Use default values if the values aren't specified on command line */
464
      if (metadataPrefix == null) { metadataPrefix = "oai_dc"; }
465
      if (metacatConfigDir == null) { metacatConfigDir = METACAT_CONFIG_DIR; }
466

  
467
      OaipmhHarvester.loadProperties(metacatConfigDir);
468
      metacatURL = SystemUtil.getServletURL();
469
      metacatClient = MetacatFactory.createMetacatConnection(metacatURL);
470
      OaipmhHarvester.loadMetacatCatalog();
471
      
472
      /* 
473
       * If the Metacat catalog failed to load then we can't continue on.
474
       */
475
      if ((metacatURL != null) && 
476
          (metacatClient != null) && 
477
          (metacatDatestamps != null)
478
         ) {
479
        run(baseURL, dn, password, from, until, metadataPrefix, setSpec); 
480
      }
481
      else {
482
        logger.error("Unable to load document catalog from Metacat database.");
483
      }
484
    }
485
	catch (IllegalArgumentException e) {
486
      logger.error("OaipmhHarvester " +
487
                   "-dn distinguished_name " +
488
                   "-password password " +
489
                   "<-from date> " +
490
                   "<-until date> " +
491
                   "<-metadataPrefix prefix> " +
492
                   "<-setSpec setName> " +
493
                   "baseURL"
494
                  );
495
	}
496
    catch (MetacatInaccessibleException e) {
497
      logger.error("MetacatInaccessibleException:\n" + e.getMessage());
498
    }
499
    catch (PropertyNotFoundException e) {
500
      logger.error("PropertyNotFoundException: " + 
501
             "unable to determine metacat URL from SystemUtil.getServletURL()");
502
    }
503
    catch (IOException e) {
504
      logger.error("Error reading EML document from metacat:\n" + 
505
                   e.getMessage()
506
                  );
507
    }
508
	catch (Exception e) {
509
	  e.printStackTrace();
510
	  System.exit(-1);
511
	}
512
  }
513

  
514
  
515
  /**
516
   * Determines the datestamp for a Metacat document based on the 'date_updated'
517
   * value stored in the Metacat database for a given 'docid' value.
518
   * 
519
   * @param   docid    The metacat docid (scope + revision).
520
   * @return  String representing the 'date_updated' value stored in the Metacat
521
   *          database for this document based on its 'docid' value.
522
   */
523
  private static String metacatDatestamp(String docid) {
524
    String metacatDatestamp = metacatDatestamps.get(docid);
525

  
526
    return metacatDatestamp;
527
  }
528
  
529
  
530
  /**
531
   * Boolean to determine whether Metacat has a document with the specified
532
   * docid.
533
   * 
534
   * @param   docid                   Metacat docid value
535
   * @return  true if Metacat has this docid, else false
536
   */
537
  private static boolean metacatHasDocid(String docid) {
538
    boolean hadDocid = false;
539
    String metacatDatestamp = metacatDatestamp(docid);
540

  
541
    if (metacatDatestamp != null) {
542
      hadDocid = true;                // Metacat has the docid
543
    }
544
    
545
    return hadDocid;
546
  }
547
  
548

  
549
  /**
550
   * Login to Metacat using the ldapDN and ldapPwd
551
   * 
552
   * @param  ldapDN   the LDAP distinguished name, e.g.
553
   *                  "uid=dryad,o=LTER,dc=ecoinformatics,dc=org"
554
   * @param  ldapPwd  the corresponding LDAP password string
555
   */
556
  private static void metacatLogin(String ldapDN, String ldapPwd) {
557
      try {
558
        logger.info("Logging in to Metacat: " + ldapDN);
559
        String response = metacatClient.login(ldapDN, ldapPwd);
560
        logger.info("Metacat login response: " + response);
561
      } 
562
      catch (MetacatInaccessibleException e) {
563
        logger.error("Metacat login failed." + e.getMessage());
564
      } 
565
      catch (Exception e) {
566
        logger.error("Metacat login failed." + e.getMessage());
567
      }
568
  }
569
  
570
  
571
  /**
572
   * Logout from Metacat
573
   */
574
  private static void metacatLogout() {
575
    try {    
576
      // Log out from the Metacat session
577
      logger.info("Logging out from Metacat");
578
      metacatClient.logout();
579
    }
580
    catch (MetacatInaccessibleException e) {
581
      logger.error("Metacat inaccessible: " + e.getMessage());
582
    }
583
    catch (MetacatException e) {
584
      logger.error("Metacat exception: " + e.getMessage());
585
    }
586
  }
587
 
588

  
589
  /**
590
   * Determines the revision for a Metacat document based on the 'rev'
591
   * value stored in the Metacat database for a given 'docid' value.
592
   * 
593
   * @param   docid    The metacat docid (scope + revision).
594
   * @return  Integer representing the 'rev' value stored in the Metacat
595
   *          database for this document based on its 'docid' value.
596
   */
597
  private static Integer metacatRevision(String docid) {
598
    Integer metacatRevision = metacatRevisions.get(docid);
599

  
600
    return metacatRevision;
601
  }
602
  
603
  
604
  /**
605
   * Process the output of the ListIdentifiers verb. For each identifier
606
   * listed, determine whether the document should be harvested (inserted or
607
   * updated), deleted, or if no action is needed.
608
   * 
609
   * @param baseURL          The base URL of the data provider.
610
   * @param from             Value of 'from' option, a date string or null
611
   * @param until            Value of 'until' option, a date string or null
612
   * @param metadataPrefix   Value of 'metadataPrefix' option, may be null
613
   * @param setSpec          Value of 'setSpec' option, may be null
614
   * @param xmlString        The XML string from ListIdentifiers
615
   * @param principal        Distinguished name of the LDAP account for the
616
   *                         harvester user, 
617
   *                         e.g. "uid=dryad,o=LTER,dc=ecoinformatics,dc=org"
618
   */
619
  private static void processListIdentifiers(String baseURL, 
620
                                             String from, 
621
                                             String until,
622
                                             String metadataPrefix,
623
                                             String setSpec,
624
                                             String xmlString,
625
                                             String principal) {
626
    DocumentBuilderFactory documentBuilderFactory =
627
                                           DocumentBuilderFactory.newInstance();
628
    StringReader stringReader = new StringReader(xmlString);
629
     
630
    try {
631
      DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
632
      InputSource inputSource = new InputSource(stringReader);
633
      Document document = documentBuilder.parse(inputSource);
634
      Element rootElement = document.getDocumentElement();
635
      NodeList nodeList = rootElement.getChildNodes();
636
      
637
      for (int i = 0; i < nodeList.getLength(); i++) {
638
        Node child = nodeList.item(i);
639
        
640
        if (child instanceof Element) {
641
          Element childElement = (Element) child;
642

  
643
          if (childElement.getTagName().equals("ListIdentifiers")) {
644
            NodeList listIdentifiersNodeList = childElement.getChildNodes();
645
            
646
            for (int j = 0; j < listIdentifiersNodeList.getLength(); j++) {
647
              Node listIdentifiersNode = listIdentifiersNodeList.item(j);
648
              
649
              if (listIdentifiersNode instanceof Element) {
650
                Element listIdentifiersElement = (Element) listIdentifiersNode;
651

  
652
                if (listIdentifiersElement.getTagName().equals("header")) {
653
                  NodeList headerNodeList = listIdentifiersElement.getChildNodes();
654
                  String identifier = null;
655
                  String datestamp = null;
656
                  
657
                  for (int k = 0; k < headerNodeList.getLength(); k++) {
658
                    Node headerNode = headerNodeList.item(k);
659
                    
660
                    if (headerNode instanceof Element) {
661
                      Element headerElement = (Element) headerNode;
662
                      
663
                      if (headerElement.getTagName().equals("identifier")) {
664
                        Text textNode = (Text) headerElement.getFirstChild();
665
                        identifier = textNode.getData().trim();
666
                      }
667
                      else if (headerElement.getTagName().equals("datestamp")) {
668
                        Text textNode = (Text) headerElement.getFirstChild();
669
                        datestamp = textNode.getData().trim();
670
                      }             
671
                    }
672
                  }
673
                  
674
                  if (identifier != null) {
675
                    String docid = docidFromIdentifier(identifier);
676
                    logger.debug("identifier: " + identifier + 
677
                                 "; docid: " + docid + 
678
                                 "; datestamp: " + datestamp);
679
       
680
                    if (docid != null) { 
681
                      if (shouldHarvestDocument(docid, datestamp)) {                    
682
                        GetRecord getRecord = 
683
                             new GetRecord(baseURL, identifier, metadataPrefix);
684
                        getRecord.runVerb();  // Run the GetRecord verb
685
                        
686
                        NodeList errors = getRecord.getErrors();
687
                        if (errors != null && errors.getLength() > 0) {
688
                          logger.error("Found errors in GetRecord results");
689
                          int length = errors.getLength();
690

  
691
                          for (int l = 0; l < length; ++l) {
692
                            Node item = errors.item(l);
693
                            logger.error(item);
694
                          }
695

  
696
                          logger.error("Error record: " + getRecord.toString());
697
                        }
698
                        else {
699
                          String getRecordString = getRecord.toString();
700
                          boolean isDeleted = isDeletedRecord(getRecordString);
701
                          
702
                          if (isDeleted) {
703
                            logger.info("GetRecord indicates deleted record: " + 
704
                                        docid);
705
                            if (metacatHasDocid(docid)) {
706
                              logger.info(
707
                                        "Deleting " + docid + " from Metacat.");
708
                              String deleteReturnString = null;
709
                              deleteReturnString = metacatClient.delete(docid);
710
                              if (deleteReturnString != null && 
711
                                  !deleteReturnString.equals("")) {
712
                                logger.info(deleteReturnString);
713
                              }
714
                            }                           
715
                          }
716
                          else {
717
                            String metadataString = 
718
                                               extractMetadata(getRecordString);
719
                            uploadToMetacat(docid, datestamp, metadataString, 
720
                                            principal);
721
                          }
722
                        }
723
                      }
724
                      else {
725
                        logger.info(
726
                          "Not harvesting docid '" + docid + 
727
                          "' from the OAI-PMH provider. " +
728
                          "Metacat already has this document at datestamp '" + 
729
                          datestamp + "' or higher.");
730
                      }
731
                    }
732
                    else {
733
                      logger.warn("Unrecognized identifier format: " +
734
                                  identifier);
735
                    }
736
                  }
737
                }             
738
              }
739
            }
740
          }
741
        }
742
      }
743
    }
744
    catch (Exception e) {
745
      logger.error("General exception:\n" + e.getMessage());
746
      e.printStackTrace();
747
    }
748
  }
749
  
750
  
751
  /**
752
   * Runs a OAI-PMH harvest.
753
   * 
754
   * @param baseURL          The base URL of the data provider.
755
   * @param dn               Value of 'dn' option, a LDAP distinguished name,
756
   *                         e.g. "uid=dryad,o=LTER,dc=ecoinformatics,dc=org"
757
   * @param password         Value of 'password' option, a string
758
   * @param from             Value of 'from' option, a date string or null
759
   * @param until            Value of 'until' option, a date string or null
760
   * @param metadataPrefix   Value of 'metadataPrefix' option, may be null
761
   * @param setSpec          Value of 'setSpec' option, may be null
762
   * @throws IOException
763
   * @throws ParserConfigurationException
764
   * @throws SAXException
765
   * @throws TransformerException
766
   * @throws NoSuchFieldException
767
   */
768
  public static void run(String baseURL, String dn, String password, 
769
                         String from, String until,
770
                         String metadataPrefix, String setSpec
771
                        )
772
          throws IOException, ParserConfigurationException, SAXException, 
773
                 TransformerException, NoSuchFieldException 
774
  {
775
    logger.info("Starting OAI-PMH Harvester.");
776
    if ((dn != null) && (password != null)) {
777
      metacatLogin(dn, password);
778
    }
779
    else {
780
      logger.error("Distinguished name (-dn) and/or password (-password) " +
781
      		       "were not specified.");
782
      return;
783
    }
784
    
785
    ListIdentifiers listIdentifiers = 
786
             new ListIdentifiers(baseURL, from, until, metadataPrefix, setSpec);
787
    listIdentifiers.runVerb();
788
    
789
    while (listIdentifiers != null) {
790
      NodeList errors = listIdentifiers.getErrors();
791

  
792
      if (errors != null && errors.getLength() > 0) {
793
        logger.error("Found errors in ListIdentifier results");
794
        int length = errors.getLength();
795

  
796
        for (int i = 0; i < length; ++i) {
797
          Node item = errors.item(i);
798
          logger.error(item);
799
        }
800

  
801
        logger.error("Error record: " + listIdentifiers.toString());
802
        break;
803
      }
804

  
805
      String xmlString = listIdentifiers.toString();
806
      processListIdentifiers(baseURL, from, until, metadataPrefix, setSpec,
807
                             xmlString, dn);
808
      String resumptionToken = listIdentifiers.getResumptionToken();
809
      logger.debug("resumptionToken: " + resumptionToken);
810

  
811
      if (resumptionToken == null || resumptionToken.length() == 0) {
812
        listIdentifiers = null;
813
      } 
814
      else {
815
        listIdentifiers = new ListIdentifiers(baseURL, resumptionToken);
816
        listIdentifiers.runVerb();
817
      }
818
    }
819

  
820
    metacatLogout();
821
    logger.info("Shutting down OAI-PMH Harvester.");
822
  }
823
  
824
  
825
  /**
826
   * Should a document be harvested? Compare the OAI-PMH provider datestamp to 
827
   * the Metacat datestamp (the 'last_updated' date). If the Metacat datestamp 
828
   * is unknown, or if it's less than the OAI-PMH datestamp, then the document
829
   * should be harvested.
830
   *  
831
   * @param docid                   The Metacat docid value.
832
   * @param providerDatestamp       The OAI-PMH provider datestamp.
833
   * @return   true if the document should be harvested into Metacat, else false
834
   */
835
  private static boolean shouldHarvestDocument(String docid, 
836
                                               String providerTimestamp
837
                                              ) {
838
    String providerDatestamp;
839
    boolean shouldHarvest = false;
840
    String metacatDatestamp = metacatDatestamp(docid);
841
 
842
    /*
843
     * Since Metacat stores its 'last_updated' field as a datestamp (no time),
844
     * we need to strip off the timestamp part of the provider timestamp
845
     * before doing a comparison of the Metacat datestamp to the OAI-PMH
846
     * provider datestamp.
847
     */
848
    if (providerTimestamp.contains("T")) {
849
      int tIndex = providerTimestamp.indexOf('T');
850
      providerDatestamp = providerTimestamp.substring(0, tIndex);
851
    }
852
    else {
853
      providerDatestamp = providerTimestamp;
854
    }
855
    
856
    /*
857
     * If we don't have a Metacat datastamp for this document, or if the
858
     * Metacat datestamp is older than the provider datestamp, then we
859
     * should harvest the document.
860
     */
861
    if (metacatDatestamp == null) {
862
      shouldHarvest = true;
863
    }
864
    else if (metacatDatestamp.compareTo(providerDatestamp) < 0) {
865
        shouldHarvest = true;
866
    }
867
    
868
    return shouldHarvest;
869
  }
870
  
871

  
872
  /**
873
   * Insert or update the document to Metacat. If Metacat already has this
874
   * document, increment the 'rev' number by 1 to update it.
875
   * 
876
   * @param   docid           The Metacat docid
877
   * @param   datestamp       The datestamp in the OAI-PMH provider catalog.
878
   * @param   metadataString  The metadata string extracted by the GetRecord 
879
   * @param   principal       The distinguished name of the principal
880
   *                          verb
881
   * @return  true if the upload succeeded, else false.
882
   */
883
  private static boolean uploadToMetacat(String docid,
884
                                         String datestamp,
885
                                         String metadataString,
886
                                         String principal) {
887
    String docidFull = null;
888
    boolean success = true;
889
    String metacatDatestamp = metacatDatestamp(docid);
890
    Integer metacatRevision = metacatRevision(docid);
891
    boolean insert = false;
892
    StringReader stringReader = null;
893
    boolean update = false;
894
    
895
    if (metadataString != null ) {
896
      stringReader = new StringReader(metadataString);
897

  
898
      /* If metacat already has this document, determine the highest revision in
899
       * metacat and report it to the user; else, insert or delete the document 
900
       * into metacat.
901
       */
902
      if (metacatDatestamp == null) {
903
        insert = true;
904
        int newRevision = 1;
905
        docidFull = docid + "." + newRevision;
906
      }
907
      else if (metacatDatestamp.compareTo(datestamp) < 0) {
908
        update = true;
909
        int newRevision = metacatRevision + 1;
910
        docidFull = docid + "." + newRevision;
911
      }
912
      else if (metacatDatestamp.compareTo(datestamp) == 0) {
913
        logger.warn("Attempting to update " + docid + " to datestamp " + 
914
            datestamp + ". Metacat has document at datestamp " +
915
            metacatDatestamp + ".");
916
      }
917
        
918
      if (insert || update) {
919
        String metacatReturnString = "";
920
        String accessReturnString = "";
921
      
922
        try {
923
          if (insert) {
924
            logger.info("Inserting document: " + docidFull);
925
            metacatReturnString = 
926
                            metacatClient.insert(docidFull, stringReader, null);
927
          
928
            /* Add "all" permission for the dataset owner */
929
            String permission = "all";
930
            String permType = "allow";
931
            String permOrder = "allowFirst";
932
            accessReturnString = metacatClient.setAccess(
933
                             docid, principal, permission, permType, permOrder);
934
            if (accessReturnString != null && !accessReturnString.equals("")) {
935
              logger.info(accessReturnString);
936
            }
937
          
938
            /* Add "read" permission for public users */
939
            permission = "read";
940
            accessReturnString = metacatClient.setAccess(
941
                              docid, "public", permission, permType, permOrder);
942

  
943
            if (accessReturnString != null && !accessReturnString.equals("")) {
944
              logger.info(accessReturnString);
945
            }
946
          }
947
          else if (update) {
948
            logger.info("Updating document: " + docidFull);
949
            metacatReturnString = 
950
                            metacatClient.update(docidFull, stringReader, null);
951
          }
952
        
953
          if (metacatReturnString != null && !metacatReturnString.equals("")) {
954
            logger.info(metacatReturnString);
955
          }
956
        }
957
        catch (MetacatInaccessibleException e) {
958
          logger.error("MetacatInaccessibleException: " + e.getMessage());
959
        }
960
        catch (InsufficientKarmaException e) {
961
          logger.error("InsufficientKarmaException: " + e.getMessage());
962
        }
963
        catch (MetacatException e) {
964
          logger.error("MetacatException: " + e.getMessage());
965
        }
966
        catch (IOException e) {
967
          logger.error("IOException: " + e.getMessage());
968
        }
969
      }
970
    }
971
    
972
    return success;
973
  }
974

  
975
}
src/edu/ucsb/nceas/metacat/oaipmh/harvester/HarvesterVerb.java
1
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2009 University of New Mexico and the 
4
 *                  Regents of the University of California
5
 *
6
 *   '$Author: costa $'
7
 *     '$Date: 2009-07-27 17:47:44 -0400 (Mon, 27 Jul 2009) $'
8
 * '$Revision: 4999 $'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23
 * 
24
 * Additional Copyright 2006 OCLC, Online Computer Library Center
25
 * Licensed under the Apache License, Version 2.0 (the "License");
26
 * you may not use this file except in compliance with the License.
27
 * You may obtain a copy of the License at
28
 *
29
 * http://www.apache.org/licenses/LICENSE-2.0
30
 *
31
 * Unless required by applicable law or agreed to in writing, software
32
 * distributed under the License is distributed on an "AS IS" BASIS,
33
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
34
 * See the License for the specific language governing permissions and
35
 * limitations under the License.
36
 */
37

  
38
package edu.ucsb.nceas.metacat.oaipmh.harvester;
39

  
40
import java.io.FileNotFoundException;
41
import java.io.IOException;
42
import java.io.InputStream;
43
import java.io.StringWriter;
44
import java.net.HttpURLConnection;
45
import java.net.URL;
46
import java.util.Date;
47
import java.util.HashMap;
48
import java.util.StringTokenizer;
49
import java.util.zip.GZIPInputStream;
50
import java.util.zip.InflaterInputStream;
51
import java.util.zip.ZipInputStream;
52

  
53
import javax.xml.parsers.DocumentBuilder;
54
import javax.xml.parsers.DocumentBuilderFactory;
55
import javax.xml.parsers.ParserConfigurationException;
56
import javax.xml.transform.OutputKeys;
57
import javax.xml.transform.Result;
58
import javax.xml.transform.Source;
59
import javax.xml.transform.Transformer;
60
import javax.xml.transform.TransformerException;
61
import javax.xml.transform.TransformerFactory;
62
import javax.xml.transform.dom.DOMSource;
63
import javax.xml.transform.stream.StreamResult;
64

  
65
import org.apache.log4j.Logger;
66
import org.apache.xpath.XPathAPI;
67
import org.w3c.dom.DOMImplementation;
68
import org.w3c.dom.Document;
69
import org.w3c.dom.Element;
70
import org.w3c.dom.NodeList;
71
import org.xml.sax.InputSource;
72
import org.xml.sax.SAXException;
73

  
74

  
75
/**
76
 * HarvesterVerb is the parent class for each of the OAI verbs.
77
 * 
78
 * @author Duane Costa, University of New Mexico, LTER Network Office
79
 * @author Jeffrey A. Young, OCLC Online Computer Library Center
80
 */
81
public abstract class HarvesterVerb {
82
  
83
  /* Class variables */
84

  
85
  private static Logger logger = Logger.getLogger(HarvesterVerb.class);
86
  
87
  public static final String SCHEMA_LOCATION_V2_0 = 
88
    "http://www.openarchives.org/OAI/2.0/ " +
89
    "http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd";
90
  
91
  private static HashMap<Thread, DocumentBuilder> builderMap = 
92
                                         new HashMap<Thread, DocumentBuilder>();
93
  private static DocumentBuilderFactory documentBuilderFactory = null;
94
  private static Element namespaceElement = null;
95
  private static TransformerFactory transformerFactory = 
96
                                               TransformerFactory.newInstance();
97

  
98
  
99
  /* Instance variables */
100
  
101
  private Document document = null;
102
  private String schemaLocation = null;
103
  private String requestURL = null;
104
  
105
  
106
  /* Constructors */
107
  
108
  /**
109
   * Mock object creator (for unit testing purposes)
110
   */
111
  public HarvesterVerb() {
112
  }
113

  
114

  
115
  /**
   * Creates a verb for the given OAI request URL. The constructor itself
   * only records the URL; the declared exceptions are part of its signature
   * but nothing in its body throws them.
   * 
   * @param requestURL   the OAI request URL, later returned by 
   *                     getRequestURL()
   * @throws IOException
   * @throws ParserConfigurationException
   * @throws SAXException
   * @throws TransformerException
   */
  public HarvesterVerb(String requestURL) 
          throws IOException, ParserConfigurationException, 
                 SAXException, TransformerException {
    this.requestURL = requestURL;
  }
128

  
129

  
130
  /* Static initialization code */
131
  
132
  static {
133
    try {
134
      /* Load DOM Document */
135
      documentBuilderFactory = DocumentBuilderFactory.newInstance();
136
      documentBuilderFactory.setNamespaceAware(true);
137
      Thread thread = Thread.currentThread();
138
      DocumentBuilder builder = documentBuilderFactory.newDocumentBuilder();
139
      builderMap.put(thread, builder);
140

  
141
      DOMImplementation impl = builder.getDOMImplementation();
142
      
143
      Document namespaceHolder = impl.createDocument(
144
                          "http://www.oclc.org/research/software/oai/harvester",
145
                          "harvester:namespaceHolder", 
146
                          null
147
                                                    );
148
      
149
      namespaceElement = namespaceHolder.getDocumentElement();
150
      
151
      namespaceElement.setAttributeNS(
152
                          "http://www.w3.org/2000/xmlns/",
153
                          "xmlns:harvester",
154
                          "http://www.oclc.org/research/software/oai/harvester"
155
                                     );
156
      
157
      namespaceElement.setAttributeNS(
158
                          "http://www.w3.org/2000/xmlns/",
159
                          "xmlns:xsi", 
160
                          "http://www.w3.org/2001/XMLSchema-instance"
161
                                     );
162
      
163
      namespaceElement.setAttributeNS(
164
                          "http://www.w3.org/2000/xmlns/",
165
                          "xmlns:oai20", 
166
                          "http://www.openarchives.org/OAI/2.0/"
167
                                     );
168
    } 
169
    catch (Exception e) {
170
      e.printStackTrace();
171
    }
172
    
173
  }
174

  
175
  
176
  /* Instance methods */
177

  
178
  /* Primary OAI namespaces */
179

  
180
  /**
181
   * Get the OAI response as a DOM object
182
   * 
183
   * @return the DOM for the OAI response
184
   */
185
  public Document getDocument() {
186
    return document;
187
  }
188

  
189

  
190
  /**
191
   * Get the OAI errors
192
   * 
193
   * @return a NodeList of /oai:OAI-PMH/oai:error elements
194
   * @throws TransformerException
195
   */
196
  public NodeList getErrors() throws TransformerException {
197
    if (SCHEMA_LOCATION_V2_0.equals(getSchemaLocation())) {
198
      return getNodeList("/oai20:OAI-PMH/oai20:error");
199
    } 
200
    else {
201
      return null;
202
    }
203
  }
204

  
205

  
206
  /**
207
   * Get a NodeList containing the nodes in the response DOM for the specified
208
   * xpath
209
   * 
210
   * @param xpath
211
   * @return the NodeList for the xpath into the response DOM
212
   * @throws TransformerException
213
   */
214
  public NodeList getNodeList(String xpath) throws TransformerException {
215
    Document document = getDocument();
216
    return XPathAPI.selectNodeList(document, xpath, namespaceElement);
217
  }
218

  
219

  
220
  /**
221
   * Get the OAI request URL for this response
222
   * 
223
   * @return the OAI request URL as a String
224
   */
225
  public String getRequestURL() {
226
    return requestURL;
227
  }
228

  
229

  
230
  /**
231
   * Get the xsi:schemaLocation for the OAI response
232
   * 
233
   * @return the xsi:schemaLocation value
234
   */
235
  public String getSchemaLocation() {
236
    return schemaLocation;
237
  }
238

  
239

  
240
  /**
241
   * Get the String value for the given XPath location in the response DOM
242
   * 
243
   * @param xpath
244
   * @return a String containing the value of the XPath location.
245
   * @throws TransformerException
246
   */
247
  public String getSingleString(String xpath) throws TransformerException {
248
    Document document = getDocument();
249
    org.apache.xpath.objects.XObject xobject;
250
    
251
    xobject = XPathAPI.eval(document, xpath, namespaceElement);
252
    String str = xobject.str();
253
    
254
    return str;
255
  }
256

  
257

  
258
  /**
259
   * Performs the OAI request for this OAI-PMH verb
260
   * 
261
   * @throws IOException
262
   * @throws ParserConfigurationException
263
   * @throws SAXException
264
   * @throws TransformerException
265
   */
266
  public void runVerb() 
267
          throws IOException, ParserConfigurationException, 
268
                 SAXException, TransformerException {
269
    //logger.debug("requestURL=" + requestURL);
270
    InputStream in = null;
271
    URL url = new URL(requestURL);
272
    HttpURLConnection con = null;
273
    int responseCode = 0;
274
    
275
    do {
276
      con = (HttpURLConnection) url.openConnection();
277
      con.setRequestProperty("User-Agent", "OAIHarvester/2.0");
278
      con.setRequestProperty("Accept-Encoding", "compress, gzip, identify");
279
      
280
      try {
281
        responseCode = con.getResponseCode();
282
        //logger.debug("responseCode=" + responseCode);
283
      } 
284
      catch (FileNotFoundException e) {
285
        // assume it's a 503 response
286
        logger.info(requestURL, e);
287
        responseCode = HttpURLConnection.HTTP_UNAVAILABLE;
288
      }
289

  
290
      if (responseCode == HttpURLConnection.HTTP_UNAVAILABLE) {
291
        long retrySeconds = con.getHeaderFieldInt("Retry-After", -1);
292
        
293
        if (retrySeconds == -1) {
294
          long now = (new Date()).getTime();
295
          long retryDate = con.getHeaderFieldDate("Retry-After", now);
296
          retrySeconds = retryDate - now;
297
        }
298
        
299
        if (retrySeconds == 0) { // Apparently, it's a bad URL
300
          throw new FileNotFoundException("Bad URL?");
301
        }
302
        
303
        System.err.println("Server response: Retry-After=" + retrySeconds);
304
        
305
        if (retrySeconds > 0) {
306
          try {
307
            Thread.sleep(retrySeconds * 1000);
308
          } 
309
          catch (InterruptedException ex) {
310
            ex.printStackTrace();
311
          }
312
        }
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff