Project

General

Profile

1
/**
 *  '$RCSfile$'
 *  Copyright: 2009 University of New Mexico and the
 *                  Regents of the University of California
 *
 *   '$Author: costa $'
 *     '$Date: 2009-07-27 17:47:44 -0400 (Mon, 27 Jul 2009) $'
 * '$Revision: 4999 $'
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Additional Copyright 2006 OCLC, Online Computer Library Center
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
37

    
38
package edu.ucsb.nceas.metacat.oaipmh.harvester;
39

    
40
import java.io.*;
41
import java.lang.NoSuchFieldException;
42
import java.sql.Connection;
43
import java.sql.DriverManager;
44
import java.sql.ResultSet;
45
import java.sql.SQLException;
46
import java.sql.SQLWarning;
47
import java.sql.Statement;
48
import java.util.HashMap;
49
import java.util.StringTokenizer;
50

    
51
import javax.xml.parsers.DocumentBuilder;
52
import javax.xml.parsers.DocumentBuilderFactory;
53
import javax.xml.parsers.ParserConfigurationException;
54
import javax.xml.transform.TransformerException;
55

    
56
import org.apache.log4j.BasicConfigurator;
57
import org.apache.log4j.Logger;
58
import org.w3c.dom.Document;
59
import org.w3c.dom.Element;
60
import org.w3c.dom.Node;
61
import org.w3c.dom.NodeList;
62
import org.w3c.dom.Text;
63
import org.xml.sax.InputSource;
64
import org.xml.sax.SAXException;
65

    
66
import edu.ucsb.nceas.metacat.client.InsufficientKarmaException;
67
import edu.ucsb.nceas.metacat.client.Metacat;
68
import edu.ucsb.nceas.metacat.client.MetacatException;
69
import edu.ucsb.nceas.metacat.client.MetacatFactory;
70
import edu.ucsb.nceas.metacat.client.MetacatInaccessibleException;
71
import edu.ucsb.nceas.metacat.properties.PropertyService;
72
import edu.ucsb.nceas.metacat.shared.ServiceException;
73
import edu.ucsb.nceas.metacat.util.SystemUtil;
74
import edu.ucsb.nceas.utilities.PropertyNotFoundException;
75

    
76

    
77
/**
78
 * Main class for running the OAI-PMH Harvester program
79
 * 
80
 * @author dcosta
81
 *
82
 */
83
public class OaipmhHarvester {
84
  
85
  
86
  /* Class variables */

  /* ---- Constants ---- */

  /* Configuration directory used when none is given on the command line. */
  private static final String METACAT_CONFIG_DIR = "../../build/war/WEB-INF";

  /*
   * Query string that reads the 'docid', 'rev' and 'date_updated' values of
   * every row in Metacat's 'xml_documents' table.
   */
  private static final String METACAT_QUERY =
      "SELECT docid, rev, date_updated FROM xml_documents";

  /* ---- Logging ---- */

  private static Logger logger = Logger.getLogger(OaipmhHarvester.class);
  static {
    BasicConfigurator.configure();
  }

  /* ---- Harvester state ---- */

  /* 'date_updated' value of each Metacat document, keyed by docid. */
  private static HashMap<String, String> metacatDatestamps =
      new HashMap<String, String>();

  /* 'rev' value of each Metacat document, keyed by docid. */
  private static HashMap<String, Integer> metacatRevisions =
      new HashMap<String, Integer>();

  /* Client used to talk to the Metacat servlet; assigned in main(). */
  private static Metacat metacatClient = null;

  /* URL of the Metacat servlet; assigned in main(). */
  private static String metacatURL = null;
107

    
108

    
109
  /* Class methods */
110
  
111
  /**
112
   * Converts a Dryad identifier to a Metacat docid (scope + identifier)
113
   * 
114
   * @param dryadID  The Dryad identifier, e.g.
115
   *                 "oai:dryad-dev.nescent.org:10255/dryad.12"
116
   * @return  Metacat docid, e.g. "10255/dryad.12"
117
   */
118
  private static String docidFromDryadIdentifier(String dryadID) {
119
    String docid = null;
120
    String scopeAndIdentifier = null;
121
    String scope = null;
122
    String identifier = null;  
123
    StringTokenizer stringTokenizer = new StringTokenizer(dryadID, ":");
124
    
125
    String token = null;
126
    int tokenCount = stringTokenizer.countTokens();
127
    int i = 1;    
128
    while (stringTokenizer.hasMoreTokens()) {
129
      token = stringTokenizer.nextToken();
130
      if (i == tokenCount) { scopeAndIdentifier = token; }
131
      i++;
132
    }
133
    
134
    if (scopeAndIdentifier != null) {
135
      stringTokenizer = new StringTokenizer(scopeAndIdentifier, ".");
136
      
137
      tokenCount = stringTokenizer.countTokens();
138
      if (tokenCount == 2) {  
139
        i = 1;
140
        while (stringTokenizer.hasMoreTokens()) {
141
          token = stringTokenizer.nextToken();
142
          if (i == (tokenCount - 1)) { scope = token; }
143
          if (i == tokenCount) { identifier = token; }
144
          i++;
145
        }
146
      }
147
      else {
148
        logger.error("Error parsing Dryad identifier: " + dryadID);
149
      }
150
    }
151
    
152
    if (scope != null && identifier != null) {
153
      scope = scope.replace('/', '-'); // Metacat doesn't allow '/' in docid
154
      docid = scope + "." + identifier;
155
    }
156
    
157
    return docid;
158
  }
159
  
160
  
161
  /**
162
   * Converts an OAI-PMH identifier to a Metacat docid (scope + identifier)
163
   * 
164
   * @param   identifier    the OAI-PMH identifier
165
   * @return  docid         Metacat docid
166
   */
167
  private static String docidFromIdentifier(String identifier) {
168
    String docid = null;
169
    
170
    /*
171
     * Call the appropriate method to convert identifier to a Metacat docid.
172
     */
173
    if (identifier != null) {
174
      /*
175
       * Check for LSID syntax.
176
       */
177
      if (identifier.startsWith("urn:lsid:")) {
178
        docid = docidFromLSID(identifier);
179
      }
180
      /* Dryad identifier: http://hdl.handle.net/10255/dryad.66
181
       * Equivalent Metacat identifier: 10255-dryad.66.1
182
       */
183
      else if (identifier.contains("/dryad.")) {
184
        docid = docidFromDryadIdentifier(identifier);
185
      }
186
    }
187
    
188
    return docid;
189
  }
190
  
191
  
192
  /**
193
   * Converts an LSID identifier to a Metacat docid (scope + identifier)
194
   * 
195
   * @param lsidIdentifier  The LSID identifier, e.g.
196
   *                        "urn:lsid:knb.ecoinformatics.org:knb-lter-sgs:6"
197
   * @return  Metacat docid, e.g. "knb-lter-sgs.6"
198
   */
199
  private static String docidFromLSID(String lsidIdentifier) {
200
    String docid = null;
201
    String scope = null;
202
    String identifier = null;  
203
    StringTokenizer stringTokenizer = new StringTokenizer(lsidIdentifier, ":");
204
    
205
    int tokenCount = stringTokenizer.countTokens();
206
    int i = 1;    
207
    while (stringTokenizer.hasMoreTokens()) {
208
      String token = stringTokenizer.nextToken();
209
      if (i == (tokenCount - 1)) { scope = token; }
210
      if (i == tokenCount) { identifier = token; }
211
      i++;
212
    }
213
    
214
    if (scope != null && identifier != null) {
215
      docid = scope + "." + identifier;
216
    }
217
    
218
    return docid;
219
  }
220
  
221
  
222
  /**
223
   * Extracts the metadata content from the XML string returned by the GetRecord
224
   * verb.
225
   * 
226
   * @param getRecordString    The XML string returned by the GetRecord verb
227
   *                           operation.
228
   * @return  metadataString   The document string extracted from the GetRecord
229
   *                           XML string.
230
   */
231
  private static String extractMetadata(String getRecordString) {
232
    String metadataString = null;
233
    StringBuffer stringBuffer = 
234
               new StringBuffer("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
235
    
236
    /* The document string is everything between the <metadata> and </metadata>
237
     * tags.
238
     */
239
    int metadataStartIndex = getRecordString.indexOf("<metadata>");
240
    int metadataEndIndex = getRecordString.indexOf("</metadata>");
241

    
242
    if ((metadataStartIndex >= 0) &&
243
        (metadataEndIndex >= 0) &&
244
        (metadataStartIndex < metadataEndIndex)
245
       ) {
246
      int startPosition = metadataStartIndex + "<metadata>".length();
247
      int endPosition = metadataEndIndex;
248
      String docString = getRecordString.substring(startPosition, endPosition);
249
      stringBuffer.append(docString);
250
      stringBuffer.append("\n");
251
      metadataString = stringBuffer.toString();
252
    }
253
    
254
    return metadataString;
255
  }
256
  
257
  
258
  /**
259
   * Returns a connection to the database. Opens the connection if a connection
260
   * has not already been made previously.
261
   * 
262
   * @return  conn  the database Connection object
263
   */
264
  private static Connection getConnection() {
265
    Connection conn = null;
266
    String dbDriver = "";
267
    String defaultDB = null;
268
    String password = null;
269
    String user = null;
270
    SQLWarning warn;
271
    
272
    if (conn == null) {
273
        try {
274
          dbDriver = PropertyService.getProperty("database.driver");
275
          defaultDB = PropertyService.getProperty("database.connectionURI");
276
          password = PropertyService.getProperty("database.password");
277
          user = PropertyService.getProperty("database.user");
278
        } 
279
        catch (PropertyNotFoundException pnfe) {
280
          logger.error("Can't find database connection property " + pnfe);
281
          System.exit(1);
282
        }
283

    
284
      // Load the jdbc driver
285
      try {
286
        Class.forName(dbDriver);
287
      }
288
      catch (ClassNotFoundException e) {
289
        logger.error("Can't load driver " + e);
290
        System.exit(1);
291
      } 
292

    
293
      // Make the database connection
294
      try {
295
        conn = DriverManager.getConnection(defaultDB, user, password);
296

    
297
        // If a SQLWarning object is available, print its warning(s).
298
        // There may be multiple warnings chained.
299
        warn = conn.getWarnings();
300
      
301
        if (warn != null) {
302
          while (warn != null) {
303
            logger.warn("SQLState: " + warn.getSQLState());
304
            logger.warn("Message:  " + warn.getMessage());
305
            logger.warn("Vendor: " + warn.getErrorCode());
306
            warn = warn.getNextWarning();
307
          }
308
        }
309
      }
310
      catch (SQLException e) {
311
        logger.error("Database access failed " + e);
312
        System.exit(1);
313
      }
314
    }
315
    
316
    return conn;
317
  }
318

    
319

    
320
  /**
321
   * Parses command line options and packages them into a HashMap.
322
   *  
323
   * @param   args     array of command-line strings
324
   * @return  options  HashMap of option/value pairs
325
   */
326
  private static HashMap<String, String> getOptions(String[] args) {
327
    HashMap<String, String> options = new HashMap<String, String>();
328
    boolean foundDN = false;
329
    boolean foundPassword = false;
330
        
331
    for (int i=0; i<args.length; ++i) {
332
      if (args[i].charAt(0) != '-') {
333
        options.put("baseURL", args[i]);
334
      } 
335
      else if (i + 1 < args.length) {
336
        if (args[i].equals("-dn")) { foundDN = true; }
337
        if (args[i].equals("-password")) { foundPassword = true; }
338
        options.put(args[i], args[++i]);
339
      }
340
      else {
341
        throw new IllegalArgumentException();
342
      }
343
    }
344
    
345
    // Check for required command-line options "-dn" and "-password"
346
    if (!foundDN || !foundPassword) { throw new IllegalArgumentException(); }
347
    
348
    return options;
349
  }
350
  
351
  
352
  /**
353
   * Boolean to determine whether the content returned from the GetRecord verb
354
   * indicates a deleted document.
355
   * 
356
   * @param    getRecordString    the content returned by the GetRecord verb
357
   * @return   true if this is a deleted record, else false
358
   */
359
  private static boolean isDeletedRecord(String getRecordString) {
360
    boolean isDeleted = false;
361
    final String DELETED_FLAG_1 = "status=\"deleted\"";
362
    final String DELETED_FLAG_2 = "status='deleted'";
363
    
364
    if (getRecordString != null) {
365
      if ((getRecordString.contains(DELETED_FLAG_1) ||
366
           getRecordString.contains(DELETED_FLAG_2)
367
          ) &&
368
          !getRecordString.contains("<metadata>")
369
         ) {
370
        isDeleted = true;
371
      }
372
    }
373
    
374
    return isDeleted;
375
  }
376

    
377
  
378
  /**
379
   * Load datestamps for all Metacat documents. This will be used to determine
380
   * whether the document in the OAI-PMH repository is newer than the copy
381
   * in Metacat. If it is newer, the document should be harvested.
382
   */
383
  private static void loadMetacatCatalog() {
384
    try {
385
      Connection conn = getConnection();    
386

    
387
      if (conn != null) {
388
        Statement stmt = conn.createStatement();                          
389
        ResultSet rs = stmt.executeQuery(METACAT_QUERY);
390
        while (rs.next()) {
391
          String docid = rs.getString("docid");
392
          String dateUpdated = rs.getDate("date_updated").toString();
393
          int rev = rs.getInt("rev");
394
          Integer revInteger = new Integer(rev);
395
          metacatDatestamps.put(docid, dateUpdated);
396
          metacatRevisions.put(docid, revInteger);
397
        }
398
        stmt.close();   
399
        conn.close();
400
      }
401
    }
402
    catch(SQLException e) {
403
      metacatDatestamps = null;
404
      metacatRevisions = null;
405
      logger.error("SQLException: " + e.getMessage());
406
    }
407
  }
408
    
409
  
410
  /**
411
   * Loads OaipmhHarvester properties from a configuration file. These are
412
   * configuration values that are not specified on the command line, such
413
   * as the database connection values. They are typically stored in the
414
   * 'metacat.properties' file.
415
   * 
416
   * @param   metacatConfigDir   The metacat configuration directory.
417
   *                             Typically, the directory in which the
418
   *                             'metacat.properties' file is found.
419
   */
420
  private static void loadProperties(String metacatConfigDir) {   
421

    
422
    try {
423
        PropertyService.getInstance(metacatConfigDir);
424
    } 
425
    catch (ServiceException e) {
426
      logger.error("Error in loading properties: " + e.getMessage());
427
    }
428
  }
429
  
430
  
431
  /**
432
   * The main() method.
433
   * 
434
   * @param args    
435
   * 
436
   * Command line arguments:
437
   * 
438
   *  -dn distinguished_name    -- LDAP user name of the harvester account
439
   *  -password password        -- LDAP password of the harvester account
440
   *  <-metacatConfigdir dir>   -- Directory where metacat.properties file is
441
   *                               found.
442
   *  <-from date>              -- from date of the harvest documents
443
   *  <-until date>             -- until date of the harvest documents
444
   *  <-metadataPrefix prefix>  -- metadata prefix of the harvest documents,
445
   *                               e.g. 'oai_dc'
446
   *  <-setSpec setName>        -- set specification of the harvest documents
447
   *  baseURL                   -- base URL of the OAI-PMH data provider
448
   *
449
   *  Command options appearing inside angle brackets (<>) are optional.
450
   */
451
  public static void main(String[] args) {
452
    try {	    
453
      HashMap<String, String> options = getOptions(args);
454
      String baseURL = options.get("baseURL");
455
      String dn = options.get("-dn");                 // LDAP distinguished name
456
      String password = options.get("-password");     // LDAP password
457
      String from = (String) options.get("-from");
458
      String until = (String) options.get("-until");
459
      String metadataPrefix = (String) options.get("-metadataPrefix");
460
      String metacatConfigDir = (String) options.get("-metacatConfigDir");
461
      String setSpec = (String) options.get("-setSpec");
462
      
463
      /* Use default values if the values aren't specified on command line */
464
      if (metadataPrefix == null) { metadataPrefix = "oai_dc"; }
465
      if (metacatConfigDir == null) { metacatConfigDir = METACAT_CONFIG_DIR; }
466

    
467
      OaipmhHarvester.loadProperties(metacatConfigDir);
468
      metacatURL = SystemUtil.getServletURL();
469
      metacatClient = MetacatFactory.createMetacatConnection(metacatURL);
470
      OaipmhHarvester.loadMetacatCatalog();
471
      
472
      /* 
473
       * If the Metacat catalog failed to load then we can't continue on.
474
       */
475
      if ((metacatURL != null) && 
476
          (metacatClient != null) && 
477
          (metacatDatestamps != null)
478
         ) {
479
        run(baseURL, dn, password, from, until, metadataPrefix, setSpec); 
480
      }
481
      else {
482
        logger.error("Unable to load document catalog from Metacat database.");
483
      }
484
    }
485
	catch (IllegalArgumentException e) {
486
      logger.error("OaipmhHarvester " +
487
                   "-dn distinguished_name " +
488
                   "-password password " +
489
                   "<-from date> " +
490
                   "<-until date> " +
491
                   "<-metadataPrefix prefix> " +
492
                   "<-setSpec setName> " +
493
                   "baseURL"
494
                  );
495
	}
496
    catch (MetacatInaccessibleException e) {
497
      logger.error("MetacatInaccessibleException:\n" + e.getMessage());
498
    }
499
    catch (PropertyNotFoundException e) {
500
      logger.error("PropertyNotFoundException: " + 
501
             "unable to determine metacat URL from SystemUtil.getServletURL()");
502
    }
503
    catch (IOException e) {
504
      logger.error("Error reading EML document from metacat:\n" + 
505
                   e.getMessage()
506
                  );
507
    }
508
	catch (Exception e) {
509
	  e.printStackTrace();
510
	  System.exit(-1);
511
	}
512
  }
513

    
514
  
515
  /**
516
   * Determines the datestamp for a Metacat document based on the 'date_updated'
517
   * value stored in the Metacat database for a given 'docid' value.
518
   * 
519
   * @param   docid    The metacat docid (scope + revision).
520
   * @return  String representing the 'date_updated' value stored in the Metacat
521
   *          database for this document based on its 'docid' value.
522
   */
523
  private static String metacatDatestamp(String docid) {
524
    String metacatDatestamp = metacatDatestamps.get(docid);
525

    
526
    return metacatDatestamp;
527
  }
528
  
529
  
530
  /**
531
   * Boolean to determine whether Metacat has a document with the specified
532
   * docid.
533
   * 
534
   * @param   docid                   Metacat docid value
535
   * @return  true if Metacat has this docid, else false
536
   */
537
  private static boolean metacatHasDocid(String docid) {
538
    boolean hadDocid = false;
539
    String metacatDatestamp = metacatDatestamp(docid);
540

    
541
    if (metacatDatestamp != null) {
542
      hadDocid = true;                // Metacat has the docid
543
    }
544
    
545
    return hadDocid;
546
  }
547
  
548

    
549
  /**
550
   * Login to Metacat using the ldapDN and ldapPwd
551
   * 
552
   * @param  ldapDN   the LDAP distinguished name, e.g.
553
   *                  "uid=dryad,o=LTER,dc=ecoinformatics,dc=org"
554
   * @param  ldapPwd  the corresponding LDAP password string
555
   * 
556
   * @return  loginSuccess, true if login succeeded, else false
557
   */
558
  private static boolean metacatLogin(String ldapDN, String ldapPwd) {
559
    boolean loginSuccess = false;
560
    
561
    try {
562
      logger.info("Logging in to Metacat: " + ldapDN);
563
      String response = metacatClient.login(ldapDN, ldapPwd);
564
      logger.info("Metacat login response: " + response);
565
      loginSuccess = true;
566
    } 
567
    catch (MetacatInaccessibleException e) {
568
      logger.error("Metacat login failed." + e.getMessage());
569
    } 
570
    catch (Exception e) {
571
      logger.error("Metacat login failed." + e.getMessage());
572
    }
573
    
574
    return loginSuccess;
575
  }
576
  
577
  
578
  /**
579
   * Logout from Metacat
580
   */
581
  private static void metacatLogout() {
582
    try {    
583
      // Log out from the Metacat session
584
      logger.info("Logging out from Metacat");
585
      metacatClient.logout();
586
    }
587
    catch (MetacatInaccessibleException e) {
588
      logger.error("Metacat inaccessible: " + e.getMessage());
589
    }
590
    catch (MetacatException e) {
591
      logger.error("Metacat exception: " + e.getMessage());
592
    }
593
  }
594
 
595

    
596
  /**
597
   * Determines the revision for a Metacat document based on the 'rev'
598
   * value stored in the Metacat database for a given 'docid' value.
599
   * 
600
   * @param   docid    The metacat docid (scope + revision).
601
   * @return  Integer representing the 'rev' value stored in the Metacat
602
   *          database for this document based on its 'docid' value.
603
   */
604
  private static Integer metacatRevision(String docid) {
605
    Integer metacatRevision = metacatRevisions.get(docid);
606

    
607
    return metacatRevision;
608
  }
609
  
610
  
611
  /**
612
   * Process the output of the ListIdentifiers verb. For each identifier
613
   * listed, determine whether the document should be harvested (inserted or
614
   * updated), deleted, or if no action is needed.
615
   * 
616
   * @param baseURL          The base URL of the data provider.
617
   * @param from             Value of 'from' option, a date string or null
618
   * @param until            Value of 'until' option, a date string or null
619
   * @param metadataPrefix   Value of 'metadataPrefix' option, may be null
620
   * @param setSpec          Value of 'setSpec' option, may be null
621
   * @param xmlString        The XML string from ListIdentifiers
622
   * @param principal        Distinguished name of the LDAP account for the
623
   *                         harvester user, 
624
   *                         e.g. "uid=dryad,o=LTER,dc=ecoinformatics,dc=org"
625
   */
626
  private static void processListIdentifiers(String baseURL, 
627
                                             String from, 
628
                                             String until,
629
                                             String metadataPrefix,
630
                                             String setSpec,
631
                                             String xmlString,
632
                                             String principal) {
633
    DocumentBuilderFactory documentBuilderFactory =
634
                                           DocumentBuilderFactory.newInstance();
635
    StringReader stringReader = new StringReader(xmlString);
636
     
637
    try {
638
      DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder();
639
      InputSource inputSource = new InputSource(stringReader);
640
      Document document = documentBuilder.parse(inputSource);
641
      Element rootElement = document.getDocumentElement();
642
      NodeList nodeList = rootElement.getChildNodes();
643
      
644
      for (int i = 0; i < nodeList.getLength(); i++) {
645
        Node child = nodeList.item(i);
646
        
647
        if (child instanceof Element) {
648
          Element childElement = (Element) child;
649

    
650
          if (childElement.getTagName().equals("ListIdentifiers")) {
651
            NodeList listIdentifiersNodeList = childElement.getChildNodes();
652
            
653
            for (int j = 0; j < listIdentifiersNodeList.getLength(); j++) {
654
              Node listIdentifiersNode = listIdentifiersNodeList.item(j);
655
              
656
              if (listIdentifiersNode instanceof Element) {
657
                Element listIdentifiersElement = (Element) listIdentifiersNode;
658

    
659
                if (listIdentifiersElement.getTagName().equals("header")) {
660
                  NodeList headerNodeList = listIdentifiersElement.getChildNodes();
661
                  String identifier = null;
662
                  String datestamp = null;
663
                  
664
                  for (int k = 0; k < headerNodeList.getLength(); k++) {
665
                    Node headerNode = headerNodeList.item(k);
666
                    
667
                    if (headerNode instanceof Element) {
668
                      Element headerElement = (Element) headerNode;
669
                      
670
                      if (headerElement.getTagName().equals("identifier")) {
671
                        Text textNode = (Text) headerElement.getFirstChild();
672
                        identifier = textNode.getData().trim();
673
                      }
674
                      else if (headerElement.getTagName().equals("datestamp")) {
675
                        Text textNode = (Text) headerElement.getFirstChild();
676
                        datestamp = textNode.getData().trim();
677
                      }             
678
                    }
679
                  }
680
                  
681
                  if (identifier != null) {
682
                    String docid = docidFromIdentifier(identifier);
683
                    logger.debug("identifier: " + identifier + 
684
                                 "; docid: " + docid + 
685
                                 "; datestamp: " + datestamp);
686
       
687
                    if (docid != null) { 
688
                      if (shouldHarvestDocument(docid, datestamp)) {                    
689
                        GetRecord getRecord = 
690
                             new GetRecord(baseURL, identifier, metadataPrefix);
691
                        getRecord.runVerb();  // Run the GetRecord verb
692
                        
693
                        NodeList errors = getRecord.getErrors();
694
                        if (errors != null && errors.getLength() > 0) {
695
                          logger.error("Found errors in GetRecord results");
696
                          int length = errors.getLength();
697

    
698
                          for (int l = 0; l < length; ++l) {
699
                            Node item = errors.item(l);
700
                            logger.error(item);
701
                          }
702

    
703
                          logger.error("Error record: " + getRecord.toString());
704
                        }
705
                        else {
706
                          String getRecordString = getRecord.toString();
707
                          boolean isDeleted = isDeletedRecord(getRecordString);
708
                          
709
                          if (isDeleted) {
710
                            logger.info("GetRecord indicates deleted record: " + 
711
                                        docid);
712
                            if (metacatHasDocid(docid)) {
713
                              logger.info(
714
                                        "Deleting " + docid + " from Metacat.");
715
                              String deleteReturnString = null;
716
                              deleteReturnString = metacatClient.delete(docid);
717
                              if (deleteReturnString != null && 
718
                                  !deleteReturnString.equals("")) {
719
                                logger.info(deleteReturnString);
720
                              }
721
                            }                           
722
                          }
723
                          else {
724
                            String metadataString = 
725
                                               extractMetadata(getRecordString);
726
                            uploadToMetacat(docid, datestamp, metadataString, 
727
                                            principal);
728
                          }
729
                        }
730
                      }
731
                      else {
732
                        logger.info(
733
                          "Not harvesting docid '" + docid + 
734
                          "' from the OAI-PMH provider. " +
735
                          "Metacat already has this document at datestamp '" + 
736
                          datestamp + "' or higher.");
737
                      }
738
                    }
739
                    else {
740
                      logger.warn("Unrecognized identifier format: " +
741
                                  identifier);
742
                    }
743
                  }
744
                }             
745
              }
746
            }
747
          }
748
        }
749
      }
750
    }
751
    catch (Exception e) {
752
      logger.error("General exception:\n" + e.getMessage());
753
      e.printStackTrace();
754
    }
755
  }
756
  
757
  
758
  /**
759
   * Runs a OAI-PMH harvest.
760
   * 
761
   * @param baseURL          The base URL of the data provider.
762
   * @param dn               Value of 'dn' option, a LDAP distinguished name,
763
   *                         e.g. "uid=dryad,o=LTER,dc=ecoinformatics,dc=org"
764
   * @param password         Value of 'password' option, a string
765
   * @param from             Value of 'from' option, a date string or null
766
   * @param until            Value of 'until' option, a date string or null
767
   * @param metadataPrefix   Value of 'metadataPrefix' option, may be null
768
   * @param setSpec          Value of 'setSpec' option, may be null
769
   * @throws IOException
770
   * @throws ParserConfigurationException
771
   * @throws SAXException
772
   * @throws TransformerException
773
   * @throws NoSuchFieldException
774
   */
775
  public static void run(String baseURL, String dn, String password, 
776
                         String from, String until,
777
                         String metadataPrefix, String setSpec
778
                        )
779
          throws IOException, ParserConfigurationException, SAXException, 
780
                 TransformerException, NoSuchFieldException 
781
  {
782
    logger.info("Starting OAI-PMH Harvester.");
783
    if ((dn != null) && (password != null)) {
784
      boolean loginSuccess = metacatLogin(dn, password);
785
      
786
      // Terminate harvester execution if login failed
787
      if (!loginSuccess) { 
788
        logger.warn("Terminating OAI-PMH Harvester execution due to login failure.");
789
        return; 
790
      } 
791
    }
792
    else {
793
      logger.error("Distinguished name (-dn) and/or password (-password) " +
794
      		       "were not specified.");
795
      return;
796
    }
797
    
798
    ListIdentifiers listIdentifiers = 
799
             new ListIdentifiers(baseURL, from, until, metadataPrefix, setSpec);
800
    listIdentifiers.runVerb();
801
    
802
    while (listIdentifiers != null) {
803
      NodeList errors = listIdentifiers.getErrors();
804

    
805
      if (errors != null && errors.getLength() > 0) {
806
        logger.error("Found errors in ListIdentifier results");
807
        int length = errors.getLength();
808

    
809
        for (int i = 0; i < length; ++i) {
810
          Node item = errors.item(i);
811
          logger.error(item);
812
        }
813

    
814
        logger.error("Error record: " + listIdentifiers.toString());
815
        break;
816
      }
817

    
818
      String xmlString = listIdentifiers.toString();
819
      processListIdentifiers(baseURL, from, until, metadataPrefix, setSpec,
820
                             xmlString, dn);
821
      String resumptionToken = listIdentifiers.getResumptionToken();
822
      logger.debug("resumptionToken: " + resumptionToken);
823

    
824
      if (resumptionToken == null || resumptionToken.length() == 0) {
825
        listIdentifiers = null;
826
      } 
827
      else {
828
        listIdentifiers = new ListIdentifiers(baseURL, resumptionToken);
829
        listIdentifiers.runVerb();
830
      }
831
    }
832

    
833
    metacatLogout();
834
    logger.info("Harvest completed. Shutting down OAI-PMH Harvester.");
835
  }
836
  
837
  
838
  /**
839
   * Should a document be harvested? Compare the OAI-PMH provider datestamp to 
840
   * the Metacat datestamp (the 'last_updated' date). If the Metacat datestamp 
841
   * is unknown, or if it's less than the OAI-PMH datestamp, then the document
842
   * should be harvested.
843
   *  
844
   * @param docid                   The Metacat docid value.
845
   * @param providerDatestamp       The OAI-PMH provider datestamp.
846
   * @return   true if the document should be harvested into Metacat, else false
847
   */
848
  private static boolean shouldHarvestDocument(String docid, 
849
                                               String providerTimestamp
850
                                              ) {
851
    String providerDatestamp;
852
    boolean shouldHarvest = false;
853
    String metacatDatestamp = metacatDatestamp(docid);
854
 
855
    /*
856
     * Since Metacat stores its 'last_updated' field as a datestamp (no time),
857
     * we need to strip off the timestamp part of the provider timestamp
858
     * before doing a comparison of the Metacat datestamp to the OAI-PMH
859
     * provider datestamp.
860
     */
861
    if (providerTimestamp.contains("T")) {
862
      int tIndex = providerTimestamp.indexOf('T');
863
      providerDatestamp = providerTimestamp.substring(0, tIndex);
864
    }
865
    else {
866
      providerDatestamp = providerTimestamp;
867
    }
868
    
869
    /*
870
     * If we don't have a Metacat datastamp for this document, or if the
871
     * Metacat datestamp is older than the provider datestamp, then we
872
     * should harvest the document.
873
     */
874
    if (metacatDatestamp == null) {
875
      shouldHarvest = true;
876
    }
877
    else if (metacatDatestamp.compareTo(providerDatestamp) < 0) {
878
        shouldHarvest = true;
879
    }
880
    
881
    return shouldHarvest;
882
  }
883
  
884

    
885
  /**
886
   * Insert or update the document to Metacat. If Metacat already has this
887
   * document, increment the 'rev' number by 1 to update it.
888
   * 
889
   * @param   docid           The Metacat docid
890
   * @param   datestamp       The datestamp in the OAI-PMH provider catalog.
891
   * @param   metadataString  The metadata string extracted by the GetRecord 
892
   * @param   principal       The distinguished name of the principal
893
   *                          verb
894
   * @return  true if the upload succeeded, else false.
895
   */
896
  private static boolean uploadToMetacat(String docid,
897
                                         String datestamp,
898
                                         String metadataString,
899
                                         String principal) {
900
    String docidFull = null;
901
    boolean success = true;
902
    String metacatDatestamp = metacatDatestamp(docid);
903
    Integer metacatRevision = metacatRevision(docid);
904
    boolean insert = false;
905
    StringReader stringReader = null;
906
    boolean update = false;
907
    
908
    if (metadataString != null ) {
909
      stringReader = new StringReader(metadataString);
910

    
911
      /* If metacat already has this document, determine the highest revision in
912
       * metacat and report it to the user; else, insert or delete the document 
913
       * into metacat.
914
       */
915
      if (metacatDatestamp == null) {
916
        insert = true;
917
        int newRevision = 1;
918
        docidFull = docid + "." + newRevision;
919
      }
920
      else if (metacatDatestamp.compareTo(datestamp) < 0) {
921
        update = true;
922
        int newRevision = metacatRevision + 1;
923
        docidFull = docid + "." + newRevision;
924
      }
925
      else if (metacatDatestamp.compareTo(datestamp) == 0) {
926
        logger.warn("Attempting to update " + docid + " to datestamp " + 
927
            datestamp + ". Metacat has document at datestamp " +
928
            metacatDatestamp + ".");
929
      }
930
        
931
      if (insert || update) {
932
        String metacatReturnString = "";
933
        String accessReturnString = "";
934
      
935
        try {
936
          if (insert) {
937
            logger.info("Inserting document: " + docidFull);
938
            metacatReturnString = 
939
                            metacatClient.insert(docidFull, stringReader, null);
940
          
941
            /* Add "all" permission for the dataset owner */
942
            String permission = "all";
943
            String permType = "allow";
944
            String permOrder = "allowFirst";
945
            accessReturnString = metacatClient.setAccess(
946
                             docid, principal, permission, permType, permOrder);
947
            if (accessReturnString != null && !accessReturnString.equals("")) {
948
              logger.info(accessReturnString);
949
            }
950
          
951
            /* Add "read" permission for public users */
952
            permission = "read";
953
            accessReturnString = metacatClient.setAccess(
954
                              docid, "public", permission, permType, permOrder);
955

    
956
            if (accessReturnString != null && !accessReturnString.equals("")) {
957
              logger.info(accessReturnString);
958
            }
959
          }
960
          else if (update) {
961
            logger.info("Updating document: " + docidFull);
962
            metacatReturnString = 
963
                            metacatClient.update(docidFull, stringReader, null);
964
          }
965
        
966
          if (metacatReturnString != null && !metacatReturnString.equals("")) {
967
            logger.info(metacatReturnString);
968
          }
969
        }
970
        catch (MetacatInaccessibleException e) {
971
          logger.error("MetacatInaccessibleException: " + e.getMessage());
972
        }
973
        catch (InsufficientKarmaException e) {
974
          logger.error("InsufficientKarmaException: " + e.getMessage());
975
        }
976
        catch (MetacatException e) {
977
          logger.error("MetacatException: " + e.getMessage());
978
        }
979
        catch (IOException e) {
980
          logger.error("IOException: " + e.getMessage());
981
        }
982
      }
983
    }
984
    
985
    return success;
986
  }
987

    
988
}
(8-8/8)