Project

General

Profile

/**
 *  '$RCSfile$'
 *  Copyright: 2004 University of New Mexico and the
 *                  Regents of the University of California
 *
 *   '$Author$'
 *     '$Date$'
 * '$Revision$'
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
24
25
package edu.ucsb.nceas.metacat.harvesterClient;
26
27 2086 costa
import com.oreilly.servlet.MailMessage;
28 2031 costa
import java.io.FileNotFoundException;
29
import java.io.IOException;
30
import java.io.InputStream;
31
import java.io.InputStreamReader;
32 2086 costa
import java.io.PrintStream;
33 2031 costa
import java.io.Reader;
34
import java.net.MalformedURLException;
35
import java.net.URL;
36 2022 costa
import java.sql.Connection;
37
import java.sql.SQLException;
38
import java.sql.Statement;
39 2031 costa
import java.text.DateFormat;
40
import java.text.ParseException;
41
import java.text.SimpleDateFormat;
42
import java.util.ArrayList;
43
import java.util.Date;
44
import javax.xml.parsers.ParserConfigurationException;
45
import org.xml.sax.Attributes;
46
import org.xml.sax.ContentHandler;
47
import org.xml.sax.ErrorHandler;
48
import org.xml.sax.InputSource;
49
import org.xml.sax.SAXException;
50
import org.xml.sax.SAXParseException;
51
import org.xml.sax.XMLReader;
52
import org.xml.sax.helpers.DefaultHandler;
53
import org.xml.sax.helpers.XMLReaderFactory;
54 2022 costa
55 2031 costa
import edu.ucsb.nceas.metacat.client.Metacat;
56
import edu.ucsb.nceas.metacat.client.MetacatException;
57
import edu.ucsb.nceas.metacat.client.MetacatInaccessibleException;
58 2022 costa
59
60
/**
61
 * HarvestSiteSchedule manages a single entry in the HARVEST_SITE_SCHEDULE
62
 * table, determining when and how to harvest the documents for a given site.
63
 *
64
 * @author  costa
65
 */
66
class HarvestSiteSchedule {
67
68
  private String contactEmail;
69
  private String dateLastHarvest;
70
  private String dateNextHarvest;
71
  private long delta;
72
  private String documentListURL;
73
  private Harvester harvester;
74 2031 costa
  private ArrayList harvestDocumentList = new ArrayList();
75 2022 costa
  private String harvestSiteEndTime;
76
  private String harvestSiteStartTime;
77
  private String ldapDN;
78 2031 costa
  private String ldapPwd;
79 2022 costa
  final private long millisecondsPerDay = (1000 * 60 * 60 * 24);
80 2031 costa
  int siteScheduleID;
81 2022 costa
  private String unit;
82
  private int updateFrequency;
83
84
  /**
85
   * Creates a new instance of HarvestSiteSchedule. Initialized with the data
86
   * that was read from a single row in the HARVEST_SITE_SCHEDULE table.
87
   *
88
   * @param harvester       the parent Harvester object
89
   * @param siteScheduleID  the value of the SITE_SCHEDULE_ID field
90
   * @param documentListURL the value of the DOCUMENTLISTURL field
91
   * @param ldapDN          the value of the LDAPDN field
92 2031 costa
   * @param ldapPwd    the value of the LDAPPASSWORD field
93 2022 costa
   * @param dateNextHarvest the value of the DATENEXTHARVEST field
94
   * @param dateLastHarvest the value of the DATELASTHARVEST field
95
   * @param updateFrequency the value of the UPDATEFREQUENCY field
96
   * @param unit            the value of the UNIT field
97
   * @param contactEmail    the value of the CONTACT_EMAIL field
98
   */
99
  public HarvestSiteSchedule(
100
                              Harvester harvester,
101
                              int    siteScheduleID,
102
                              String documentListURL,
103
                              String ldapDN,
104 2031 costa
                              String ldapPwd,
105 2022 costa
                              String dateNextHarvest,
106
                              String dateLastHarvest,
107
                              int    updateFrequency,
108
                              String unit,
109
                              String contactEmail
110
                            )
111
  {
112
    this.harvester = harvester;
113
    this.siteScheduleID = siteScheduleID;
114
    this.documentListURL = documentListURL;
115
    this.ldapDN = ldapDN;
116 2031 costa
    this.ldapPwd = ldapPwd;
117 2022 costa
    this.dateNextHarvest = dateNextHarvest;
118
    this.dateLastHarvest = dateLastHarvest;
119
    this.updateFrequency = updateFrequency;
120
    this.unit = unit;
121
    this.contactEmail = contactEmail;
122
123
    // Calculate the value of delta, the number of milliseconds between the
124
    // last harvest date and the next harvest date.
125
    delta = updateFrequency * millisecondsPerDay;
126
127
    if (unit.equals("weeks")) {
128
      delta *= 7;
129
    }
130
    else if (unit.equals("months")) {
131
      delta *= 30;
132
    }
133
  }
134
135
136
  /**
137 2060 costa
   * Updates the DATELASTHARVEST and DATENEXTHARVEST values of the
138
   * HARVEST_SITE_SCHEDULE table after a harvest operation has completed.
139
   * Calculates the date of the next harvest based on today's date and the
140
   * update frequency.
141 2022 costa
   */
142 2060 costa
  private void dbUpdateHarvestDates() {
143 2031 costa
		Connection conn;
144 2022 costa
    long currentTime;                    // Current time in milliseconds
145
    Date dateNextHarvest;                // Date of next harvest
146
    String lastHarvest;
147
    String nextHarvest;
148
    Date now = new Date();
149
    SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd-MMM-yyyy");
150
		Statement stmt;
151
    long timeNextHarvest;
152
153 2031 costa
    conn = harvester.conn;
154 2022 costa
    now = new Date();
155 2031 costa
    currentTime = now.getTime();
156 2022 costa
    timeNextHarvest = currentTime + delta;
157
    dateNextHarvest = new Date(timeNextHarvest);
158
    nextHarvest = "'" + simpleDateFormat.format(dateNextHarvest) + "'";
159
    lastHarvest = "'" + simpleDateFormat.format(now) + "'";
160
161
		try {
162 2031 costa
			stmt = conn.createStatement();
163
			stmt.executeUpdate("UPDATE HARVEST_SITE_SCHEDULE SET DATENEXTHARVEST = " +
164
                         nextHarvest +
165
                         " WHERE SITE_SCHEDULE_ID = " +
166
                         siteScheduleID);
167
			stmt.executeUpdate("UPDATE HARVEST_SITE_SCHEDULE SET DATELASTHARVEST = " +
168
                         lastHarvest +
169
                         " WHERE SITE_SCHEDULE_ID = " +
170
                         siteScheduleID);
171 2022 costa
			stmt.close();
172
		}
173
    catch(SQLException e) {
174 2031 costa
			System.out.println("SQLException: " + e.getMessage());
175 2022 costa
		}
176
  }
177
178
179
  /**
180
   * Boolean to determine whether this site is currently due for its next
181
   * harvest.
182
   *
183
   * @retrun     true if due for harvest, otherwise false
184
   */
185
  private boolean dueForHarvest() {
186
    boolean dueForHarvest = false;
187
    DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S");
188
    Date now = new Date();
189
    Date dnh;                          // Date of next harvest
190
    long currentTime = now.getTime();  // Current time in milliseconds
191
    long timeNextHarvest = 0;
192
193
    try {
194 2036 costa
      dnh = dateFormat.parse(dateNextHarvest);
195
      timeNextHarvest = dnh.getTime();
196 2022 costa
197
      if (timeNextHarvest < currentTime) {
198
        dueForHarvest = true;
199 2031 costa
        System.out.println("Due for harvest: " + documentListURL);
200 2022 costa
      }
201
      else {
202 2031 costa
        System.out.println("Not due for harvest: " + documentListURL);
203 2022 costa
      }
204
    }
205
    catch (ParseException e) {
206 2031 costa
      System.out.println("Error parsing date: " + e.getMessage());
207 2022 costa
    }
208
209 2031 costa
    return dueForHarvest;
210 2022 costa
  }
211
212
213
  /**
214
   * Harvests each document in the site document list.
215
   *
216
   * @throws SAXException
217
   * @throws IOException
218
   * @throws ParserConfigurationException
219
   */
220
  public void harvestDocumentList() {
221
    HarvestDocument harvestDocument;
222 2036 costa
    boolean success;
223 2022 costa
224
    if (dueForHarvest()) {
225
      try {
226 2036 costa
        success = parseDocumentList();
227
228
        /* If the document list was validated, then proceed with harvesting
229
         * the documents
230
         */
231
        if (success) {
232
          metacatLogin();
233 2022 costa
234 2036 costa
          for (int i = 0; i < harvestDocumentList.size(); i++) {
235
            harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
236 2022 costa
237 2036 costa
            if (harvestDocument != null) {
238
              harvestDocument.harvestDocument();
239
            }
240 2022 costa
          }
241 2036 costa
242
          metacatLogout();
243 2060 costa
          dbUpdateHarvestDates();  // Update the schedule
244 2022 costa
        }
245
      }
246
      catch (ParserConfigurationException e) {
247 2031 costa
        System.out.println("ParserConfigurationException: " + e.getMessage());
248 2022 costa
      }
249
250
      reportToSite();
251
    }
252
  }
253
254
255
  /**
256 2031 costa
   * Login to Metacat using the ldapDN and ldapPwd
257 2022 costa
   */
258
  private void metacatLogin() {
259
    Metacat metacat = harvester.metacat;
260
261
    if (harvester.connectToMetacat()) {
262
      try {
263
        System.out.println("Logging in to Metacat: " + ldapDN);
264 2031 costa
        metacat.login(ldapDN, ldapPwd);
265 2022 costa
        //System.out.println("Metacat login response: " + response);
266
        //sessionId = metacat.getSessionId();
267
        //System.out.println("Session ID: " + sessionId);
268
      }
269
      catch (MetacatInaccessibleException e) {
270
        System.out.println("Metacat login failed." + e.getMessage());
271
      }
272
      catch (Exception e) {
273
        System.out.println("Metacat login failed." + e.getMessage());
274
      }
275 2031 costa
    }
276 2022 costa
  }
277
278
279
  /**
280
   * Logout from Metacat
281
   */
282
  private void metacatLogout() {
283
    Metacat metacat = harvester.metacat;
284
285
    if (harvester.connectToMetacat()) {
286
      try {
287
        // Log out from the Metacat session
288
        System.out.println("Logging out from Metacat");
289
        metacat.logout();
290
      }
291
      catch (MetacatInaccessibleException e) {
292
        System.out.println("Metacat inaccessible: " + e.getMessage());
293
      }
294
      catch (MetacatException e) {
295
        System.out.println("Metacat exception: " + e.getMessage());
296
      }
297
    }
298
  }
299
300
301
  /**
302
   * Parse the site document list to find out which documents to harvest.
303 2036 costa
   *
304
   * @return  true if successful, otherwise false
305 2022 costa
   */
306 2036 costa
  private boolean parseDocumentList()
307 2031 costa
          throws ParserConfigurationException {
308
    DocumentListHandler documentListHandler = new DocumentListHandler();
309
    InputStream inputStream;
310
    InputStreamReader inputStreamReader;
311 2126 costa
    String schemaLocation =
312
    "eml://ecoinformatics.org/harvestList ../../lib/harvester/harvestList.xsd";
313 2036 costa
    boolean success = false;
314 2031 costa
    URL url;
315
316
    try {
317
      url = new URL(documentListURL);
318
      inputStream = url.openStream();
319 2060 costa
      harvester.addLogEntry(0,
320
                            "Retrieved: " + documentListURL,
321
                            "GetDocListSuccess",
322
                            siteScheduleID,
323
                            null,
324
                            "");
325 2031 costa
      inputStreamReader = new InputStreamReader(inputStream);
326
      documentListHandler.runParser(inputStreamReader, schemaLocation);
327 2060 costa
      harvester.addLogEntry(0,
328
                            "Validated: " + documentListURL,
329
                            "ValidateDocListSuccess",
330
                            siteScheduleID,
331
                            null,
332
                            "");
333 2036 costa
      success = true;
334 2031 costa
    }
335
    catch (MalformedURLException e){
336
      harvester.addLogEntry(1, "MalformedURLException: " + e.getMessage(),
337
                            "GetDocListError", siteScheduleID, null, "");
338
    }
339
    catch (FileNotFoundException e) {
340
      harvester.addLogEntry(1, "FileNotFoundException: " + e.getMessage(),
341
                            "GetDocListError", siteScheduleID, null, "");
342
    }
343
    catch (SAXException e) {
344
      harvester.addLogEntry(1, "SAXException: " + e.getMessage(),
345
                            "ValidateDocListError", siteScheduleID, null, "");
346
    }
347
    catch (ClassNotFoundException e) {
348
      harvester.addLogEntry(1, "ClassNotFoundException: " + e.getMessage(),
349
                            "ValidateDocListError", siteScheduleID, null, "");
350
    }
351
    catch (IOException e) {
352
      harvester.addLogEntry(1, "IOException: " + e.getMessage(),
353
                            "GetDocListError", siteScheduleID, null, "");
354
    }
355 2036 costa
356
    return success;
357 2022 costa
  }
358
359
360
  /**
361
   * Prints the data that is stored in this HarvestSiteSchedule object.
362 2086 costa
   *
363
   * @param out   the PrintStream to write to
364 2022 costa
   */
365 2086 costa
  void printOutput(PrintStream out) {
366
    out.println("* siteScheduleID:       " + siteScheduleID);
367
    out.println("* documentListURL:      " + documentListURL);
368
    out.println("* ldapDN:               " + ldapDN);
369
    out.println("* dateNextHarvest:      " + dateNextHarvest);
370
    out.println("* dateLastHarvest:      " + dateLastHarvest);
371
    out.println("* updateFrequency:      " + updateFrequency);
372
    out.println("* unit:                 " + unit);
373
    out.println("* contactEmail:         " + contactEmail);
374 2022 costa
  }
375
376
377
  /**
378 2105 costa
   * Sends a report to the site summarizing the results of the harvest at
379
   * that site.
380 2022 costa
   */
381
  void reportToSite() {
382 2086 costa
    PrintStream body;
383 2105 costa
    String from = harvester.harvesterAdministrator;
384
    String maxCodeLevel = "info";
385 2086 costa
    MailMessage msg;
386 2105 costa
    int nErrors = 0;
387 2108 costa
    String subject = "Report from Metacat Harvester: " + harvester.timestamp;
388 2086 costa
    String to = contactEmail;
389
390
    if (!to.equals("")) {
391
      System.out.println("Sending report to siteScheduleID=" + siteScheduleID +
392
                         " at address: " + contactEmail);
393
      try {
394 2105 costa
        msg = new MailMessage(harvester.smtpServer);
395 2086 costa
        msg.from(from);
396
        msg.to(to);
397
        msg.setSubject(subject);
398
        body = msg.getPrintStream();
399 2105 costa
        harvester.printHarvestLog(body, maxCodeLevel, siteScheduleID);
400
        msg.sendAndClose();
401 2086 costa
      }
402
      catch (IOException e) {
403
        System.out.println("There was a problem sending email to " + to);
404
        System.out.println("IOException: " + e.getMessage());
405
      }
406
    }
407 2022 costa
  }
408
409
410
  /**
411
   * This inner class extends DefaultHandler. It parses the document list,
412
   * creating a new HarvestDocument object every time it finds a </Document>
413
   * end tag.
414
   */
415 2031 costa
  class DocumentListHandler extends DefaultHandler implements ErrorHandler {
416 2022 costa
417
    public String scope;
418
    public int identifier;
419 2036 costa
    public String identifierString;
420
    public String documentType;
421 2022 costa
    public int revision;
422 2036 costa
    public String revisionString;
423 2022 costa
    public String documentURL;
424
    private String currentQname;
425 2031 costa
    public final static String DEFAULT_PARSER =
426
           "org.apache.xerces.parsers.SAXParser";
427
    private boolean schemaValidate = true;
428 2022 costa
429
430 2031 costa
	  /**
431
     * This method is called for any plain text within an element.
432
     * It parses the value for any of the following elements:
433
     * <scope>, <identifier>, <revision>, <documentType>, <documentURL>
434
     *
435
     * @param ch          the character array holding the parsed text
436
     * @param start       the start index
437
     * @param length      the text length
438
     *
439 2022 costa
     */
440 2031 costa
    public void characters (char ch[], int start, int length) {
441
      String s = new String(ch, start, length);
442
443
      if (length > 0) {
444
        if (currentQname.equals("scope")) {
445 2036 costa
          scope += s;
446 2031 costa
        }
447
        else if (currentQname.equals("identifier")) {
448 2036 costa
          identifierString += s;
449 2031 costa
        }
450
        else if (currentQname.equals("revision")) {
451 2036 costa
          revisionString += s;
452 2031 costa
        }
453
        else if (currentQname.equals("documentType")) {
454 2036 costa
          documentType += s;
455 2031 costa
        }
456
        else if (currentQname.equals("documentURL")) {
457 2036 costa
          documentURL += s;
458 2031 costa
        }
459
      }
460 2022 costa
    }
461
462
463
    /**
464
     * Handles an end-of-document event.
465
     */
466
    public void endDocument () {
467
      System.out.println("Finished parsing " + documentListURL);
468
    }
469
470
471
    /**
472
     * Handles an end-of-element event. If the end tag is </Document>, then
473
     * creates a new HarvestDocument object and pushes it to the document
474
     * list.
475
     *
476
     * @param uri
477
     * @param localname
478
     * @param qname
479
     */
480
    public void endElement(String uri,
481
                           String localname,
482
                           String qname) {
483
484
      HarvestDocument harvestDocument;
485
486 2036 costa
      if (qname.equals("identifier")) {
487
        identifier = Integer.parseInt(identifierString);
488
      }
489
      else if (qname.equals("revision")) {
490
        revision = Integer.parseInt(revisionString);
491
      }
492
      else if (qname.equals("document")) {
493 2022 costa
        harvestDocument = new HarvestDocument(
494
                                              harvester,
495
                                              HarvestSiteSchedule.this,
496
                                              scope,
497
                                              identifier,
498
                                              revision,
499
                                              documentType,
500
                                              documentURL
501
                                             );
502 2031 costa
        harvestDocumentList.add(harvestDocument);
503 2022 costa
      }
504 2036 costa
505
      currentQname = "";
506 2022 costa
    }
507
508
509 2031 costa
    /**
510
     * Method for handling errors during a parse
511
     *
512
     * @param exception         The parsing error
513
     * @exception SAXException  Description of Exception
514 2022 costa
     */
515 2031 costa
     public void error(SAXParseException e) throws SAXParseException {
516
        System.out.println("SAXParseException: " + e.getMessage());
517
        throw e;
518
    }
519
520
521
    /**
522
     * Run the validating parser
523
     *
524
     * @param xml             the xml stream to be validated
525
     * @schemaLocation        relative path the to XML Schema file, e.g. "."
526
     * @exception IOException thrown when test files can't be opened
527
     * @exception ClassNotFoundException thrown when SAX Parser class not found
528
     * @exception SAXException
529
     * @exception SAXParserException
530
     */
531
    public void runParser(Reader xml, String schemaLocation)
532
           throws IOException, ClassNotFoundException,
533
                  SAXException, SAXParseException {
534
535
      // Get an instance of the parser
536
      XMLReader parser;
537
538
      parser = XMLReaderFactory.createXMLReader(DEFAULT_PARSER);
539
      // Set Handlers in the parser
540
      parser.setContentHandler((ContentHandler)this);
541
      parser.setErrorHandler((ErrorHandler)this);
542
      parser.setFeature("http://xml.org/sax/features/namespaces", true);
543
      parser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
544
      parser.setFeature("http://xml.org/sax/features/validation", true);
545
      parser.setProperty(
546
              "http://apache.org/xml/properties/schema/external-schemaLocation",
547
              schemaLocation);
548
549
      if (schemaValidate) {
550
        parser.setFeature("http://apache.org/xml/features/validation/schema",
551
                          true);
552 2022 costa
      }
553 2031 costa
554
      // Parse the document
555
      parser.parse(new InputSource(xml));
556 2022 costa
    }
557 2031 costa
    /**
558
     * Handles a start-of-document event.
559
     */
560
    public void startDocument () {
561
      System.out.println("Started parsing " + documentListURL);
562
    }
563 2022 costa
564 2031 costa
565
    /**
566
     * Handles a start-of-element event.
567
     *
568
     * @param uri
569
     * @param localname
570
     * @param qname
571
     * @param attributes
572
     */
573
    public void startElement(String uri,
574
                             String localname,
575
                             String qname,
576
                             Attributes attributes) {
577
578
      currentQname = qname;
579 2036 costa
580
      if (qname.equals("scope")) {
581
        scope = "";
582
      }
583
      else if (qname.equals("identifier")) {
584
        identifierString = "";
585
      }
586
      else if (qname.equals("revision")) {
587
        revisionString = "";
588
      }
589
      else if (qname.equals("documentType")) {
590
        documentType = "";
591
      }
592
      else if (qname.equals("documentURL")) {
593
        documentURL = "";
594
      }
595 2031 costa
    }
596 2022 costa
  }
597
}