Project

General

Profile

1 2094 jones
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2004 University of New Mexico and the
4
 *                  Regents of the University of California
5 2022 costa
 *
6 2094 jones
 *   '$Author$'
7
 *     '$Date$'
8
 * '$Revision$'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 2022 costa
 */
24
25
package edu.ucsb.nceas.metacat.harvesterClient;
26
27 2086 costa
import com.oreilly.servlet.MailMessage;
28 2031 costa
import java.io.FileNotFoundException;
29
import java.io.IOException;
30
import java.io.InputStream;
31
import java.io.InputStreamReader;
32 2086 costa
import java.io.PrintStream;
33 2031 costa
import java.io.Reader;
34
import java.net.MalformedURLException;
35
import java.net.URL;
36 2022 costa
import java.sql.Connection;
37
import java.sql.SQLException;
38
import java.sql.Statement;
39 2031 costa
import java.text.DateFormat;
40
import java.text.ParseException;
41
import java.text.SimpleDateFormat;
42
import java.util.ArrayList;
43
import java.util.Date;
44
import javax.xml.parsers.ParserConfigurationException;
45
import org.xml.sax.Attributes;
46
import org.xml.sax.ContentHandler;
47
import org.xml.sax.ErrorHandler;
48
import org.xml.sax.InputSource;
49
import org.xml.sax.SAXException;
50
import org.xml.sax.SAXParseException;
51
import org.xml.sax.XMLReader;
52
import org.xml.sax.helpers.DefaultHandler;
53
import org.xml.sax.helpers.XMLReaderFactory;
54 2022 costa
55 2031 costa
import edu.ucsb.nceas.metacat.client.Metacat;
56
import edu.ucsb.nceas.metacat.client.MetacatException;
57
import edu.ucsb.nceas.metacat.client.MetacatInaccessibleException;
58 2022 costa
59
60
/**
61
 * HarvestSiteSchedule manages a single entry in the HARVEST_SITE_SCHEDULE
62
 * table, determining when and how to harvest the documents for a given site.
63
 *
64
 * @author  costa
65
 */
66
class HarvestSiteSchedule {
67
68
  private String contactEmail;
69
  private String dateLastHarvest;
70
  private String dateNextHarvest;
71
  private long delta;
72
  private String documentListURL;
73
  private Harvester harvester;
74 2031 costa
  private ArrayList harvestDocumentList = new ArrayList();
75 2022 costa
  private String harvestSiteEndTime;
76
  private String harvestSiteStartTime;
77
  private String ldapDN;
78 2031 costa
  private String ldapPwd;
79 2022 costa
  final private long millisecondsPerDay = (1000 * 60 * 60 * 24);
80 2031 costa
  int siteScheduleID;
81 2022 costa
  private String unit;
82
  private int updateFrequency;
83
84
  /**
   * Creates a new instance of HarvestSiteSchedule. Initialized with the data
   * that was read from a single row in the HARVEST_SITE_SCHEDULE table.
   * Also computes delta, the interval between harvests in milliseconds:
   * updateFrequency days, multiplied by 7 when unit is "weeks" and by 30
   * when unit is "months". Any other unit value (including null) leaves the
   * interval expressed in days.
   *
   * @param harvester       the parent Harvester object
   * @param siteScheduleID  the value of the SITE_SCHEDULE_ID field
   * @param documentListURL the value of the DOCUMENTLISTURL field
   * @param ldapDN          the value of the LDAPDN field
   * @param ldapPwd         the value of the LDAPPASSWORD field
   * @param dateNextHarvest the value of the DATENEXTHARVEST field
   * @param dateLastHarvest the value of the DATELASTHARVEST field
   * @param updateFrequency the value of the UPDATEFREQUENCY field
   * @param unit            the value of the UNIT field
   * @param contactEmail    the value of the CONTACT_EMAIL field
   */
  public HarvestSiteSchedule(
                              Harvester harvester,
                              int    siteScheduleID,
                              String documentListURL,
                              String ldapDN,
                              String ldapPwd,
                              String dateNextHarvest,
                              String dateLastHarvest,
                              int    updateFrequency,
                              String unit,
                              String contactEmail
                            )
  {
    this.harvester = harvester;
    this.siteScheduleID = siteScheduleID;
    this.documentListURL = documentListURL;
    this.ldapDN = ldapDN;
    this.ldapPwd = ldapPwd;
    this.dateNextHarvest = dateNextHarvest;
    this.dateLastHarvest = dateLastHarvest;
    this.updateFrequency = updateFrequency;
    this.unit = unit;
    this.contactEmail = contactEmail;

    // Calculate the value of delta, the number of milliseconds between the
    // last harvest date and the next harvest date. The multiplication is
    // done in long arithmetic because millisecondsPerDay is a long.
    delta = updateFrequency * millisecondsPerDay;

    // Constant-first comparison so that a null unit does not throw a
    // NullPointerException; it simply falls through to the "days" case.
    if ("weeks".equals(unit)) {
      delta *= 7;
    }
    else if ("months".equals(unit)) {
      delta *= 30;   // a month is approximated as 30 days
    }
  }
134
135
136
  /**
137 2060 costa
   * Updates the DATELASTHARVEST and DATENEXTHARVEST values of the
138
   * HARVEST_SITE_SCHEDULE table after a harvest operation has completed.
139
   * Calculates the date of the next harvest based on today's date and the
140
   * update frequency.
141 2022 costa
   */
142 2060 costa
  private void dbUpdateHarvestDates() {
143 2031 costa
		Connection conn;
144 2022 costa
    long currentTime;                    // Current time in milliseconds
145
    Date dateNextHarvest;                // Date of next harvest
146
    String lastHarvest;
147
    String nextHarvest;
148
    Date now = new Date();
149
    SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd-MMM-yyyy");
150
		Statement stmt;
151
    long timeNextHarvest;
152
153 2031 costa
    conn = harvester.conn;
154 2022 costa
    now = new Date();
155 2031 costa
    currentTime = now.getTime();
156 2022 costa
    timeNextHarvest = currentTime + delta;
157
    dateNextHarvest = new Date(timeNextHarvest);
158
    nextHarvest = "'" + simpleDateFormat.format(dateNextHarvest) + "'";
159
    lastHarvest = "'" + simpleDateFormat.format(now) + "'";
160
161
		try {
162 2031 costa
			stmt = conn.createStatement();
163
			stmt.executeUpdate("UPDATE HARVEST_SITE_SCHEDULE SET DATENEXTHARVEST = " +
164
                         nextHarvest +
165
                         " WHERE SITE_SCHEDULE_ID = " +
166
                         siteScheduleID);
167
			stmt.executeUpdate("UPDATE HARVEST_SITE_SCHEDULE SET DATELASTHARVEST = " +
168
                         lastHarvest +
169
                         " WHERE SITE_SCHEDULE_ID = " +
170
                         siteScheduleID);
171 2022 costa
			stmt.close();
172
		}
173
    catch(SQLException e) {
174 2031 costa
			System.out.println("SQLException: " + e.getMessage());
175 2022 costa
		}
176
  }
177
178
179
  /**
180
   * Boolean to determine whether this site is currently due for its next
181
   * harvest.
182
   *
183
   * @retrun     true if due for harvest, otherwise false
184
   */
185
  private boolean dueForHarvest() {
186
    boolean dueForHarvest = false;
187
    DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S");
188
    Date now = new Date();
189
    Date dnh;                          // Date of next harvest
190
    long currentTime = now.getTime();  // Current time in milliseconds
191
    long timeNextHarvest = 0;
192
193
    try {
194 2036 costa
      dnh = dateFormat.parse(dateNextHarvest);
195
      timeNextHarvest = dnh.getTime();
196 2022 costa
197
      if (timeNextHarvest < currentTime) {
198
        dueForHarvest = true;
199 2031 costa
        System.out.println("Due for harvest: " + documentListURL);
200 2022 costa
      }
201
      else {
202 2031 costa
        System.out.println("Not due for harvest: " + documentListURL);
203 2022 costa
      }
204
    }
205
    catch (ParseException e) {
206 2031 costa
      System.out.println("Error parsing date: " + e.getMessage());
207 2022 costa
    }
208
209 2031 costa
    return dueForHarvest;
210 2022 costa
  }
211
212
213
  /**
214
   * Harvests each document in the site document list.
215
   *
216
   * @throws SAXException
217
   * @throws IOException
218
   * @throws ParserConfigurationException
219
   */
220
  public void harvestDocumentList() {
221
    HarvestDocument harvestDocument;
222 2036 costa
    boolean success;
223 2022 costa
224
    if (dueForHarvest()) {
225
      try {
226 2036 costa
        success = parseDocumentList();
227
228
        /* If the document list was validated, then proceed with harvesting
229
         * the documents
230
         */
231
        if (success) {
232
          metacatLogin();
233 2022 costa
234 2036 costa
          for (int i = 0; i < harvestDocumentList.size(); i++) {
235
            harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
236 2022 costa
237 2036 costa
            if (harvestDocument != null) {
238
              harvestDocument.harvestDocument();
239
            }
240 2022 costa
          }
241 2036 costa
242
          metacatLogout();
243 2060 costa
          dbUpdateHarvestDates();  // Update the schedule
244 2022 costa
        }
245
      }
246
      catch (ParserConfigurationException e) {
247 2031 costa
        System.out.println("ParserConfigurationException: " + e.getMessage());
248 2022 costa
      }
249
250
      reportToSite();
251
    }
252
  }
253
254
255
  /**
256 2031 costa
   * Login to Metacat using the ldapDN and ldapPwd
257 2022 costa
   */
258
  private void metacatLogin() {
259
    Metacat metacat = harvester.metacat;
260
261
    if (harvester.connectToMetacat()) {
262
      try {
263
        System.out.println("Logging in to Metacat: " + ldapDN);
264 2031 costa
        metacat.login(ldapDN, ldapPwd);
265 2022 costa
        //System.out.println("Metacat login response: " + response);
266
        //sessionId = metacat.getSessionId();
267
        //System.out.println("Session ID: " + sessionId);
268
      }
269
      catch (MetacatInaccessibleException e) {
270
        System.out.println("Metacat login failed." + e.getMessage());
271
      }
272
      catch (Exception e) {
273
        System.out.println("Metacat login failed." + e.getMessage());
274
      }
275 2031 costa
    }
276 2022 costa
  }
277
278
279
  /**
280
   * Logout from Metacat
281
   */
282
  private void metacatLogout() {
283
    Metacat metacat = harvester.metacat;
284
285
    if (harvester.connectToMetacat()) {
286
      try {
287
        // Log out from the Metacat session
288
        System.out.println("Logging out from Metacat");
289
        metacat.logout();
290
      }
291
      catch (MetacatInaccessibleException e) {
292
        System.out.println("Metacat inaccessible: " + e.getMessage());
293
      }
294
      catch (MetacatException e) {
295
        System.out.println("Metacat exception: " + e.getMessage());
296
      }
297
    }
298
  }
299
300
301
  /**
302
   * Parse the site document list to find out which documents to harvest.
303 2036 costa
   *
304
   * @return  true if successful, otherwise false
305 2022 costa
   */
306 2036 costa
  private boolean parseDocumentList()
307 2031 costa
          throws ParserConfigurationException {
308
    DocumentListHandler documentListHandler = new DocumentListHandler();
309
    InputStream inputStream;
310
    InputStreamReader inputStreamReader;
311
    String schemaLocation = ".";
312 2036 costa
    boolean success = false;
313 2031 costa
    URL url;
314
315
    try {
316
      url = new URL(documentListURL);
317
      inputStream = url.openStream();
318 2060 costa
      harvester.addLogEntry(0,
319
                            "Retrieved: " + documentListURL,
320
                            "GetDocListSuccess",
321
                            siteScheduleID,
322
                            null,
323
                            "");
324 2031 costa
      inputStreamReader = new InputStreamReader(inputStream);
325
      documentListHandler.runParser(inputStreamReader, schemaLocation);
326 2060 costa
      harvester.addLogEntry(0,
327
                            "Validated: " + documentListURL,
328
                            "ValidateDocListSuccess",
329
                            siteScheduleID,
330
                            null,
331
                            "");
332 2036 costa
      success = true;
333 2031 costa
    }
334
    catch (MalformedURLException e){
335
      harvester.addLogEntry(1, "MalformedURLException: " + e.getMessage(),
336
                            "GetDocListError", siteScheduleID, null, "");
337
    }
338
    catch (FileNotFoundException e) {
339
      harvester.addLogEntry(1, "FileNotFoundException: " + e.getMessage(),
340
                            "GetDocListError", siteScheduleID, null, "");
341
    }
342
    catch (SAXException e) {
343
      harvester.addLogEntry(1, "SAXException: " + e.getMessage(),
344
                            "ValidateDocListError", siteScheduleID, null, "");
345
    }
346
    catch (ClassNotFoundException e) {
347
      harvester.addLogEntry(1, "ClassNotFoundException: " + e.getMessage(),
348
                            "ValidateDocListError", siteScheduleID, null, "");
349
    }
350
    catch (IOException e) {
351
      harvester.addLogEntry(1, "IOException: " + e.getMessage(),
352
                            "GetDocListError", siteScheduleID, null, "");
353
    }
354 2036 costa
355
    return success;
356 2022 costa
  }
357
358
359
  /**
360
   * Prints the data that is stored in this HarvestSiteSchedule object.
361 2086 costa
   *
362
   * @param out   the PrintStream to write to
363 2022 costa
   */
364 2086 costa
  void printOutput(PrintStream out) {
365
    out.println("* siteScheduleID:       " + siteScheduleID);
366
    out.println("* documentListURL:      " + documentListURL);
367
    out.println("* ldapDN:               " + ldapDN);
368
    out.println("* dateNextHarvest:      " + dateNextHarvest);
369
    out.println("* dateLastHarvest:      " + dateLastHarvest);
370
    out.println("* updateFrequency:      " + updateFrequency);
371
    out.println("* unit:                 " + unit);
372
    out.println("* contactEmail:         " + contactEmail);
373 2022 costa
  }
374
375
376
  /**
377 2105 costa
   * Sends a report to the site summarizing the results of the harvest at
378
   * that site.
379 2022 costa
   */
380
  void reportToSite() {
381 2086 costa
    PrintStream body;
382 2105 costa
    String from = harvester.harvesterAdministrator;
383
    String maxCodeLevel = "info";
384 2086 costa
    MailMessage msg;
385 2105 costa
    int nErrors = 0;
386 2108 costa
    String subject = "Report from Metacat Harvester: " + harvester.timestamp;
387 2086 costa
    String to = contactEmail;
388
389
    if (!to.equals("")) {
390
      System.out.println("Sending report to siteScheduleID=" + siteScheduleID +
391
                         " at address: " + contactEmail);
392
      try {
393 2105 costa
        msg = new MailMessage(harvester.smtpServer);
394 2086 costa
        msg.from(from);
395
        msg.to(to);
396
        msg.setSubject(subject);
397
        body = msg.getPrintStream();
398 2105 costa
        harvester.printHarvestLog(body, maxCodeLevel, siteScheduleID);
399
        msg.sendAndClose();
400 2086 costa
      }
401
      catch (IOException e) {
402
        System.out.println("There was a problem sending email to " + to);
403
        System.out.println("IOException: " + e.getMessage());
404
      }
405
    }
406 2022 costa
  }
407
408
409
  /**
410
   * This inner class extends DefaultHandler. It parses the document list,
411
   * creating a new HarvestDocument object every time it finds a </Document>
412
   * end tag.
413
   */
414 2031 costa
  class DocumentListHandler extends DefaultHandler implements ErrorHandler {
415 2022 costa
416
    public String scope;
417
    public int identifier;
418 2036 costa
    public String identifierString;
419
    public String documentType;
420 2022 costa
    public int revision;
421 2036 costa
    public String revisionString;
422 2022 costa
    public String documentURL;
423
    private String currentQname;
424 2031 costa
    public final static String DEFAULT_PARSER =
425
           "org.apache.xerces.parsers.SAXParser";
426
    private boolean schemaValidate = true;
427 2022 costa
428
429 2031 costa
	  /**
430
     * This method is called for any plain text within an element.
431
     * It parses the value for any of the following elements:
432
     * <scope>, <identifier>, <revision>, <documentType>, <documentURL>
433
     *
434
     * @param ch          the character array holding the parsed text
435
     * @param start       the start index
436
     * @param length      the text length
437
     *
438 2022 costa
     */
439 2031 costa
    public void characters (char ch[], int start, int length) {
440
      String s = new String(ch, start, length);
441
442
      if (length > 0) {
443
        if (currentQname.equals("scope")) {
444 2036 costa
          scope += s;
445 2031 costa
        }
446
        else if (currentQname.equals("identifier")) {
447 2036 costa
          identifierString += s;
448 2031 costa
        }
449
        else if (currentQname.equals("revision")) {
450 2036 costa
          revisionString += s;
451 2031 costa
        }
452
        else if (currentQname.equals("documentType")) {
453 2036 costa
          documentType += s;
454 2031 costa
        }
455
        else if (currentQname.equals("documentURL")) {
456 2036 costa
          documentURL += s;
457 2031 costa
        }
458
      }
459 2022 costa
    }
460
461
462
    /**
463
     * Handles an end-of-document event.
464
     */
465
    public void endDocument () {
466
      System.out.println("Finished parsing " + documentListURL);
467
    }
468
469
470
    /**
471
     * Handles an end-of-element event. If the end tag is </Document>, then
472
     * creates a new HarvestDocument object and pushes it to the document
473
     * list.
474
     *
475
     * @param uri
476
     * @param localname
477
     * @param qname
478
     */
479
    public void endElement(String uri,
480
                           String localname,
481
                           String qname) {
482
483
      HarvestDocument harvestDocument;
484
485 2036 costa
      if (qname.equals("identifier")) {
486
        identifier = Integer.parseInt(identifierString);
487
      }
488
      else if (qname.equals("revision")) {
489
        revision = Integer.parseInt(revisionString);
490
      }
491
      else if (qname.equals("document")) {
492 2022 costa
        harvestDocument = new HarvestDocument(
493
                                              harvester,
494
                                              HarvestSiteSchedule.this,
495
                                              scope,
496
                                              identifier,
497
                                              revision,
498
                                              documentType,
499
                                              documentURL
500
                                             );
501 2031 costa
        harvestDocumentList.add(harvestDocument);
502 2022 costa
      }
503 2036 costa
504
      currentQname = "";
505 2022 costa
    }
506
507
508 2031 costa
    /**
509
     * Method for handling errors during a parse
510
     *
511
     * @param exception         The parsing error
512
     * @exception SAXException  Description of Exception
513 2022 costa
     */
514 2031 costa
     public void error(SAXParseException e) throws SAXParseException {
515
        System.out.println("SAXParseException: " + e.getMessage());
516
        throw e;
517
    }
518
519
520
    /**
521
     * Run the validating parser
522
     *
523
     * @param xml             the xml stream to be validated
524
     * @schemaLocation        relative path the to XML Schema file, e.g. "."
525
     * @exception IOException thrown when test files can't be opened
526
     * @exception ClassNotFoundException thrown when SAX Parser class not found
527
     * @exception SAXException
528
     * @exception SAXParserException
529
     */
530
    public void runParser(Reader xml, String schemaLocation)
531
           throws IOException, ClassNotFoundException,
532
                  SAXException, SAXParseException {
533
534
      // Get an instance of the parser
535
      XMLReader parser;
536
537
      parser = XMLReaderFactory.createXMLReader(DEFAULT_PARSER);
538
      // Set Handlers in the parser
539
      parser.setContentHandler((ContentHandler)this);
540
      parser.setErrorHandler((ErrorHandler)this);
541
      parser.setFeature("http://xml.org/sax/features/namespaces", true);
542
      parser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
543
      parser.setFeature("http://xml.org/sax/features/validation", true);
544
      parser.setProperty(
545
              "http://apache.org/xml/properties/schema/external-schemaLocation",
546
              schemaLocation);
547
548
      if (schemaValidate) {
549
        parser.setFeature("http://apache.org/xml/features/validation/schema",
550
                          true);
551 2022 costa
      }
552 2031 costa
553
      // Parse the document
554
      parser.parse(new InputSource(xml));
555 2022 costa
    }
556 2031 costa
    /**
557
     * Handles a start-of-document event.
558
     */
559
    public void startDocument () {
560
      System.out.println("Started parsing " + documentListURL);
561
    }
562 2022 costa
563 2031 costa
564
    /**
565
     * Handles a start-of-element event.
566
     *
567
     * @param uri
568
     * @param localname
569
     * @param qname
570
     * @param attributes
571
     */
572
    public void startElement(String uri,
573
                             String localname,
574
                             String qname,
575
                             Attributes attributes) {
576
577
      currentQname = qname;
578 2036 costa
579
      if (qname.equals("scope")) {
580
        scope = "";
581
      }
582
      else if (qname.equals("identifier")) {
583
        identifierString = "";
584
      }
585
      else if (qname.equals("revision")) {
586
        revisionString = "";
587
      }
588
      else if (qname.equals("documentType")) {
589
        documentType = "";
590
      }
591
      else if (qname.equals("documentURL")) {
592
        documentURL = "";
593
      }
594 2031 costa
    }
595 2022 costa
  }
596
}