Project

General

Profile

1 2094 jones
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2004 University of New Mexico and the
4
 *                  Regents of the University of California
5 2022 costa
 *
6 2094 jones
 *   '$Author$'
7
 *     '$Date$'
8
 * '$Revision$'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 2022 costa
 */
24
25
package edu.ucsb.nceas.metacat.harvesterClient;
26
27 2086 costa
import com.oreilly.servlet.MailMessage;
28 2031 costa
import java.io.FileNotFoundException;
29
import java.io.IOException;
30
import java.io.InputStream;
31
import java.io.InputStreamReader;
32 2086 costa
import java.io.PrintStream;
33 2031 costa
import java.io.Reader;
34
import java.net.MalformedURLException;
35
import java.net.URL;
36 2022 costa
import java.sql.Connection;
37
import java.sql.SQLException;
38
import java.sql.Statement;
39 2031 costa
import java.text.DateFormat;
40
import java.text.ParseException;
41
import java.text.SimpleDateFormat;
42
import java.util.ArrayList;
43
import java.util.Date;
44
import javax.xml.parsers.ParserConfigurationException;
45
import org.xml.sax.Attributes;
46
import org.xml.sax.ContentHandler;
47
import org.xml.sax.ErrorHandler;
48
import org.xml.sax.InputSource;
49
import org.xml.sax.SAXException;
50
import org.xml.sax.SAXParseException;
51
import org.xml.sax.XMLReader;
52
import org.xml.sax.helpers.DefaultHandler;
53
import org.xml.sax.helpers.XMLReaderFactory;
54 2022 costa
55 2031 costa
import edu.ucsb.nceas.metacat.client.Metacat;
56
import edu.ucsb.nceas.metacat.client.MetacatException;
57
import edu.ucsb.nceas.metacat.client.MetacatInaccessibleException;
58 2022 costa
59
60
/**
61
 * HarvestSiteSchedule manages a single entry in the HARVEST_SITE_SCHEDULE
62
 * table, determining when and how to harvest the documents for a given site.
63
 *
64
 * @author  costa
65
 */
66 2139 costa
public class HarvestSiteSchedule {
67 2022 costa
68
  private String contactEmail;
69
  private String dateLastHarvest;
70
  private String dateNextHarvest;
71
  private long delta;
72
  private String documentListURL;
73
  private Harvester harvester;
74 2031 costa
  private ArrayList harvestDocumentList = new ArrayList();
75 2022 costa
  private String harvestSiteEndTime;
76
  private String harvestSiteStartTime;
77
  private String ldapDN;
78 2031 costa
  private String ldapPwd;
79 2022 costa
  final private long millisecondsPerDay = (1000 * 60 * 60 * 24);
80 2139 costa
  private String schemaLocation =
81
    "eml://ecoinformatics.org/harvestList ../../lib/harvester/harvestList.xsd";
82 2031 costa
  int siteScheduleID;
83 2022 costa
  private String unit;
84
  private int updateFrequency;
85
86
  /**
87
   * Creates a new instance of HarvestSiteSchedule. Initialized with the data
88
   * that was read from a single row in the HARVEST_SITE_SCHEDULE table.
89
   *
90
   * @param harvester       the parent Harvester object
91
   * @param siteScheduleID  the value of the SITE_SCHEDULE_ID field
92
   * @param documentListURL the value of the DOCUMENTLISTURL field
93
   * @param ldapDN          the value of the LDAPDN field
94 2031 costa
   * @param ldapPwd    the value of the LDAPPASSWORD field
95 2022 costa
   * @param dateNextHarvest the value of the DATENEXTHARVEST field
96
   * @param dateLastHarvest the value of the DATELASTHARVEST field
97
   * @param updateFrequency the value of the UPDATEFREQUENCY field
98
   * @param unit            the value of the UNIT field
99
   * @param contactEmail    the value of the CONTACT_EMAIL field
100
   */
101
  public HarvestSiteSchedule(
102
                              Harvester harvester,
103
                              int    siteScheduleID,
104
                              String documentListURL,
105
                              String ldapDN,
106 2031 costa
                              String ldapPwd,
107 2022 costa
                              String dateNextHarvest,
108
                              String dateLastHarvest,
109
                              int    updateFrequency,
110
                              String unit,
111
                              String contactEmail
112
                            )
113
  {
114
    this.harvester = harvester;
115
    this.siteScheduleID = siteScheduleID;
116
    this.documentListURL = documentListURL;
117
    this.ldapDN = ldapDN;
118 2031 costa
    this.ldapPwd = ldapPwd;
119 2022 costa
    this.dateNextHarvest = dateNextHarvest;
120
    this.dateLastHarvest = dateLastHarvest;
121
    this.updateFrequency = updateFrequency;
122
    this.unit = unit;
123
    this.contactEmail = contactEmail;
124
125
    // Calculate the value of delta, the number of milliseconds between the
126
    // last harvest date and the next harvest date.
127
    delta = updateFrequency * millisecondsPerDay;
128
129
    if (unit.equals("weeks")) {
130
      delta *= 7;
131
    }
132
    else if (unit.equals("months")) {
133
      delta *= 30;
134
    }
135
  }
136
137
138
  /**
139 2060 costa
   * Updates the DATELASTHARVEST and DATENEXTHARVEST values of the
140
   * HARVEST_SITE_SCHEDULE table after a harvest operation has completed.
141
   * Calculates the date of the next harvest based on today's date and the
142
   * update frequency.
143 2022 costa
   */
144 2060 costa
  private void dbUpdateHarvestDates() {
145 2380 costa
    Connection conn;
146 2022 costa
    long currentTime;                    // Current time in milliseconds
147
    Date dateNextHarvest;                // Date of next harvest
148
    String lastHarvest;
149
    String nextHarvest;
150
    Date now = new Date();
151
    SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd-MMM-yyyy");
152 2380 costa
    Statement stmt;
153 2022 costa
    long timeNextHarvest;
154
155 2139 costa
    conn = harvester.getConnection();
156 2022 costa
    now = new Date();
157 2031 costa
    currentTime = now.getTime();
158 2022 costa
    timeNextHarvest = currentTime + delta;
159
    dateNextHarvest = new Date(timeNextHarvest);
160
    nextHarvest = "'" + simpleDateFormat.format(dateNextHarvest) + "'";
161
    lastHarvest = "'" + simpleDateFormat.format(now) + "'";
162
163 2380 costa
	try {
164
      stmt = conn.createStatement();
165
      stmt.executeUpdate(
166 2367 costa
                         "UPDATE HARVEST_SITE_SCHEDULE SET DATENEXTHARVEST = " +
167 2031 costa
                         nextHarvest +
168
                         " WHERE SITE_SCHEDULE_ID = " +
169
                         siteScheduleID);
170 2380 costa
      stmt.executeUpdate(
171 2367 costa
                         "UPDATE HARVEST_SITE_SCHEDULE SET DATELASTHARVEST = " +
172 2031 costa
                         lastHarvest +
173
                         " WHERE SITE_SCHEDULE_ID = " +
174
                         siteScheduleID);
175 2380 costa
      stmt.close();
176
    }
177 2022 costa
    catch(SQLException e) {
178 2380 costa
      System.out.println("SQLException: " + e.getMessage());
179
    }
180 2022 costa
  }
181
182
183
  /**
184
   * Boolean to determine whether this site is currently due for its next
185
   * harvest.
186
   *
187
   * @retrun     true if due for harvest, otherwise false
188
   */
189 2139 costa
  public boolean dueForHarvest() {
190 2022 costa
    boolean dueForHarvest = false;
191 2303 costa
//    DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S");
192
    DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
193 2022 costa
    Date now = new Date();
194
    Date dnh;                          // Date of next harvest
195
    long currentTime = now.getTime();  // Current time in milliseconds
196
    long timeNextHarvest = 0;
197
198
    try {
199 2036 costa
      dnh = dateFormat.parse(dateNextHarvest);
200
      timeNextHarvest = dnh.getTime();
201 2022 costa
202
      if (timeNextHarvest < currentTime) {
203
        dueForHarvest = true;
204 2031 costa
        System.out.println("Due for harvest: " + documentListURL);
205 2022 costa
      }
206
      else {
207 2031 costa
        System.out.println("Not due for harvest: " + documentListURL);
208 2022 costa
      }
209
    }
210
    catch (ParseException e) {
211 2031 costa
      System.out.println("Error parsing date: " + e.getMessage());
212 2022 costa
    }
213
214 2031 costa
    return dueForHarvest;
215 2022 costa
  }
216 2139 costa
217 2022 costa
218 2139 costa
  /**
219
   * Accessor method for the schemaLocation field.
220
   *
221
   * @return schemaLocation  the schema location string
222
   */
223
  public String getSchemaLocation() {
224
    return schemaLocation;
225
  }
226 2022 costa
227 2139 costa
228 2022 costa
  /**
229
   * Harvests each document in the site document list.
230
   *
231
   * @throws SAXException
232
   * @throws IOException
233
   * @throws ParserConfigurationException
234
   */
235
  public void harvestDocumentList() {
236
    HarvestDocument harvestDocument;
237 2036 costa
    boolean success;
238 2022 costa
239
    if (dueForHarvest()) {
240
      try {
241 2139 costa
        success = parseHarvestList();
242 2036 costa
243
        /* If the document list was validated, then proceed with harvesting
244
         * the documents
245
         */
246
        if (success) {
247
          metacatLogin();
248 2022 costa
249 2036 costa
          for (int i = 0; i < harvestDocumentList.size(); i++) {
250
            harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
251 2022 costa
252 2036 costa
            if (harvestDocument != null) {
253
              harvestDocument.harvestDocument();
254
            }
255 2022 costa
          }
256 2036 costa
257
          metacatLogout();
258 2060 costa
          dbUpdateHarvestDates();  // Update the schedule
259 2022 costa
        }
260
      }
261
      catch (ParserConfigurationException e) {
262 2031 costa
        System.out.println("ParserConfigurationException: " + e.getMessage());
263 2022 costa
      }
264
265 2155 costa
      reportToSiteContact();
266 2022 costa
    }
267
  }
268
269
270
  /**
271 2031 costa
   * Login to Metacat using the ldapDN and ldapPwd
272 2022 costa
   */
273 2139 costa
  public void metacatLogin() {
274 2022 costa
    Metacat metacat = harvester.metacat;
275 2139 costa
    String response;
276 2022 costa
277
    if (harvester.connectToMetacat()) {
278
      try {
279
        System.out.println("Logging in to Metacat: " + ldapDN);
280 2139 costa
        response = metacat.login(ldapDN, ldapPwd);
281 2022 costa
        //System.out.println("Metacat login response: " + response);
282
      }
283
      catch (MetacatInaccessibleException e) {
284
        System.out.println("Metacat login failed." + e.getMessage());
285
      }
286
      catch (Exception e) {
287
        System.out.println("Metacat login failed." + e.getMessage());
288
      }
289 2031 costa
    }
290 2022 costa
  }
291
292
293
  /**
294
   * Logout from Metacat
295
   */
296
  private void metacatLogout() {
297
    Metacat metacat = harvester.metacat;
298
299
    if (harvester.connectToMetacat()) {
300
      try {
301
        // Log out from the Metacat session
302
        System.out.println("Logging out from Metacat");
303
        metacat.logout();
304
      }
305
      catch (MetacatInaccessibleException e) {
306
        System.out.println("Metacat inaccessible: " + e.getMessage());
307
      }
308
      catch (MetacatException e) {
309
        System.out.println("Metacat exception: " + e.getMessage());
310
      }
311
    }
312
  }
313
314
315
  /**
316 2139 costa
   * Parses the site harvest list XML file to find out which documents to
317
   * harvest.
318 2036 costa
   *
319
   * @return  true if successful, otherwise false
320 2022 costa
   */
321 2139 costa
  public boolean parseHarvestList()
322 2031 costa
          throws ParserConfigurationException {
323
    DocumentListHandler documentListHandler = new DocumentListHandler();
324
    InputStream inputStream;
325
    InputStreamReader inputStreamReader;
326 2139 costa
    String schemaLocation = getSchemaLocation();
327 2036 costa
    boolean success = false;
328 2031 costa
    URL url;
329
330
    try {
331
      url = new URL(documentListURL);
332
      inputStream = url.openStream();
333 2060 costa
      harvester.addLogEntry(0,
334
                            "Retrieved: " + documentListURL,
335 4175 daigle
                            "harvester.GetHarvestListSuccess",
336 2060 costa
                            siteScheduleID,
337
                            null,
338
                            "");
339 2031 costa
      inputStreamReader = new InputStreamReader(inputStream);
340 5463 leinfelder
//      char[] harvestListChars = new char[1024];
341
//      inputStreamReader.read(harvestListChars, 0, 1024);
342
//      System.out.println("documentListURL: " + documentListURL);
343
//      String encoding = inputStreamReader.getEncoding();
344
//      System.out.println("encoding: " + encoding);
345
//      String harvestListStr = new String(harvestListChars);
346
//      System.out.println("harvestListStr:\n" + harvestListStr);
347 2031 costa
      documentListHandler.runParser(inputStreamReader, schemaLocation);
348 2060 costa
      harvester.addLogEntry(0,
349
                            "Validated: " + documentListURL,
350 4175 daigle
                            "harvester.ValidateHarvestListSuccess",
351 2060 costa
                            siteScheduleID,
352
                            null,
353
                            "");
354 2036 costa
      success = true;
355 2031 costa
    }
356
    catch (MalformedURLException e){
357
      harvester.addLogEntry(1, "MalformedURLException: " + e.getMessage(),
358 4175 daigle
                            "harvester.GetHarvestListError", siteScheduleID, null, "");
359 2031 costa
    }
360
    catch (FileNotFoundException e) {
361
      harvester.addLogEntry(1, "FileNotFoundException: " + e.getMessage(),
362 4175 daigle
                            "harvester.GetHarvestListError", siteScheduleID, null, "");
363 2031 costa
    }
364
    catch (SAXException e) {
365
      harvester.addLogEntry(1, "SAXException: " + e.getMessage(),
366 4175 daigle
                          "harvester.ValidateHarvestListError", siteScheduleID, null, "");
367 2031 costa
    }
368
    catch (ClassNotFoundException e) {
369
      harvester.addLogEntry(1, "ClassNotFoundException: " + e.getMessage(),
370 4175 daigle
                          "harvester.ValidateHarvestListError", siteScheduleID, null, "");
371 2031 costa
    }
372
    catch (IOException e) {
373
      harvester.addLogEntry(1, "IOException: " + e.getMessage(),
374 4175 daigle
                            "harvester.GetHarvestListError", siteScheduleID, null, "");
375 2031 costa
    }
376 2036 costa
377
    return success;
378 2022 costa
  }
379
380
381
  /**
382
   * Prints the data that is stored in this HarvestSiteSchedule object.
383 2086 costa
   *
384
   * @param out   the PrintStream to write to
385 2022 costa
   */
386 2139 costa
  public void printOutput(PrintStream out) {
387 2086 costa
    out.println("* siteScheduleID:       " + siteScheduleID);
388
    out.println("* documentListURL:      " + documentListURL);
389
    out.println("* ldapDN:               " + ldapDN);
390
    out.println("* dateNextHarvest:      " + dateNextHarvest);
391
    out.println("* dateLastHarvest:      " + dateLastHarvest);
392
    out.println("* updateFrequency:      " + updateFrequency);
393
    out.println("* unit:                 " + unit);
394
    out.println("* contactEmail:         " + contactEmail);
395 2022 costa
  }
396
397 2155 costa
  /**
398
   * Reports a summary of the site harvest. Includes the following:
399
   *   A list of documents that were successfully inserted.
400
   *   A list of documents that were successfully updated.
401
   *   A list of documents that could not be accessed at the site.
402
   *   A list of documents that could not be uploaded to Metacat.
403
   *   A list of documents that were already found in Metacat.
404
   *
405
   * @param out  the PrintStream to write to
406
   */
407
  void printSiteSummary(PrintStream out) {
408
    HarvestDocument harvestDocument;
409
    int nAccessError = 0;
410
    int nInserted = 0;
411
    int nMetacatHasIt = 0;
412
    int nUpdated = 0;
413
    int nUploadError = 0;
414
415
    for (int i = 0; i < harvestDocumentList.size(); i++) {
416
      harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
417
418
      if (harvestDocument != null) {
419
        if (harvestDocument.accessError)  { nAccessError++; }
420
        if (harvestDocument.inserted)     { nInserted++; }
421
        if (harvestDocument.metacatHasIt) { nMetacatHasIt++; }
422
        if (harvestDocument.updated)      { nUpdated++; }
423
        if (harvestDocument.uploadError)  { nUploadError++; }
424
      }
425
    }
426
427
    if (nInserted > 0) {
428
      printSiteSummaryHeader(out);
429
      out.println("* The following document(s) were successfully inserted:");
430
      for (int i = 0; i < harvestDocumentList.size(); i++) {
431
        harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
432
        if (harvestDocument != null) {
433
          if (harvestDocument.inserted)  {
434
            harvestDocument.prettyPrint(out);
435
          }
436
        }
437
      }
438
      printSiteSummaryTrailer(out);
439
    }
440 2022 costa
441 2155 costa
    if (nUpdated > 0) {
442
      printSiteSummaryHeader(out);
443
      out.println("* The following document(s) were successfully updated:");
444
      for (int i = 0; i < harvestDocumentList.size(); i++) {
445
        harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
446
        if (harvestDocument != null) {
447
          if (harvestDocument.updated)  {
448
            harvestDocument.prettyPrint(out);
449
          }
450
        }
451
      }
452
      printSiteSummaryTrailer(out);
453
    }
454
455
    if (nAccessError > 0) {
456
      printSiteSummaryHeader(out);
457
      out.println("* The following document(s) could not be accessed");
458
      out.println("* at the site. Please check the URL to ensure that it is");
459
      out.println("* accessible at the site.");
460
      for (int i = 0; i < harvestDocumentList.size(); i++) {
461
        harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
462
        if (harvestDocument != null) {
463
          if (harvestDocument.accessError)  {
464
            harvestDocument.prettyPrint(out);
465
          }
466
        }
467
      }
468
      printSiteSummaryTrailer(out);
469
    }
470
471
    if (nUploadError > 0) {
472
      printSiteSummaryHeader(out);
473
      out.println("* The following document(s) could not be uploaded to");
474
      out.println("* Metacat because an error of some kind occurred.");
475 2777 costa
      out.println("* (See log entries below for additional details.) :");
476 2155 costa
      for (int i = 0; i < harvestDocumentList.size(); i++) {
477
        harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
478
        if (harvestDocument != null) {
479
          if (harvestDocument.uploadError)  {
480
            harvestDocument.prettyPrint(out);
481
          }
482
        }
483
      }
484
      printSiteSummaryTrailer(out);
485
    }
486
487
    if (nMetacatHasIt > 0) {
488
      printSiteSummaryHeader(out);
489 2777 costa
      out.println("* The following document(s) were already found in Metacat:");
490
491 2155 costa
      for (int i = 0; i < harvestDocumentList.size(); i++) {
492
        harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
493
        if (harvestDocument != null) {
494
          if (harvestDocument.metacatHasIt)  {
495
            harvestDocument.prettyPrint(out);
496
          }
497
        }
498
      }
499
      printSiteSummaryTrailer(out);
500
    }
501
502
  }
503
504
505 2022 costa
  /**
506 2155 costa
   * Prints the header lines of a site summary entry.
507
   *
508
   * @param out    the PrintStream to write to
509 2022 costa
   */
510 2155 costa
  void printSiteSummaryHeader(PrintStream out) {
511
    final String filler = Harvester.filler;
512
    final String marker = Harvester.marker;
513
514
    out.println("");
515
    out.println(marker);
516
    out.println(filler);
517
  }
518
519
520
  /**
521
   * Prints the trailing lines of a site summary entry.
522
   *
523
   * @param out    the PrintStream to write to
524
   */
525
  void printSiteSummaryTrailer(PrintStream out) {
526
    final String filler = Harvester.filler;
527
    final String marker = Harvester.marker;
528
529
    out.println(filler);
530
    out.println(marker);
531
  }
532
533
534
  /**
535
   * Sends a report to the Site Contact summarizing the results of the harvest
536
   * at that site.
537
   */
538
  void reportToSiteContact() {
539 2086 costa
    PrintStream body;
540 2105 costa
    String from = harvester.harvesterAdministrator;
541 2330 costa
    String[] fromArray;
542 2765 costa
    String maxCodeLevel = "notice";
543 2086 costa
    MailMessage msg;
544 2105 costa
    int nErrors = 0;
545 2108 costa
    String subject = "Report from Metacat Harvester: " + harvester.timestamp;
546 2086 costa
    String to = contactEmail;
547 2330 costa
    String[] toArray;
548 2086 costa
549
    if (!to.equals("")) {
550
      System.out.println("Sending report to siteScheduleID=" + siteScheduleID +
551
                         " at address: " + contactEmail);
552
      try {
553 2105 costa
        msg = new MailMessage(harvester.smtpServer);
554 2330 costa
555
        if (from.indexOf(',') > 0) {
556
          fromArray = from.split(",");
557
558
          for (int i = 0; i < fromArray.length; i++) {
559
            if (i == 0) {
560
              msg.from(fromArray[i]);
561
            }
562
563
            msg.cc(fromArray[i]);
564
565
          }
566
        }
567
        else if (from.indexOf(';') > 0) {
568
          fromArray = from.split(";");
569
570
          for (int i = 0; i < fromArray.length; i++) {
571
            if (i == 0) {
572
              msg.from(fromArray[i]);
573
            }
574
575
            msg.cc(fromArray[i]);
576
577
          }
578
        }
579
        else {
580
          msg.from(from);
581
          msg.cc(from);
582
        }
583
584
        if (to.indexOf(',') > 0) {
585
          toArray = to.split(",");
586
587
          for (int i = 0; i < toArray.length; i++) {
588
            msg.to(toArray[i]);
589
          }
590
        }
591
        else if (to.indexOf(';') > 0) {
592
          toArray = to.split(";");
593
594
          for (int i = 0; i < toArray.length; i++) {
595
            msg.to(toArray[i]);
596
          }
597
        }
598
        else {
599
          msg.to(to);
600
        }
601
602 2086 costa
        msg.setSubject(subject);
603
        body = msg.getPrintStream();
604 2155 costa
        harvester.printHarvestHeader(body, siteScheduleID);
605
        printSiteSummary(body);
606 2105 costa
        harvester.printHarvestLog(body, maxCodeLevel, siteScheduleID);
607
        msg.sendAndClose();
608 2086 costa
      }
609
      catch (IOException e) {
610
        System.out.println("There was a problem sending email to " + to);
611
        System.out.println("IOException: " + e.getMessage());
612
      }
613
    }
614 2022 costa
  }
615
616
617
  /**
618 2139 costa
   * Accessor method for setting the value of the schemaLocation field.
619
   *
620
   * @param schemaLocation  the new value of the schemaLocation field
621
   */
622
  public void setSchemaLocation(String schemaLocation) {
623
    this.schemaLocation = schemaLocation;
624
  }
625
626
627
  /**
628 2022 costa
   * This inner class extends DefaultHandler. It parses the document list,
629
   * creating a new HarvestDocument object every time it finds a </Document>
630
   * end tag.
631
   */
632 2031 costa
  class DocumentListHandler extends DefaultHandler implements ErrorHandler {
633 2022 costa
634
    public String scope;
635
    public int identifier;
636 2036 costa
    public String identifierString;
637
    public String documentType;
638 2022 costa
    public int revision;
639 2036 costa
    public String revisionString;
640 2022 costa
    public String documentURL;
641
    private String currentQname;
642 2031 costa
    public final static String DEFAULT_PARSER =
643
           "org.apache.xerces.parsers.SAXParser";
644
    private boolean schemaValidate = true;
645 2022 costa
646
647 2031 costa
	  /**
648
     * This method is called for any plain text within an element.
649
     * It parses the value for any of the following elements:
650
     * <scope>, <identifier>, <revision>, <documentType>, <documentURL>
651
     *
652
     * @param ch          the character array holding the parsed text
653
     * @param start       the start index
654
     * @param length      the text length
655
     *
656 2022 costa
     */
657 2031 costa
    public void characters (char ch[], int start, int length) {
658
      String s = new String(ch, start, length);
659
660
      if (length > 0) {
661
        if (currentQname.equals("scope")) {
662 2036 costa
          scope += s;
663 2031 costa
        }
664
        else if (currentQname.equals("identifier")) {
665 2036 costa
          identifierString += s;
666 2031 costa
        }
667
        else if (currentQname.equals("revision")) {
668 2036 costa
          revisionString += s;
669 2031 costa
        }
670
        else if (currentQname.equals("documentType")) {
671 2036 costa
          documentType += s;
672 2031 costa
        }
673
        else if (currentQname.equals("documentURL")) {
674 2036 costa
          documentURL += s;
675 2031 costa
        }
676
      }
677 2022 costa
    }
678
679
680
    /**
681
     * Handles an end-of-document event.
682
     */
683
    public void endDocument () {
684
      System.out.println("Finished parsing " + documentListURL);
685
    }
686
687
688
    /**
689
     * Handles an end-of-element event. If the end tag is </Document>, then
690
     * creates a new HarvestDocument object and pushes it to the document
691
     * list.
692
     *
693
     * @param uri
694
     * @param localname
695
     * @param qname
696
     */
697
    public void endElement(String uri,
698
                           String localname,
699
                           String qname) {
700
701
      HarvestDocument harvestDocument;
702
703 2036 costa
      if (qname.equals("identifier")) {
704
        identifier = Integer.parseInt(identifierString);
705
      }
706
      else if (qname.equals("revision")) {
707
        revision = Integer.parseInt(revisionString);
708
      }
709
      else if (qname.equals("document")) {
710 2022 costa
        harvestDocument = new HarvestDocument(
711
                                              harvester,
712
                                              HarvestSiteSchedule.this,
713
                                              scope,
714
                                              identifier,
715
                                              revision,
716
                                              documentType,
717
                                              documentURL
718
                                             );
719 2031 costa
        harvestDocumentList.add(harvestDocument);
720 2022 costa
      }
721 2036 costa
722
      currentQname = "";
723 2022 costa
    }
724
725
726 2031 costa
    /**
727
     * Method for handling errors during a parse
728
     *
729
     * @param exception         The parsing error
730
     * @exception SAXException  Description of Exception
731 2022 costa
     */
732 2031 costa
     public void error(SAXParseException e) throws SAXParseException {
733
        System.out.println("SAXParseException: " + e.getMessage());
734
        throw e;
735
    }
736
737
738
    /**
739
     * Run the validating parser
740
     *
741
     * @param xml             the xml stream to be validated
742
     * @schemaLocation        relative path the to XML Schema file, e.g. "."
743
     * @exception IOException thrown when test files can't be opened
744
     * @exception ClassNotFoundException thrown when SAX Parser class not found
745
     * @exception SAXException
746
     * @exception SAXParserException
747
     */
748
    public void runParser(Reader xml, String schemaLocation)
749
           throws IOException, ClassNotFoundException,
750
                  SAXException, SAXParseException {
751
752
      // Get an instance of the parser
753
      XMLReader parser;
754
755
      parser = XMLReaderFactory.createXMLReader(DEFAULT_PARSER);
756
      // Set Handlers in the parser
757
      parser.setContentHandler((ContentHandler)this);
758
      parser.setErrorHandler((ErrorHandler)this);
759
      parser.setFeature("http://xml.org/sax/features/namespaces", true);
760
      parser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
761
      parser.setFeature("http://xml.org/sax/features/validation", true);
762
      parser.setProperty(
763
              "http://apache.org/xml/properties/schema/external-schemaLocation",
764
              schemaLocation);
765
766
      if (schemaValidate) {
767
        parser.setFeature("http://apache.org/xml/features/validation/schema",
768
                          true);
769 2022 costa
      }
770 2031 costa
771
      // Parse the document
772
      parser.parse(new InputSource(xml));
773 2022 costa
    }
774 2031 costa
    /**
775
     * Handles a start-of-document event.
776
     */
777
    public void startDocument () {
778
      System.out.println("Started parsing " + documentListURL);
779
    }
780 2022 costa
781 2031 costa
782
    /**
783
     * Handles a start-of-element event.
784
     *
785
     * @param uri
786
     * @param localname
787
     * @param qname
788
     * @param attributes
789
     */
790
    public void startElement(String uri,
791
                             String localname,
792
                             String qname,
793
                             Attributes attributes) {
794
795
      currentQname = qname;
796 2036 costa
797
      if (qname.equals("scope")) {
798
        scope = "";
799
      }
800
      else if (qname.equals("identifier")) {
801
        identifierString = "";
802
      }
803
      else if (qname.equals("revision")) {
804
        revisionString = "";
805
      }
806
      else if (qname.equals("documentType")) {
807
        documentType = "";
808
      }
809
      else if (qname.equals("documentURL")) {
810
        documentURL = "";
811
      }
812 2031 costa
    }
813 2022 costa
  }
814
}