Project

General

Profile

1 2094 jones
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2004 University of New Mexico and the
4
 *                  Regents of the University of California
5 2022 costa
 *
6 2094 jones
 *   '$Author$'
7
 *     '$Date$'
8
 * '$Revision$'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 2022 costa
 */
24
25
package edu.ucsb.nceas.metacat.harvesterClient;
26
27 2086 costa
import com.oreilly.servlet.MailMessage;
28 2031 costa
import java.io.FileNotFoundException;
29
import java.io.IOException;
30
import java.io.InputStream;
31
import java.io.InputStreamReader;
32 2086 costa
import java.io.PrintStream;
33 2031 costa
import java.io.Reader;
34
import java.net.MalformedURLException;
35
import java.net.URL;
36 2022 costa
import java.sql.Connection;
37
import java.sql.SQLException;
38
import java.sql.Statement;
39 2031 costa
import java.text.DateFormat;
40
import java.text.ParseException;
41
import java.text.SimpleDateFormat;
42
import java.util.ArrayList;
43
import java.util.Date;
44
import javax.xml.parsers.ParserConfigurationException;
45
import org.xml.sax.Attributes;
46
import org.xml.sax.ContentHandler;
47
import org.xml.sax.ErrorHandler;
48
import org.xml.sax.InputSource;
49
import org.xml.sax.SAXException;
50
import org.xml.sax.SAXParseException;
51
import org.xml.sax.XMLReader;
52
import org.xml.sax.helpers.DefaultHandler;
53
import org.xml.sax.helpers.XMLReaderFactory;
54 2022 costa
55 2031 costa
import edu.ucsb.nceas.metacat.client.Metacat;
56
import edu.ucsb.nceas.metacat.client.MetacatException;
57
import edu.ucsb.nceas.metacat.client.MetacatInaccessibleException;
58 2022 costa
59
60
/**
61
 * HarvestSiteSchedule manages a single entry in the HARVEST_SITE_SCHEDULE
62
 * table, determining when and how to harvest the documents for a given site.
63
 *
64
 * @author  costa
65
 */
66 2139 costa
public class HarvestSiteSchedule {
67 2022 costa
68
  private String contactEmail;
69
  private String dateLastHarvest;
70
  private String dateNextHarvest;
71
  private long delta;
72
  private String documentListURL;
73
  private Harvester harvester;
74 2031 costa
  private ArrayList harvestDocumentList = new ArrayList();
75 2022 costa
  private String harvestSiteEndTime;
76
  private String harvestSiteStartTime;
77
  private String ldapDN;
78 2031 costa
  private String ldapPwd;
79 2022 costa
  final private long millisecondsPerDay = (1000 * 60 * 60 * 24);
80 2139 costa
  private String schemaLocation =
81
    "eml://ecoinformatics.org/harvestList ../../lib/harvester/harvestList.xsd";
82 2031 costa
  int siteScheduleID;
83 2022 costa
  private String unit;
84
  private int updateFrequency;
85
86
  /**
87
   * Creates a new instance of HarvestSiteSchedule. Initialized with the data
88
   * that was read from a single row in the HARVEST_SITE_SCHEDULE table.
89
   *
90
   * @param harvester       the parent Harvester object
91
   * @param siteScheduleID  the value of the SITE_SCHEDULE_ID field
92
   * @param documentListURL the value of the DOCUMENTLISTURL field
93
   * @param ldapDN          the value of the LDAPDN field
94 2031 costa
   * @param ldapPwd    the value of the LDAPPASSWORD field
95 2022 costa
   * @param dateNextHarvest the value of the DATENEXTHARVEST field
96
   * @param dateLastHarvest the value of the DATELASTHARVEST field
97
   * @param updateFrequency the value of the UPDATEFREQUENCY field
98
   * @param unit            the value of the UNIT field
99
   * @param contactEmail    the value of the CONTACT_EMAIL field
100
   */
101
  public HarvestSiteSchedule(
102
                              Harvester harvester,
103
                              int    siteScheduleID,
104
                              String documentListURL,
105
                              String ldapDN,
106 2031 costa
                              String ldapPwd,
107 2022 costa
                              String dateNextHarvest,
108
                              String dateLastHarvest,
109
                              int    updateFrequency,
110
                              String unit,
111
                              String contactEmail
112
                            )
113
  {
114
    this.harvester = harvester;
115
    this.siteScheduleID = siteScheduleID;
116
    this.documentListURL = documentListURL;
117
    this.ldapDN = ldapDN;
118 2031 costa
    this.ldapPwd = ldapPwd;
119 2022 costa
    this.dateNextHarvest = dateNextHarvest;
120
    this.dateLastHarvest = dateLastHarvest;
121
    this.updateFrequency = updateFrequency;
122
    this.unit = unit;
123
    this.contactEmail = contactEmail;
124
125
    // Calculate the value of delta, the number of milliseconds between the
126
    // last harvest date and the next harvest date.
127
    delta = updateFrequency * millisecondsPerDay;
128
129
    if (unit.equals("weeks")) {
130
      delta *= 7;
131
    }
132
    else if (unit.equals("months")) {
133
      delta *= 30;
134
    }
135
  }
136
137
138
  /**
139 2060 costa
   * Updates the DATELASTHARVEST and DATENEXTHARVEST values of the
140
   * HARVEST_SITE_SCHEDULE table after a harvest operation has completed.
141
   * Calculates the date of the next harvest based on today's date and the
142
   * update frequency.
143 2022 costa
   */
144 2060 costa
  private void dbUpdateHarvestDates() {
145 2380 costa
    Connection conn;
146 2022 costa
    long currentTime;                    // Current time in milliseconds
147
    Date dateNextHarvest;                // Date of next harvest
148
    String lastHarvest;
149
    String nextHarvest;
150
    Date now = new Date();
151
    SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd-MMM-yyyy");
152 2380 costa
    Statement stmt;
153 2022 costa
    long timeNextHarvest;
154
155 2139 costa
    conn = harvester.getConnection();
156 2022 costa
    now = new Date();
157 2031 costa
    currentTime = now.getTime();
158 2022 costa
    timeNextHarvest = currentTime + delta;
159
    dateNextHarvest = new Date(timeNextHarvest);
160
    nextHarvest = "'" + simpleDateFormat.format(dateNextHarvest) + "'";
161
    lastHarvest = "'" + simpleDateFormat.format(now) + "'";
162
163 2380 costa
	try {
164
      stmt = conn.createStatement();
165
      stmt.executeUpdate(
166 2367 costa
                         "UPDATE HARVEST_SITE_SCHEDULE SET DATENEXTHARVEST = " +
167 2031 costa
                         nextHarvest +
168
                         " WHERE SITE_SCHEDULE_ID = " +
169
                         siteScheduleID);
170 2380 costa
      stmt.executeUpdate(
171 2367 costa
                         "UPDATE HARVEST_SITE_SCHEDULE SET DATELASTHARVEST = " +
172 2031 costa
                         lastHarvest +
173
                         " WHERE SITE_SCHEDULE_ID = " +
174
                         siteScheduleID);
175 2380 costa
      stmt.close();
176
    }
177 2022 costa
    catch(SQLException e) {
178 2380 costa
      System.out.println("SQLException: " + e.getMessage());
179
    }
180 2022 costa
  }
181
182
183
  /**
184
   * Boolean to determine whether this site is currently due for its next
185
   * harvest.
186
   *
187
   * @return     true if due for harvest, otherwise false
188
   */
189 2139 costa
  public boolean dueForHarvest() {
190 2022 costa
    boolean dueForHarvest = false;
191 2303 costa
//    DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.S");
192
    DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
193 2022 costa
    Date now = new Date();
194
    Date dnh;                          // Date of next harvest
195
    long currentTime = now.getTime();  // Current time in milliseconds
196
    long timeNextHarvest = 0;
197
198
    try {
199 2036 costa
      dnh = dateFormat.parse(dateNextHarvest);
200
      timeNextHarvest = dnh.getTime();
201 2022 costa
202
      if (timeNextHarvest < currentTime) {
203
        dueForHarvest = true;
204 2031 costa
        System.out.println("Due for harvest: " + documentListURL);
205 2022 costa
      }
206
      else {
207 2031 costa
        System.out.println("Not due for harvest: " + documentListURL);
208 2022 costa
      }
209
    }
210
    catch (ParseException e) {
211 2031 costa
      System.out.println("Error parsing date: " + e.getMessage());
212 2022 costa
    }
213
214 2031 costa
    return dueForHarvest;
215 2022 costa
  }
216 2139 costa
217 2022 costa
218 2139 costa
  /**
219
   * Accessor method for the schemaLocation field.
220
   *
221
   * @return schemaLocation  the schema location string
222
   */
223
  public String getSchemaLocation() {
224
    return schemaLocation;
225
  }
226 2022 costa
227 2139 costa
228 2022 costa
  /**
229
   * Harvests each document in the site document list.
230
   *
231
   * @throws SAXException
232
   * @throws IOException
233
   * @throws ParserConfigurationException
234
   */
235
  public void harvestDocumentList() {
236
    HarvestDocument harvestDocument;
237 2036 costa
    boolean success;
238 2022 costa
239
    if (dueForHarvest()) {
240
      try {
241 2139 costa
        success = parseHarvestList();
242 2036 costa
243
        /* If the document list was validated, then proceed with harvesting
244
         * the documents
245
         */
246
        if (success) {
247
          metacatLogin();
248 2022 costa
249 2036 costa
          for (int i = 0; i < harvestDocumentList.size(); i++) {
250
            harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
251 2022 costa
252 2036 costa
            if (harvestDocument != null) {
253
              harvestDocument.harvestDocument();
254
            }
255 2022 costa
          }
256 2036 costa
257
          metacatLogout();
258 2060 costa
          dbUpdateHarvestDates();  // Update the schedule
259 2022 costa
        }
260
      }
261
      catch (ParserConfigurationException e) {
262 2031 costa
        System.out.println("ParserConfigurationException: " + e.getMessage());
263 2022 costa
      }
264
265 2155 costa
      reportToSiteContact();
266 2022 costa
    }
267
  }
268
269
270
  /**
271 2031 costa
   * Login to Metacat using the ldapDN and ldapPwd
272 2022 costa
   */
273 2139 costa
  public void metacatLogin() {
274 2022 costa
    Metacat metacat = harvester.metacat;
275 2139 costa
    String response;
276 2022 costa
277
    if (harvester.connectToMetacat()) {
278
      try {
279
        System.out.println("Logging in to Metacat: " + ldapDN);
280 2139 costa
        response = metacat.login(ldapDN, ldapPwd);
281 2022 costa
        //System.out.println("Metacat login response: " + response);
282
      }
283
      catch (MetacatInaccessibleException e) {
284
        System.out.println("Metacat login failed." + e.getMessage());
285
      }
286
      catch (Exception e) {
287
        System.out.println("Metacat login failed." + e.getMessage());
288
      }
289 2031 costa
    }
290 2022 costa
  }
291
292
293
  /**
294
   * Logout from Metacat
295
   */
296
  private void metacatLogout() {
297
    Metacat metacat = harvester.metacat;
298
299
    if (harvester.connectToMetacat()) {
300
      try {
301
        // Log out from the Metacat session
302
        System.out.println("Logging out from Metacat");
303
        metacat.logout();
304
      }
305
      catch (MetacatInaccessibleException e) {
306
        System.out.println("Metacat inaccessible: " + e.getMessage());
307
      }
308
      catch (MetacatException e) {
309
        System.out.println("Metacat exception: " + e.getMessage());
310
      }
311
    }
312
  }
313
314
315
  /**
316 2139 costa
   * Parses the site harvest list XML file to find out which documents to
317
   * harvest.
318 2036 costa
   *
319
   * @return  true if successful, otherwise false
320 2022 costa
   */
321 2139 costa
  public boolean parseHarvestList()
322 2031 costa
          throws ParserConfigurationException {
323
    DocumentListHandler documentListHandler = new DocumentListHandler();
324
    InputStream inputStream;
325
    InputStreamReader inputStreamReader;
326 2139 costa
    String schemaLocation = getSchemaLocation();
327 2036 costa
    boolean success = false;
328 2031 costa
    URL url;
329
330
    try {
331
      url = new URL(documentListURL);
332
      inputStream = url.openStream();
333 2060 costa
      harvester.addLogEntry(0,
334
                            "Retrieved: " + documentListURL,
335 2139 costa
                            "GetHarvestListSuccess",
336 2060 costa
                            siteScheduleID,
337
                            null,
338
                            "");
339 2031 costa
      inputStreamReader = new InputStreamReader(inputStream);
340
      documentListHandler.runParser(inputStreamReader, schemaLocation);
341 2060 costa
      harvester.addLogEntry(0,
342
                            "Validated: " + documentListURL,
343 2139 costa
                            "ValidateHarvestListSuccess",
344 2060 costa
                            siteScheduleID,
345
                            null,
346
                            "");
347 2036 costa
      success = true;
348 2031 costa
    }
349
    catch (MalformedURLException e){
350
      harvester.addLogEntry(1, "MalformedURLException: " + e.getMessage(),
351 2139 costa
                            "GetHarvestListError", siteScheduleID, null, "");
352 2031 costa
    }
353
    catch (FileNotFoundException e) {
354
      harvester.addLogEntry(1, "FileNotFoundException: " + e.getMessage(),
355 2139 costa
                            "GetHarvestListError", siteScheduleID, null, "");
356 2031 costa
    }
357
    catch (SAXException e) {
358
      harvester.addLogEntry(1, "SAXException: " + e.getMessage(),
359 2367 costa
                          "ValidateHarvestListError", siteScheduleID, null, "");
360 2031 costa
    }
361
    catch (ClassNotFoundException e) {
362
      harvester.addLogEntry(1, "ClassNotFoundException: " + e.getMessage(),
363 2367 costa
                          "ValidateHarvestListError", siteScheduleID, null, "");
364 2031 costa
    }
365
    catch (IOException e) {
366
      harvester.addLogEntry(1, "IOException: " + e.getMessage(),
367 2139 costa
                            "GetHarvestListError", siteScheduleID, null, "");
368 2031 costa
    }
369 2036 costa
370
    return success;
371 2022 costa
  }
372
373
374
  /**
375
   * Prints the data that is stored in this HarvestSiteSchedule object.
376 2086 costa
   *
377
   * @param out   the PrintStream to write to
378 2022 costa
   */
379 2139 costa
  public void printOutput(PrintStream out) {
380 2086 costa
    out.println("* siteScheduleID:       " + siteScheduleID);
381
    out.println("* documentListURL:      " + documentListURL);
382
    out.println("* ldapDN:               " + ldapDN);
383
    out.println("* dateNextHarvest:      " + dateNextHarvest);
384
    out.println("* dateLastHarvest:      " + dateLastHarvest);
385
    out.println("* updateFrequency:      " + updateFrequency);
386
    out.println("* unit:                 " + unit);
387
    out.println("* contactEmail:         " + contactEmail);
388 2022 costa
  }
389
390 2155 costa
  /**
391
   * Reports a summary of the site harvest. Includes the following:
392
   *   A list of documents that were successfully inserted.
393
   *   A list of documents that were successfully updated.
394
   *   A list of documents that could not be accessed at the site.
395
   *   A list of documents that could not be uploaded to Metacat.
396
   *   A list of documents that were already found in Metacat.
397
   *
398
   * @param out  the PrintStream to write to
399
   */
400
  void printSiteSummary(PrintStream out) {
401
    HarvestDocument harvestDocument;
402
    int nAccessError = 0;
403
    int nInserted = 0;
404
    int nMetacatHasIt = 0;
405
    int nUpdated = 0;
406
    int nUploadError = 0;
407
408
    for (int i = 0; i < harvestDocumentList.size(); i++) {
409
      harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
410
411
      if (harvestDocument != null) {
412
        if (harvestDocument.accessError)  { nAccessError++; }
413
        if (harvestDocument.inserted)     { nInserted++; }
414
        if (harvestDocument.metacatHasIt) { nMetacatHasIt++; }
415
        if (harvestDocument.updated)      { nUpdated++; }
416
        if (harvestDocument.uploadError)  { nUploadError++; }
417
      }
418
    }
419
420
    if (nInserted > 0) {
421
      printSiteSummaryHeader(out);
422
      out.println("* The following document(s) were successfully inserted:");
423
      for (int i = 0; i < harvestDocumentList.size(); i++) {
424
        harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
425
        if (harvestDocument != null) {
426
          if (harvestDocument.inserted)  {
427
            harvestDocument.prettyPrint(out);
428
          }
429
        }
430
      }
431
      printSiteSummaryTrailer(out);
432
    }
433 2022 costa
434 2155 costa
    if (nUpdated > 0) {
435
      printSiteSummaryHeader(out);
436
      out.println("* The following document(s) were successfully updated:");
437
      for (int i = 0; i < harvestDocumentList.size(); i++) {
438
        harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
439
        if (harvestDocument != null) {
440
          if (harvestDocument.updated)  {
441
            harvestDocument.prettyPrint(out);
442
          }
443
        }
444
      }
445
      printSiteSummaryTrailer(out);
446
    }
447
448
    if (nAccessError > 0) {
449
      printSiteSummaryHeader(out);
450
      out.println("* The following document(s) could not be accessed");
451
      out.println("* at the site. Please check the URL to ensure that it is");
452
      out.println("* accessible at the site.");
453
      for (int i = 0; i < harvestDocumentList.size(); i++) {
454
        harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
455
        if (harvestDocument != null) {
456
          if (harvestDocument.accessError)  {
457
            harvestDocument.prettyPrint(out);
458
          }
459
        }
460
      }
461
      printSiteSummaryTrailer(out);
462
    }
463
464
    if (nUploadError > 0) {
465
      printSiteSummaryHeader(out);
466
      out.println("* The following document(s) could not be uploaded to");
467
      out.println("* Metacat because an error of some kind occurred.");
468 2777 costa
      out.println("* (See log entries below for additional details.) :");
469 2155 costa
      for (int i = 0; i < harvestDocumentList.size(); i++) {
470
        harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
471
        if (harvestDocument != null) {
472
          if (harvestDocument.uploadError)  {
473
            harvestDocument.prettyPrint(out);
474
          }
475
        }
476
      }
477
      printSiteSummaryTrailer(out);
478
    }
479
480
    if (nMetacatHasIt > 0) {
481
      printSiteSummaryHeader(out);
482 2777 costa
      out.println("* The following document(s) were already found in Metacat:");
483
484 2155 costa
      for (int i = 0; i < harvestDocumentList.size(); i++) {
485
        harvestDocument = (HarvestDocument) harvestDocumentList.get(i);
486
        if (harvestDocument != null) {
487
          if (harvestDocument.metacatHasIt)  {
488
            harvestDocument.prettyPrint(out);
489
          }
490
        }
491
      }
492
      printSiteSummaryTrailer(out);
493
    }
494
495
  }
496
497
498 2022 costa
  /**
499 2155 costa
   * Prints the header lines of a site summary entry.
500
   *
501
   * @param out    the PrintStream to write to
502 2022 costa
   */
503 2155 costa
  void printSiteSummaryHeader(PrintStream out) {
504
    final String filler = Harvester.filler;
505
    final String marker = Harvester.marker;
506
507
    out.println("");
508
    out.println(marker);
509
    out.println(filler);
510
  }
511
512
513
  /**
514
   * Prints the trailing lines of a site summary entry.
515
   *
516
   * @param out    the PrintStream to write to
517
   */
518
  void printSiteSummaryTrailer(PrintStream out) {
519
    final String filler = Harvester.filler;
520
    final String marker = Harvester.marker;
521
522
    out.println(filler);
523
    out.println(marker);
524
  }
525
526
527
  /**
528
   * Sends a report to the Site Contact summarizing the results of the harvest
529
   * at that site.
530
   */
531
  void reportToSiteContact() {
532 2086 costa
    PrintStream body;
533 2105 costa
    String from = harvester.harvesterAdministrator;
534 2330 costa
    String[] fromArray;
535 2765 costa
    String maxCodeLevel = "notice";
536 2086 costa
    MailMessage msg;
537 2105 costa
    int nErrors = 0;
538 2108 costa
    String subject = "Report from Metacat Harvester: " + harvester.timestamp;
539 2086 costa
    String to = contactEmail;
540 2330 costa
    String[] toArray;
541 2086 costa
542
    if (!to.equals("")) {
543
      System.out.println("Sending report to siteScheduleID=" + siteScheduleID +
544
                         " at address: " + contactEmail);
545
      try {
546 2105 costa
        msg = new MailMessage(harvester.smtpServer);
547 2330 costa
548
        if (from.indexOf(',') > 0) {
549
          fromArray = from.split(",");
550
551
          for (int i = 0; i < fromArray.length; i++) {
552
            if (i == 0) {
553
              msg.from(fromArray[i]);
554
            }
555
556
            msg.cc(fromArray[i]);
557
558
          }
559
        }
560
        else if (from.indexOf(';') > 0) {
561
          fromArray = from.split(";");
562
563
          for (int i = 0; i < fromArray.length; i++) {
564
            if (i == 0) {
565
              msg.from(fromArray[i]);
566
            }
567
568
            msg.cc(fromArray[i]);
569
570
          }
571
        }
572
        else {
573
          msg.from(from);
574
          msg.cc(from);
575
        }
576
577
        if (to.indexOf(',') > 0) {
578
          toArray = to.split(",");
579
580
          for (int i = 0; i < toArray.length; i++) {
581
            msg.to(toArray[i]);
582
          }
583
        }
584
        else if (to.indexOf(';') > 0) {
585
          toArray = to.split(";");
586
587
          for (int i = 0; i < toArray.length; i++) {
588
            msg.to(toArray[i]);
589
          }
590
        }
591
        else {
592
          msg.to(to);
593
        }
594
595 2086 costa
        msg.setSubject(subject);
596
        body = msg.getPrintStream();
597 2155 costa
        harvester.printHarvestHeader(body, siteScheduleID);
598
        printSiteSummary(body);
599 2105 costa
        harvester.printHarvestLog(body, maxCodeLevel, siteScheduleID);
600
        msg.sendAndClose();
601 2086 costa
      }
602
      catch (IOException e) {
603
        System.out.println("There was a problem sending email to " + to);
604
        System.out.println("IOException: " + e.getMessage());
605
      }
606
    }
607 2022 costa
  }
608
609
610
  /**
611 2139 costa
   * Accessor method for setting the value of the schemaLocation field.
612
   *
613
   * @param schemaLocation  the new value of the schemaLocation field
614
   */
615
  public void setSchemaLocation(String schemaLocation) {
616
    this.schemaLocation = schemaLocation;
617
  }
618
619
620
  /**
621 2022 costa
   * This inner class extends DefaultHandler. It parses the document list,
622
   * creating a new HarvestDocument object every time it finds a </document>
623
   * end tag.
624
   */
625 2031 costa
  class DocumentListHandler extends DefaultHandler implements ErrorHandler {
626 2022 costa
627
    public String scope;
628
    public int identifier;
629 2036 costa
    public String identifierString;
630
    public String documentType;
631 2022 costa
    public int revision;
632 2036 costa
    public String revisionString;
633 2022 costa
    public String documentURL;
634
    private String currentQname;
635 2031 costa
    public final static String DEFAULT_PARSER =
636
           "org.apache.xerces.parsers.SAXParser";
637
    private boolean schemaValidate = true;
638 2022 costa
639
640 2031 costa
	  /**
641
     * This method is called for any plain text within an element.
642
     * It parses the value for any of the following elements:
643
     * <scope>, <identifier>, <revision>, <documentType>, <documentURL>
644
     *
645
     * @param ch          the character array holding the parsed text
646
     * @param start       the start index
647
     * @param length      the text length
648
     *
649 2022 costa
     */
650 2031 costa
    public void characters (char ch[], int start, int length) {
651
      String s = new String(ch, start, length);
652
653
      if (length > 0) {
654
        if (currentQname.equals("scope")) {
655 2036 costa
          scope += s;
656 2031 costa
        }
657
        else if (currentQname.equals("identifier")) {
658 2036 costa
          identifierString += s;
659 2031 costa
        }
660
        else if (currentQname.equals("revision")) {
661 2036 costa
          revisionString += s;
662 2031 costa
        }
663
        else if (currentQname.equals("documentType")) {
664 2036 costa
          documentType += s;
665 2031 costa
        }
666
        else if (currentQname.equals("documentURL")) {
667 2036 costa
          documentURL += s;
668 2031 costa
        }
669
      }
670 2022 costa
    }
671
672
673
    /**
674
     * Handles an end-of-document event.
675
     */
676
    public void endDocument () {
677
      System.out.println("Finished parsing " + documentListURL);
678
    }
679
680
681
    /**
682
     * Handles an end-of-element event. If the end tag is </document>, then
683
     * creates a new HarvestDocument object and pushes it to the document
684
     * list.
685
     *
686
     * @param uri
687
     * @param localname
688
     * @param qname
689
     */
690
    public void endElement(String uri,
691
                           String localname,
692
                           String qname) {
693
694
      HarvestDocument harvestDocument;
695
696 2036 costa
      if (qname.equals("identifier")) {
697
        identifier = Integer.parseInt(identifierString);
698
      }
699
      else if (qname.equals("revision")) {
700
        revision = Integer.parseInt(revisionString);
701
      }
702
      else if (qname.equals("document")) {
703 2022 costa
        harvestDocument = new HarvestDocument(
704
                                              harvester,
705
                                              HarvestSiteSchedule.this,
706
                                              scope,
707
                                              identifier,
708
                                              revision,
709
                                              documentType,
710
                                              documentURL
711
                                             );
712 2031 costa
        harvestDocumentList.add(harvestDocument);
713 2022 costa
      }
714 2036 costa
715
      currentQname = "";
716 2022 costa
    }
717
718
719 2031 costa
    /**
720
     * Method for handling errors during a parse
721
     *
722
     * @param e                 The parsing error
723
     * @exception SAXException  Description of Exception
724 2022 costa
     */
725 2031 costa
     public void error(SAXParseException e) throws SAXParseException {
726
        System.out.println("SAXParseException: " + e.getMessage());
727
        throw e;
728
    }
729
730
731
    /**
732
     * Run the validating parser
733
     *
734
     * @param xml             the xml stream to be validated
735
     * @param schemaLocation  relative path to the XML Schema file, e.g. "."
736
     * @exception IOException thrown when test files can't be opened
737
     * @exception ClassNotFoundException thrown when SAX Parser class not found
738
     * @exception SAXException
739
     * @exception SAXParseException
740
     */
741
    public void runParser(Reader xml, String schemaLocation)
742
           throws IOException, ClassNotFoundException,
743
                  SAXException, SAXParseException {
744
745
      // Get an instance of the parser
746
      XMLReader parser;
747
748
      parser = XMLReaderFactory.createXMLReader(DEFAULT_PARSER);
749
      // Set Handlers in the parser
750
      parser.setContentHandler((ContentHandler)this);
751
      parser.setErrorHandler((ErrorHandler)this);
752
      parser.setFeature("http://xml.org/sax/features/namespaces", true);
753
      parser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
754
      parser.setFeature("http://xml.org/sax/features/validation", true);
755
      parser.setProperty(
756
              "http://apache.org/xml/properties/schema/external-schemaLocation",
757
              schemaLocation);
758
759
      if (schemaValidate) {
760
        parser.setFeature("http://apache.org/xml/features/validation/schema",
761
                          true);
762 2022 costa
      }
763 2031 costa
764
      // Parse the document
765
      parser.parse(new InputSource(xml));
766 2022 costa
    }
767 2031 costa
    /**
768
     * Handles a start-of-document event.
769
     */
770
    public void startDocument () {
771
      System.out.println("Started parsing " + documentListURL);
772
    }
773 2022 costa
774 2031 costa
775
    /**
776
     * Handles a start-of-element event.
777
     *
778
     * @param uri
779
     * @param localname
780
     * @param qname
781
     * @param attributes
782
     */
783
    public void startElement(String uri,
784
                             String localname,
785
                             String qname,
786
                             Attributes attributes) {
787
788
      currentQname = qname;
789 2036 costa
790
      if (qname.equals("scope")) {
791
        scope = "";
792
      }
793
      else if (qname.equals("identifier")) {
794
        identifierString = "";
795
      }
796
      else if (qname.equals("revision")) {
797
        revisionString = "";
798
      }
799
      else if (qname.equals("documentType")) {
800
        documentType = "";
801
      }
802
      else if (qname.equals("documentURL")) {
803
        documentURL = "";
804
      }
805 2031 costa
    }
806 2022 costa
  }
807
}