Project

General

Profile

1 2094 jones
/**
 *  '$RCSfile$'
 *  Copyright: 2004 University of New Mexico and the
 *                  Regents of the University of California
 *
 *   '$Author$'
 *     '$Date$'
 * '$Revision$'
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
24
25
package edu.ucsb.nceas.metacat.harvesterClient;
26
27
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PrintStream;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import edu.ucsb.nceas.metacat.client.InsufficientKarmaException;
import edu.ucsb.nceas.metacat.client.Metacat;
import edu.ucsb.nceas.metacat.client.MetacatException;
import edu.ucsb.nceas.metacat.client.MetacatInaccessibleException;
import edu.ucsb.nceas.utilities.IOUtil;
44
45
46
/**
47
 * HarvestDocument manages operations and data for a single document to be
48
 * harvested.
49
 *
50
 * @author  costa
51
 */
52
public class HarvestDocument {
53 2036 costa
54
55
  private String docid;                      // scope + identifier
56
  private String docidFull;                  // scope + identifier + revision
57 2031 costa
  String documentType;
58
  String documentURL;
59 2022 costa
  private Harvester harvester;
60
  private HarvestSiteSchedule harvestSiteSchedule;
61 2031 costa
  int identifier;
62
  int revision;
63
  String scope;
64 2155 costa
65
  /* These booleans keep track of status information. They are used when
66
   * generating email reports.
67
   */
68
  boolean accessError = false;
69
  boolean inserted = false;
70
  boolean metacatHasIt = false;
71
  boolean updated = false;
72
  boolean uploadError = false;
73 2022 costa
74
75
  /**
76
   * Creates a new instance of HarvestDocument. Initialized with the data
77
   * that was read from a single <document> element in site document list.
78
   *
79
   * @param harvester            the parent Harvester object
80
   * @param harvestSiteSchedule  the parent HarvestSiteSchedule object
81
   * @param scope                the value of the <scope> element
82
   * @param identifier           the value of the <identifier> element
83
   * @param revision             the value of the <revision> element
84
   * @param documentType         the value of the <documentType> element
85
   * @param documentURL          the value of the <documentURL> element
86
   */
87 2139 costa
  public HarvestDocument (
88 2022 costa
                          Harvester harvester,
89
                          HarvestSiteSchedule harvestSiteSchedule,
90
                          String scope,
91
                          int identifier,
92
                          int revision,
93
                          String documentType,
94
                          String documentURL
95
                        ) {
96
    this.harvester = harvester;
97
    this.harvestSiteSchedule = harvestSiteSchedule;
98
    this.documentType = documentType;
99
    this.documentURL = documentURL;
100
    this.scope = scope;
101
    this.identifier = identifier;
102
    this.revision = revision;
103
104 2036 costa
    this.docid = scope + "." + identifier;
105
    this.docidFull = this.docid + "." + revision;
106 2022 costa
  }
107
108
109
  /**
110
   * Retrieve the document from the site using its <documentURL> value.
111
   *
112
   * @return   A StringReader containing the document string.
113
   */
114 2139 costa
  public StringReader getSiteDocument() {
115 2022 costa
    String documentString;
116
    InputStream inputStream;
117
    InputStreamReader inputStreamReader;
118
    StringReader stringReader = null;
119
    URL url;
120
121
    try {
122
      url = new URL(documentURL);
123
      inputStream = url.openStream();
124
      inputStreamReader = new InputStreamReader(inputStream);
125
      documentString = IOUtil.getAsString(inputStreamReader, true);
126
      stringReader = new StringReader(documentString);
127 2036 costa
      harvester.addLogEntry(0,
128
                            "Retrieved: " + documentURL,
129 4175 daigle
                            "harvester.GetDocSuccess",
130 2036 costa
                            harvestSiteSchedule.siteScheduleID,
131
                            null,
132
                            "");
133 2022 costa
    }
134
    catch (MalformedURLException e) {
135 2155 costa
      accessError = true;
136 4175 daigle
      harvester.addLogEntry(1, "MalformedURLException", "harvester.GetDocError",
137 2031 costa
                            harvestSiteSchedule.siteScheduleID, this,
138
                            "MalformedURLException: " + e.getMessage());
139 2022 costa
    }
140
    catch (IOException e) {
141 2155 costa
      accessError = true;
142 4175 daigle
      harvester.addLogEntry(1, "IOException", "harvester.GetDocError",
143 2031 costa
                            harvestSiteSchedule.siteScheduleID, this,
144
                            "IOException: " + e.getMessage());
145 2022 costa
    }
146
147
    return stringReader;
148
  }
149
150
151
  /**
152
   * Harvest the document from the site. Unless Metacat already has the
153
   * document, retrieve the document from the site and put (insert or
154
   * update) it to Metacat. If Metacat already has the document, determine
155
   * the highest revision stored in Metacat so that this can be reported
156
   * back to the user.
157
   */
158
  public void harvestDocument() {
159
    int highestRevision;
160 2036 costa
    boolean insert = false;
161 2022 costa
    String metacatReturnString;
162
    StringReader stringReader;
163 2036 costa
    boolean update = false;
164 2022 costa
165
    /* If metacat already has this document, determine the highest revision in
166
     * metacat and report it to the user; else, insert or delete the document
167
     * into metacat.
168
     */
169 2036 costa
    highestRevision = metacatHighestRevision();
170
171
    if (highestRevision == -1) {
172
      insert = true;
173
    }
174
    else if (revision > highestRevision) {
175
      update = true;
176
    }
177
    else {
178 2155 costa
      metacatHasIt = true;
179 2031 costa
      harvester.addLogEntry(0,
180 2036 costa
                            "Attempting to update " + docid + " to revision " +
181
                            revision + ". Metacat has document revision " +
182
                            highestRevision + ".",
183 4175 daigle
                            "harvester.MetacatHasDoc",
184 2036 costa
                            harvestSiteSchedule.siteScheduleID,
185
                            null,
186
                            "");
187 2022 costa
    }
188 2036 costa
189
    if (insert || update) {
190 2022 costa
      stringReader = getSiteDocument();
191
      if (stringReader != null) {
192 2036 costa
        if (validateDocument()) {
193
          putMetacatDocument(insert, update, stringReader);
194 2022 costa
        }
195
      }
196
    }
197
  }
198 2031 costa
199 2022 costa
200
  /**
201 5169 costa
   * Boolean to determine whether the string returned by the Metacat client for
202
   * an insert or update operation indicates that the operation succeeded.
203
   *
204
   * @param metacatReturnString     The string returned by the Metacat client.
205
   * @return true if the return string indicates success, else false
206
   */
207
  private boolean isMetacatSuccessString(String metacatReturnString) {
208
    boolean isMetacatSuccessString = false;
209
210
    if ((metacatReturnString != null) &&
211
        (metacatReturnString.contains("<success>"))
212
       ) {
213
      isMetacatSuccessString = true;
214
    }
215
216
    return isMetacatSuccessString;
217
  }
218
219
220
  /**
221 2031 costa
   * Logs a metacat document error to the harvest detail log.
222
   *
223
   * @param insert               true if insert operation, false is update
224
   * @param metacatReturnString  string returned from the insert or update
225
   * @param exceptionName        name of the exception class
226
   * @param e                    the exception object
227
   */
228
  private void logMetacatError (boolean insert,
229
                                String metacatReturnString,
230
                                String exceptionName,
231
                                Exception e
232
                               ) {
233 2155 costa
    uploadError = true;
234
235 2031 costa
    if (insert) {
236 2036 costa
      harvester.addLogEntry(1,
237
                            metacatReturnString,
238 4175 daigle
                            "harvester.InsertDocError",
239 2036 costa
                            harvestSiteSchedule.siteScheduleID,
240
                            this,
241
                            exceptionName + ": " + e.getMessage());
242 2031 costa
    }
243
    else {
244 2036 costa
      harvester.addLogEntry(1,
245
                            metacatReturnString,
246 4175 daigle
                            "harvester.UpdateDocError",
247 2036 costa
                            harvestSiteSchedule.siteScheduleID,
248
                            this,
249
                            exceptionName + ": " + e.getMessage());
250 2031 costa
    }
251
  }
252
253
254
  /**
255 2022 costa
   * Determines the highest revision that Metacat has for this document.
256
   *
257 2036 costa
   * @return  int representing the highest revision for this document in
258
   *          Metacat. Returns -1 if Metacat does not currently hold the
259
   *          document.
260 2022 costa
   */
261 2139 costa
  public int metacatHighestRevision() {
262
    Connection conn = harvester.getConnection();
263 2036 costa
    int         highestRevision = -1;
264
		String query = "SELECT REV FROM XML_DOCUMENTS WHERE DOCID = " +
265
                   "'" + docid + "'";
266
		Statement stmt;
267 2022 costa
268 2036 costa
		try {
269 2139 costa
			stmt = conn.createStatement();
270 2036 costa
			ResultSet rs = stmt.executeQuery(query);
271
272
			while (rs.next()) {
273
				highestRevision = rs.getInt("REV");
274
			}
275
276
			stmt.close();
277
		}
278
    catch(SQLException e) {
279
			System.out.println("SQLException: " + e.getMessage());
280 2031 costa
    }
281 2036 costa
282
    return highestRevision;
283 2022 costa
  }
284
285
286
  /**
287
   * Print the data fields and values in this HarvestDocument object.
288 2086 costa
   *
289
   * @param out   the PrintStream to write to
290 2022 costa
   */
291 2139 costa
  public void printOutput(PrintStream out) {
292 2086 costa
    out.println("* scope:                " + scope);
293
    out.println("* identifier:           " + identifier);
294
    out.println("* revision:             " + revision);
295
    out.println("* documentType:         " + documentType);
296
    out.println("* documentURL:          " + documentURL);
297 2022 costa
  }
298
299
300
  /**
301 2155 costa
   * Print the document URL following by its scope.identifier.revision.
302
   * Used for report generation.
303
   *
304
   * @param out   the PrintStream to write to
305
   */
306
  public void prettyPrint(PrintStream out) {
307
    out.println("*   " + docidFull + "  (" + documentURL + ")");
308
  }
309
310
311
  /**
312 2022 costa
   * Insert or update this document to Metacat. If revision equals 1, do an
313
   * insert; otherwise, do an update.
314 2036 costa
   *
315
   * @param insert       true if this is an insert operation
316
   * @param update       true if this is an update operation
317
   * @param stringReader the StringReader object holding the document text
318 2022 costa
   */
319 2036 costa
  private void putMetacatDocument(boolean insert,
320
                                  boolean update,
321
                                  StringReader stringReader) {
322 2022 costa
    Metacat metacat = harvester.metacat;
323
    String metacatReturnString = "";
324 2031 costa
325 2022 costa
    if (harvester.connectToMetacat()) {
326
      try {
327 5169 costa
        String harvestOperationCode = "";
328
329 2031 costa
        if (insert) {
330 5169 costa
          harvestOperationCode = "harvester.InsertDocSuccess";
331 2036 costa
          metacatReturnString = metacat.insert(docidFull, stringReader, null);
332 5169 costa
          this.inserted = true;
333 2022 costa
        }
334 2036 costa
        else if (update) {
335 5169 costa
          harvestOperationCode = "harvester.UpdateDocSuccess";
336 2036 costa
          metacatReturnString = metacat.update(docidFull, stringReader, null);
337 5169 costa
          this.updated = true;
338 2022 costa
        }
339 5169 costa
340
        if (isMetacatSuccessString(metacatReturnString)) {
341
          String message = docidFull + " : " + metacatReturnString;
342
          harvester.addLogEntry(0, message, harvestOperationCode,
343
                                harvestSiteSchedule.siteScheduleID, null, "");
344
        }
345
        else {
346
          this.inserted = false;
347
          this.updated = false;
348
          final String exceptionName = "UnreportedMetacatException";
349
          final String exceptionMessage =
350
                    "Metacat insert/update failed without reporting an exception";
351
          Exception e = new Exception(exceptionMessage);
352
          logMetacatError(insert, metacatReturnString, exceptionName, e);
353
        }
354 2022 costa
      }
355
      catch (MetacatInaccessibleException e) {
356 2031 costa
        logMetacatError(insert, metacatReturnString,
357
                        "MetacatInaccessibleException", e);
358 2022 costa
      }
359
      catch (InsufficientKarmaException e) {
360 2031 costa
        logMetacatError(insert, metacatReturnString,
361
                        "InsufficientKarmaException", e);
362 2022 costa
      }
363
      catch (MetacatException e) {
364 2031 costa
        logMetacatError(insert, metacatReturnString, "MetacatException", e);
365 2022 costa
      }
366
      catch (IOException e) {
367 2031 costa
        logMetacatError(insert, metacatReturnString, "IOException", e);
368 2022 costa
      }
369 5169 costa
      catch (Exception e) {
370
        logMetacatError(insert, metacatReturnString, "Exception", e);
371
      }
372 2022 costa
    }
373
  }
374 2036 costa
375
376
  /**
377
   * Validate the document to determine whether it is valid EML prior to
378
   * inserting or updating it to Metacat. This is QA/QC measure.
379
   * Not yet implemented.
380
   *
381
   * @return  true if the document is valid EML, otherwise false
382
   */
383
  private boolean validateDocument () {
384
    boolean success = true;
385
386
    /*if (success) {
387
      harvester.addLogEntry(0,
388
                            "Validated: " + documentURL,
389 4175 daigle
                            "harvester.ValidateDocSuccess",
390 2036 costa
                            harvestSiteSchedule.siteScheduleID,
391
                            null,
392
                            "");
393
    }
394
    else {
395 4175 daigle
      harvester.addLogEntry(1, "Error validating document", "harvester.ValidateDocError",
396 2036 costa
                            harvestSiteSchedule.siteScheduleID, this, "");
397
    }*/
398
399
    return success;
400
  }
401
402 2022 costa
}