Project

General

Profile

/**
 *  '$RCSfile$'
 *  Copyright: 2004 University of New Mexico and the
 *                  Regents of the University of California
 *
 *   '$Author$'
 *     '$Date$'
 * '$Revision$'
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
24
25
package edu.ucsb.nceas.metacat.harvesterClient;
26
27
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PrintStream;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import edu.ucsb.nceas.metacat.client.InsufficientKarmaException;
import edu.ucsb.nceas.metacat.client.Metacat;
import edu.ucsb.nceas.metacat.client.MetacatException;
import edu.ucsb.nceas.metacat.client.MetacatInaccessibleException;
import edu.ucsb.nceas.utilities.IOUtil;
44
45
46
/**
47
 * HarvestDocument manages operations and data for a single document to be
48
 * harvested.
49
 *
50
 * @author  costa
51
 */
52
public class HarvestDocument {
53 2036 costa
54
55
  private String docid;                      // scope + identifier
56
  private String docidFull;                  // scope + identifier + revision
57 2031 costa
  String documentType;
58
  String documentURL;
59 2022 costa
  private Harvester harvester;
60
  private HarvestSiteSchedule harvestSiteSchedule;
61 2031 costa
  int identifier;
62
  int revision;
63
  String scope;
64 2155 costa
65
  /* These booleans keep track of status information. They are used when
66
   * generating email reports.
67
   */
68
  boolean accessError = false;
69
  boolean inserted = false;
70
  boolean metacatHasIt = false;
71
  boolean updated = false;
72
  boolean uploadError = false;
73 2022 costa
74
75
  /**
76
   * Creates a new instance of HarvestDocument. Initialized with the data
77
   * that was read from a single <document> element in site document list.
78
   *
79
   * @param harvester            the parent Harvester object
80
   * @param harvestSiteSchedule  the parent HarvestSiteSchedule object
81
   * @param scope                the value of the <scope> element
82
   * @param identifier           the value of the <identifier> element
83
   * @param revision             the value of the <revision> element
84
   * @param documentType         the value of the <documentType> element
85
   * @param documentURL          the value of the <documentURL> element
86
   */
87 2139 costa
  public HarvestDocument (
88 2022 costa
                          Harvester harvester,
89
                          HarvestSiteSchedule harvestSiteSchedule,
90
                          String scope,
91
                          int identifier,
92
                          int revision,
93
                          String documentType,
94
                          String documentURL
95
                        ) {
96
    this.harvester = harvester;
97
    this.harvestSiteSchedule = harvestSiteSchedule;
98
    this.documentType = documentType;
99
    this.documentURL = documentURL;
100
    this.scope = scope;
101
    this.identifier = identifier;
102
    this.revision = revision;
103
104 2036 costa
    this.docid = scope + "." + identifier;
105
    this.docidFull = this.docid + "." + revision;
106 2022 costa
  }
107
108
109
  /**
110
   * Retrieve the document from the site using its <documentURL> value.
111
   *
112
   * @return   A StringReader containing the document string.
113
   */
114 2139 costa
  public StringReader getSiteDocument() {
115 2022 costa
    String documentString;
116
    InputStream inputStream;
117
    InputStreamReader inputStreamReader;
118
    StringReader stringReader = null;
119
    URL url;
120
121
    try {
122
      url = new URL(documentURL);
123
      inputStream = url.openStream();
124
      inputStreamReader = new InputStreamReader(inputStream);
125
      documentString = IOUtil.getAsString(inputStreamReader, true);
126
      stringReader = new StringReader(documentString);
127 2036 costa
      harvester.addLogEntry(0,
128
                            "Retrieved: " + documentURL,
129 4175 daigle
                            "harvester.GetDocSuccess",
130 2036 costa
                            harvestSiteSchedule.siteScheduleID,
131
                            null,
132
                            "");
133 2022 costa
    }
134
    catch (MalformedURLException e) {
135 2155 costa
      accessError = true;
136 4175 daigle
      harvester.addLogEntry(1, "MalformedURLException", "harvester.GetDocError",
137 2031 costa
                            harvestSiteSchedule.siteScheduleID, this,
138
                            "MalformedURLException: " + e.getMessage());
139 2022 costa
    }
140
    catch (IOException e) {
141 2155 costa
      accessError = true;
142 4175 daigle
      harvester.addLogEntry(1, "IOException", "harvester.GetDocError",
143 2031 costa
                            harvestSiteSchedule.siteScheduleID, this,
144
                            "IOException: " + e.getMessage());
145 2022 costa
    }
146
147
    return stringReader;
148
  }
149
150
151
  /**
152
   * Harvest the document from the site. Unless Metacat already has the
153
   * document, retrieve the document from the site and put (insert or
154
   * update) it to Metacat. If Metacat already has the document, determine
155
   * the highest revision stored in Metacat so that this can be reported
156
   * back to the user.
157
   */
158
  public void harvestDocument() {
159
    int highestRevision;
160 2036 costa
    boolean insert = false;
161 2022 costa
    String metacatReturnString;
162
    StringReader stringReader;
163 2036 costa
    boolean update = false;
164 2022 costa
165
    /* If metacat already has this document, determine the highest revision in
166
     * metacat and report it to the user; else, insert or delete the document
167
     * into metacat.
168
     */
169 2036 costa
    highestRevision = metacatHighestRevision();
170
171
    if (highestRevision == -1) {
172
      insert = true;
173
    }
174
    else if (revision > highestRevision) {
175
      update = true;
176
    }
177
    else {
178 2155 costa
      metacatHasIt = true;
179 2031 costa
      harvester.addLogEntry(0,
180 2036 costa
                            "Attempting to update " + docid + " to revision " +
181
                            revision + ". Metacat has document revision " +
182
                            highestRevision + ".",
183 4175 daigle
                            "harvester.MetacatHasDoc",
184 2036 costa
                            harvestSiteSchedule.siteScheduleID,
185
                            null,
186
                            "");
187 2022 costa
    }
188 2036 costa
189
    if (insert || update) {
190 2022 costa
      stringReader = getSiteDocument();
191
      if (stringReader != null) {
192 2036 costa
        if (validateDocument()) {
193
          putMetacatDocument(insert, update, stringReader);
194 2022 costa
        }
195
      }
196
    }
197
  }
198 2031 costa
199 2022 costa
200
  /**
201 2031 costa
   * Logs a metacat document error to the harvest detail log.
202
   *
203
   * @param insert               true if insert operation, false is update
204
   * @param metacatReturnString  string returned from the insert or update
205
   * @param exceptionName        name of the exception class
206
   * @param e                    the exception object
207
   */
208
  private void logMetacatError (boolean insert,
209
                                String metacatReturnString,
210
                                String exceptionName,
211
                                Exception e
212
                               ) {
213 2155 costa
    uploadError = true;
214
215 2031 costa
    if (insert) {
216 2036 costa
      harvester.addLogEntry(1,
217
                            metacatReturnString,
218 4175 daigle
                            "harvester.InsertDocError",
219 2036 costa
                            harvestSiteSchedule.siteScheduleID,
220
                            this,
221
                            exceptionName + ": " + e.getMessage());
222 2031 costa
    }
223
    else {
224 2036 costa
      harvester.addLogEntry(1,
225
                            metacatReturnString,
226 4175 daigle
                            "harvester.UpdateDocError",
227 2036 costa
                            harvestSiteSchedule.siteScheduleID,
228
                            this,
229
                            exceptionName + ": " + e.getMessage());
230 2031 costa
    }
231
  }
232
233
234
  /**
235 2022 costa
   * Determines the highest revision that Metacat has for this document.
236
   *
237 2036 costa
   * @return  int representing the highest revision for this document in
238
   *          Metacat. Returns -1 if Metacat does not currently hold the
239
   *          document.
240 2022 costa
   */
241 2139 costa
  public int metacatHighestRevision() {
242
    Connection conn = harvester.getConnection();
243 2036 costa
    int         highestRevision = -1;
244
		String query = "SELECT REV FROM XML_DOCUMENTS WHERE DOCID = " +
245
                   "'" + docid + "'";
246
		Statement stmt;
247 2022 costa
248 2036 costa
		try {
249 2139 costa
			stmt = conn.createStatement();
250 2036 costa
			ResultSet rs = stmt.executeQuery(query);
251
252
			while (rs.next()) {
253
				highestRevision = rs.getInt("REV");
254
			}
255
256
			stmt.close();
257
		}
258
    catch(SQLException e) {
259
			System.out.println("SQLException: " + e.getMessage());
260 2031 costa
    }
261 2036 costa
262
    return highestRevision;
263 2022 costa
  }
264
265
266
  /**
267
   * Print the data fields and values in this HarvestDocument object.
268 2086 costa
   *
269
   * @param out   the PrintStream to write to
270 2022 costa
   */
271 2139 costa
  public void printOutput(PrintStream out) {
272 2086 costa
    out.println("* scope:                " + scope);
273
    out.println("* identifier:           " + identifier);
274
    out.println("* revision:             " + revision);
275
    out.println("* documentType:         " + documentType);
276
    out.println("* documentURL:          " + documentURL);
277 2022 costa
  }
278
279
280
  /**
281 2155 costa
   * Print the document URL following by its scope.identifier.revision.
282
   * Used for report generation.
283
   *
284
   * @param out   the PrintStream to write to
285
   */
286
  public void prettyPrint(PrintStream out) {
287
    out.println("*   " + docidFull + "  (" + documentURL + ")");
288
  }
289
290
291
  /**
292 2022 costa
   * Insert or update this document to Metacat. If revision equals 1, do an
293
   * insert; otherwise, do an update.
294 2036 costa
   *
295
   * @param insert       true if this is an insert operation
296
   * @param update       true if this is an update operation
297
   * @param stringReader the StringReader object holding the document text
298 2022 costa
   */
299 2036 costa
  private void putMetacatDocument(boolean insert,
300
                                  boolean update,
301
                                  StringReader stringReader) {
302 2022 costa
    Metacat metacat = harvester.metacat;
303
    String metacatReturnString = "";
304 2031 costa
305 2022 costa
    if (harvester.connectToMetacat()) {
306
      try {
307 2031 costa
        if (insert) {
308 2036 costa
          metacatReturnString = metacat.insert(docidFull, stringReader, null);
309 2155 costa
          inserted = true;
310 2036 costa
          harvester.addLogEntry(0, docidFull + " : " + metacatReturnString,
311 4175 daigle
                                "harvester.InsertDocSuccess",
312 2031 costa
                                harvestSiteSchedule.siteScheduleID,
313
                                null, "");
314 2022 costa
        }
315 2036 costa
        else if (update) {
316
          metacatReturnString = metacat.update(docidFull, stringReader, null);
317 2155 costa
          updated = true;
318 2036 costa
          harvester.addLogEntry(0, docidFull + " : " + metacatReturnString,
319 4175 daigle
                                "harvester.UpdateDocSuccess",
320 2031 costa
                                harvestSiteSchedule.siteScheduleID,
321
                                null, "");
322 2022 costa
        }
323
      }
324
      catch (MetacatInaccessibleException e) {
325 2031 costa
        logMetacatError(insert, metacatReturnString,
326
                        "MetacatInaccessibleException", e);
327 2022 costa
      }
328
      catch (InsufficientKarmaException e) {
329 2031 costa
        logMetacatError(insert, metacatReturnString,
330
                        "InsufficientKarmaException", e);
331 2022 costa
      }
332
      catch (MetacatException e) {
333 2031 costa
        logMetacatError(insert, metacatReturnString, "MetacatException", e);
334 2022 costa
      }
335
      catch (IOException e) {
336 2031 costa
        logMetacatError(insert, metacatReturnString, "IOException", e);
337 2022 costa
      }
338
    }
339
  }
340 2036 costa
341
342
  /**
343
   * Validate the document to determine whether it is valid EML prior to
344
   * inserting or updating it to Metacat. This is QA/QC measure.
345
   * Not yet implemented.
346
   *
347
   * @return  true if the document is valid EML, otherwise false
348
   */
349
  private boolean validateDocument () {
350
    boolean success = true;
351
352
    /*if (success) {
353
      harvester.addLogEntry(0,
354
                            "Validated: " + documentURL,
355 4175 daigle
                            "harvester.ValidateDocSuccess",
356 2036 costa
                            harvestSiteSchedule.siteScheduleID,
357
                            null,
358
                            "");
359
    }
360
    else {
361 4175 daigle
      harvester.addLogEntry(1, "Error validating document", "harvester.ValidateDocError",
362 2036 costa
                            harvestSiteSchedule.siteScheduleID, this, "");
363
    }*/
364
365
    return success;
366
  }
367
368 2022 costa
}