/*
 * HarvestDocument.java
 *
 * Created on January 14, 2004, 4:37 PM
 */

package edu.ucsb.nceas.metacat.harvesterClient;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import edu.ucsb.nceas.metacat.client.InsufficientKarmaException;
import edu.ucsb.nceas.metacat.client.Metacat;
import edu.ucsb.nceas.metacat.client.MetacatException;
import edu.ucsb.nceas.metacat.client.MetacatInaccessibleException;
import edu.ucsb.nceas.utilities.IOUtil;

/**
 * HarvestDocument manages operations and data for a single document to be
 * harvested.
 *
 * @author  costa
 */
public class HarvestDocument {
34 2036 costa
35
36
  private String docid;                      // scope + identifier
37
  private String docidFull;                  // scope + identifier + revision
38 2031 costa
  String documentType;
39
  String documentURL;
40 2022 costa
  private Harvester harvester;
41
  private HarvestSiteSchedule harvestSiteSchedule;
42 2031 costa
  int identifier;
43
  int revision;
44
  String scope;
45 2022 costa
46
47
  /**
48
   * Creates a new instance of HarvestDocument. Initialized with the data
49
   * that was read from a single <document> element in site document list.
50
   *
51
   * @param harvester            the parent Harvester object
52
   * @param harvestSiteSchedule  the parent HarvestSiteSchedule object
53
   * @param scope                the value of the <scope> element
54
   * @param identifier           the value of the <identifier> element
55
   * @param revision             the value of the <revision> element
56
   * @param documentType         the value of the <documentType> element
57
   * @param documentURL          the value of the <documentURL> element
58
   */
59
  public HarvestDocument(
60
                          Harvester harvester,
61
                          HarvestSiteSchedule harvestSiteSchedule,
62
                          String scope,
63
                          int identifier,
64
                          int revision,
65
                          String documentType,
66
                          String documentURL
67
                        ) {
68
    this.harvester = harvester;
69
    this.harvestSiteSchedule = harvestSiteSchedule;
70
    this.documentType = documentType;
71
    this.documentURL = documentURL;
72
    this.scope = scope;
73
    this.identifier = identifier;
74
    this.revision = revision;
75
76 2036 costa
    this.docid = scope + "." + identifier;
77
    this.docidFull = this.docid + "." + revision;
78 2022 costa
  }
79
80
81
  /**
82
   * Retrieve the document from the site using its <documentURL> value.
83
   *
84
   * @return   A StringReader containing the document string.
85
   */
86
  private StringReader getSiteDocument() {
87
    String documentString;
88
    InputStream inputStream;
89
    InputStreamReader inputStreamReader;
90
    StringReader stringReader = null;
91
    URL url;
92
93
    try {
94
      url = new URL(documentURL);
95
      inputStream = url.openStream();
96
      inputStreamReader = new InputStreamReader(inputStream);
97
      documentString = IOUtil.getAsString(inputStreamReader, true);
98
      stringReader = new StringReader(documentString);
99 2036 costa
      harvester.addLogEntry(0,
100
                            "Retrieved: " + documentURL,
101
                            "GetDocSuccess",
102
                            harvestSiteSchedule.siteScheduleID,
103
                            null,
104
                            "");
105 2022 costa
    }
106
    catch (MalformedURLException e) {
107 2031 costa
      harvester.addLogEntry(1, "MalformedURLException", "GetDocError",
108
                            harvestSiteSchedule.siteScheduleID, this,
109
                            "MalformedURLException: " + e.getMessage());
110 2022 costa
    }
111
    catch (IOException e) {
112 2031 costa
      harvester.addLogEntry(1, "IOException", "GetDocError",
113
                            harvestSiteSchedule.siteScheduleID, this,
114
                            "IOException: " + e.getMessage());
115 2022 costa
    }
116
117
    return stringReader;
118
  }
119
120
121
  /**
122
   * Harvest the document from the site. Unless Metacat already has the
123
   * document, retrieve the document from the site and put (insert or
124
   * update) it to Metacat. If Metacat already has the document, determine
125
   * the highest revision stored in Metacat so that this can be reported
126
   * back to the user.
127
   */
128
  public void harvestDocument() {
129
    int highestRevision;
130 2036 costa
    boolean insert = false;
131 2022 costa
    String metacatReturnString;
132
    StringReader stringReader;
133 2036 costa
    boolean update = false;
134 2022 costa
135
    /* If metacat already has this document, determine the highest revision in
136
     * metacat and report it to the user; else, insert or delete the document
137
     * into metacat.
138
     */
139 2036 costa
    highestRevision = metacatHighestRevision();
140
141
    if (highestRevision == -1) {
142
      insert = true;
143
    }
144
    else if (revision > highestRevision) {
145
      update = true;
146
    }
147
    else {
148 2031 costa
      harvester.addLogEntry(0,
149 2036 costa
                            "Attempting to update " + docid + " to revision " +
150
                            revision + ". Metacat has document revision " +
151
                            highestRevision + ".",
152 2031 costa
                            "MetacatHasDoc",
153 2036 costa
                            harvestSiteSchedule.siteScheduleID,
154
                            null,
155
                            "");
156 2022 costa
    }
157 2036 costa
158
    if (insert || update) {
159 2022 costa
      stringReader = getSiteDocument();
160
      if (stringReader != null) {
161 2036 costa
        if (validateDocument()) {
162
          putMetacatDocument(insert, update, stringReader);
163 2022 costa
        }
164
      }
165
    }
166
  }
167 2031 costa
168 2022 costa
169
  /**
170 2031 costa
   * Logs a metacat document error to the harvest detail log.
171
   *
172
   * @param insert               true if insert operation, false is update
173
   * @param metacatReturnString  string returned from the insert or update
174
   * @param exceptionName        name of the exception class
175
   * @param e                    the exception object
176
   */
177
  private void logMetacatError (boolean insert,
178
                                String metacatReturnString,
179
                                String exceptionName,
180
                                Exception e
181
                               ) {
182
    if (insert) {
183 2036 costa
      harvester.addLogEntry(1,
184
                            metacatReturnString,
185
                            "InsertDocError",
186
                            harvestSiteSchedule.siteScheduleID,
187
                            this,
188
                            exceptionName + ": " + e.getMessage());
189 2031 costa
    }
190
    else {
191 2036 costa
      harvester.addLogEntry(1,
192
                            metacatReturnString,
193
                            "UpdateDocError",
194
                            harvestSiteSchedule.siteScheduleID,
195
                            this,
196
                            exceptionName + ": " + e.getMessage());
197 2031 costa
    }
198
  }
199
200
201
  /**
202 2022 costa
   * Determines the highest revision that Metacat has for this document.
203
   *
204 2036 costa
   * @return  int representing the highest revision for this document in
205
   *          Metacat. Returns -1 if Metacat does not currently hold the
206
   *          document.
207 2022 costa
   */
208
  private int metacatHighestRevision() {
209 2036 costa
    int         highestRevision = -1;
210
		String query = "SELECT REV FROM XML_DOCUMENTS WHERE DOCID = " +
211
                   "'" + docid + "'";
212
		Statement stmt;
213 2022 costa
214 2036 costa
		try {
215
			stmt = harvester.conn.createStatement();
216
			ResultSet rs = stmt.executeQuery(query);
217
218
			while (rs.next()) {
219
				highestRevision = rs.getInt("REV");
220
			}
221
222
			stmt.close();
223
		}
224
    catch(SQLException e) {
225
			System.out.println("SQLException: " + e.getMessage());
226 2031 costa
    }
227 2036 costa
228
    return highestRevision;
229 2022 costa
  }
230
231
232
  /**
233
   * Print the data fields and values in this HarvestDocument object.
234 2086 costa
   *
235
   * @param out   the PrintStream to write to
236 2022 costa
   */
237 2086 costa
  void printOutput(PrintStream out) {
238
    out.println("* scope:                " + scope);
239
    out.println("* identifier:           " + identifier);
240
    out.println("* revision:             " + revision);
241
    out.println("* documentType:         " + documentType);
242
    out.println("* documentURL:          " + documentURL);
243 2022 costa
  }
244
245
246
  /**
247
   * Insert or update this document to Metacat. If revision equals 1, do an
248
   * insert; otherwise, do an update.
249 2036 costa
   *
250
   * @param insert       true if this is an insert operation
251
   * @param update       true if this is an update operation
252
   * @param stringReader the StringReader object holding the document text
253 2022 costa
   */
254 2036 costa
  private void putMetacatDocument(boolean insert,
255
                                  boolean update,
256
                                  StringReader stringReader) {
257 2022 costa
    Metacat metacat = harvester.metacat;
258
    String metacatReturnString = "";
259 2031 costa
260 2022 costa
    if (harvester.connectToMetacat()) {
261
      try {
262 2031 costa
        if (insert) {
263 2036 costa
          metacatReturnString = metacat.insert(docidFull, stringReader, null);
264
          harvester.addLogEntry(0, docidFull + " : " + metacatReturnString,
265 2031 costa
                                "InsertDocSuccess",
266
                                harvestSiteSchedule.siteScheduleID,
267
                                null, "");
268 2022 costa
        }
269 2036 costa
        else if (update) {
270
          metacatReturnString = metacat.update(docidFull, stringReader, null);
271
          harvester.addLogEntry(0, docidFull + " : " + metacatReturnString,
272 2031 costa
                                "UpdateDocSuccess",
273
                                harvestSiteSchedule.siteScheduleID,
274
                                null, "");
275 2022 costa
        }
276
      }
277
      catch (MetacatInaccessibleException e) {
278 2031 costa
        logMetacatError(insert, metacatReturnString,
279
                        "MetacatInaccessibleException", e);
280 2022 costa
      }
281
      catch (InsufficientKarmaException e) {
282 2031 costa
        logMetacatError(insert, metacatReturnString,
283
                        "InsufficientKarmaException", e);
284 2022 costa
      }
285
      catch (MetacatException e) {
286 2031 costa
        logMetacatError(insert, metacatReturnString, "MetacatException", e);
287 2022 costa
      }
288
      catch (IOException e) {
289 2031 costa
        logMetacatError(insert, metacatReturnString, "IOException", e);
290 2022 costa
      }
291
    }
292
  }
293 2036 costa
294
295
  /**
296
   * Validate the document to determine whether it is valid EML prior to
297
   * inserting or updating it to Metacat. This is QA/QC measure.
298
   * Not yet implemented.
299
   *
300
   * @return  true if the document is valid EML, otherwise false
301
   */
302
  private boolean validateDocument () {
303
    boolean success = true;
304
305
    /*if (success) {
306
      harvester.addLogEntry(0,
307
                            "Validated: " + documentURL,
308
                            "ValidateDocSuccess",
309
                            harvestSiteSchedule.siteScheduleID,
310
                            null,
311
                            "");
312
    }
313
    else {
314
      harvester.addLogEntry(1, "Error validating document", "ValidateDocError",
315
                            harvestSiteSchedule.siteScheduleID, this, "");
316
    }*/
317
318
    return success;
319
  }
320
321 2022 costa
}