Project

General

Profile

/**
 *  '$RCSfile$'
 *  Copyright: 2009 University of New Mexico and the 
 *                  Regents of the University of California
 *
 *   '$Author: costa $'
 *     '$Date: 2009-07-27 17:47:44 -0400 (Mon, 27 Jul 2009) $'
 * '$Revision: 4999 $'
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 * 
 * Additional Copyright 2006 OCLC, Online Computer Library Center
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
37

    
38
package edu.ucsb.nceas.metacat.oaipmh.harvester;
39

    
40
import java.io.FileNotFoundException;
41
import java.io.IOException;
42
import java.io.InputStream;
43
import java.io.StringWriter;
44
import java.net.HttpURLConnection;
45
import java.net.URL;
46
import java.util.Date;
47
import java.util.HashMap;
48
import java.util.StringTokenizer;
49
import java.util.zip.GZIPInputStream;
50
import java.util.zip.InflaterInputStream;
51
import java.util.zip.ZipInputStream;
52

    
53
import javax.xml.parsers.DocumentBuilder;
54
import javax.xml.parsers.DocumentBuilderFactory;
55
import javax.xml.parsers.ParserConfigurationException;
56
import javax.xml.transform.OutputKeys;
57
import javax.xml.transform.Result;
58
import javax.xml.transform.Source;
59
import javax.xml.transform.Transformer;
60
import javax.xml.transform.TransformerException;
61
import javax.xml.transform.TransformerFactory;
62
import javax.xml.transform.dom.DOMSource;
63
import javax.xml.transform.stream.StreamResult;
64

    
65
import org.apache.log4j.Logger;
66
import org.apache.xpath.XPathAPI;
67
import org.w3c.dom.DOMImplementation;
68
import org.w3c.dom.Document;
69
import org.w3c.dom.Element;
70
import org.w3c.dom.NodeList;
71
import org.xml.sax.InputSource;
72
import org.xml.sax.SAXException;
73

    
74

    
75
/**
76
 * HarvesterVerb is the parent class for each of the OAI verbs.
77
 * 
78
 * @author Duane Costa, University of New Mexico, LTER Network Office
79
 * @author Jeffrey A. Young, OCLC Online Computer Library Center
80
 */
81
public abstract class HarvesterVerb {
82
  
83
  /* Class variables */
84

    
85
  private static Logger logger = Logger.getLogger(HarvesterVerb.class);
86
  
87
  public static final String SCHEMA_LOCATION_V2_0 = 
88
    "http://www.openarchives.org/OAI/2.0/ " +
89
    "http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd";
90
  
91
  private static HashMap<Thread, DocumentBuilder> builderMap = 
92
                                         new HashMap<Thread, DocumentBuilder>();
93
  private static DocumentBuilderFactory documentBuilderFactory = null;
94
  private static Element namespaceElement = null;
95
  private static TransformerFactory transformerFactory = 
96
                                               TransformerFactory.newInstance();
97

    
98
  
99
  /* Instance variables */
100
  
101
  private Document document = null;
102
  private String schemaLocation = null;
103
  private String requestURL = null;
104
  
105
  
106
  /* Constructors */
107
  
108
  /**
109
   * Mock object creator (for unit testing purposes)
110
   */
111
  public HarvesterVerb() {
112
  }
113

    
114

    
115
  /**
116
   * Performs the OAI request
117
   * 
118
   * @param requestURL
119
   * @throws IOException
120
   * @throws ParserConfigurationException
121
   * @throws SAXException
122
   * @throws TransformerException
123
   */
124
  public HarvesterVerb(String requestURL) throws IOException,
125
      ParserConfigurationException, SAXException, TransformerException {
126
    this.requestURL = requestURL;
127
  }
128

    
129

    
130
  /* Static initialization code */
131
  
132
  static {
133
    try {
134
      /* Load DOM Document */
135
      documentBuilderFactory = DocumentBuilderFactory.newInstance();
136
      documentBuilderFactory.setNamespaceAware(true);
137
      Thread thread = Thread.currentThread();
138
      DocumentBuilder builder = documentBuilderFactory.newDocumentBuilder();
139
      builderMap.put(thread, builder);
140

    
141
      DOMImplementation impl = builder.getDOMImplementation();
142
      
143
      Document namespaceHolder = impl.createDocument(
144
                          "http://www.oclc.org/research/software/oai/harvester",
145
                          "harvester:namespaceHolder", 
146
                          null
147
                                                    );
148
      
149
      namespaceElement = namespaceHolder.getDocumentElement();
150
      
151
      namespaceElement.setAttributeNS(
152
                          "http://www.w3.org/2000/xmlns/",
153
                          "xmlns:harvester",
154
                          "http://www.oclc.org/research/software/oai/harvester"
155
                                     );
156
      
157
      namespaceElement.setAttributeNS(
158
                          "http://www.w3.org/2000/xmlns/",
159
                          "xmlns:xsi", 
160
                          "http://www.w3.org/2001/XMLSchema-instance"
161
                                     );
162
      
163
      namespaceElement.setAttributeNS(
164
                          "http://www.w3.org/2000/xmlns/",
165
                          "xmlns:oai20", 
166
                          "http://www.openarchives.org/OAI/2.0/"
167
                                     );
168
    } 
169
    catch (Exception e) {
170
      e.printStackTrace();
171
    }
172
    
173
  }
174

    
175
  
176
  /* Instance methods */
177

    
178
  /* Primary OAI namespaces */
179

    
180
  /**
181
   * Get the OAI response as a DOM object
182
   * 
183
   * @return the DOM for the OAI response
184
   */
185
  public Document getDocument() {
186
    return document;
187
  }
188

    
189

    
190
  /**
191
   * Get the OAI errors
192
   * 
193
   * @return a NodeList of /oai:OAI-PMH/oai:error elements
194
   * @throws TransformerException
195
   */
196
  public NodeList getErrors() throws TransformerException {
197
    if (SCHEMA_LOCATION_V2_0.equals(getSchemaLocation())) {
198
      return getNodeList("/oai20:OAI-PMH/oai20:error");
199
    } 
200
    else {
201
      return null;
202
    }
203
  }
204

    
205

    
206
  /**
207
   * Get a NodeList containing the nodes in the response DOM for the specified
208
   * xpath
209
   * 
210
   * @param xpath
211
   * @return the NodeList for the xpath into the response DOM
212
   * @throws TransformerException
213
   */
214
  public NodeList getNodeList(String xpath) throws TransformerException {
215
    Document document = getDocument();
216
    return XPathAPI.selectNodeList(document, xpath, namespaceElement);
217
  }
218

    
219

    
220
  /**
221
   * Get the OAI request URL for this response
222
   * 
223
   * @return the OAI request URL as a String
224
   */
225
  public String getRequestURL() {
226
    return requestURL;
227
  }
228

    
229

    
230
  /**
231
   * Get the xsi:schemaLocation for the OAI response
232
   * 
233
   * @return the xsi:schemaLocation value
234
   */
235
  public String getSchemaLocation() {
236
    return schemaLocation;
237
  }
238

    
239

    
240
  /**
241
   * Get the String value for the given XPath location in the response DOM
242
   * 
243
   * @param xpath
244
   * @return a String containing the value of the XPath location.
245
   * @throws TransformerException
246
   */
247
  public String getSingleString(String xpath) throws TransformerException {
248
    Document document = getDocument();
249
    org.apache.xpath.objects.XObject xobject;
250
    
251
    xobject = XPathAPI.eval(document, xpath, namespaceElement);
252
    String str = xobject.str();
253
    
254
    return str;
255
  }
256

    
257

    
258
  /**
259
   * Preforms the OAI request for this OAI-PMH verb
260
   * 
261
   * @throws IOException
262
   * @throws ParserConfigurationException
263
   * @throws SAXException
264
   * @throws TransformerException
265
   */
266
  public void runVerb() 
267
          throws IOException, ParserConfigurationException, 
268
                 SAXException, TransformerException {
269
    //logger.debug("requestURL=" + requestURL);
270
    InputStream in = null;
271
    URL url = new URL(requestURL);
272
    HttpURLConnection con = null;
273
    int responseCode = 0;
274
    
275
    do {
276
      con = (HttpURLConnection) url.openConnection();
277
      con.setRequestProperty("User-Agent", "OAIHarvester/2.0");
278
      con.setRequestProperty("Accept-Encoding", "compress, gzip, identify");
279
      
280
      try {
281
        responseCode = con.getResponseCode();
282
        //logger.debug("responseCode=" + responseCode);
283
      } 
284
      catch (FileNotFoundException e) {
285
        // assume it's a 503 response
286
        logger.info(requestURL, e);
287
        responseCode = HttpURLConnection.HTTP_UNAVAILABLE;
288
      }
289

    
290
      if (responseCode == HttpURLConnection.HTTP_UNAVAILABLE) {
291
        long retrySeconds = con.getHeaderFieldInt("Retry-After", -1);
292
        
293
        if (retrySeconds == -1) {
294
          long now = (new Date()).getTime();
295
          long retryDate = con.getHeaderFieldDate("Retry-After", now);
296
          retrySeconds = retryDate - now;
297
        }
298
        
299
        if (retrySeconds == 0) { // Apparently, it's a bad URL
300
          throw new FileNotFoundException("Bad URL?");
301
        }
302
        
303
        System.err.println("Server response: Retry-After=" + retrySeconds);
304
        
305
        if (retrySeconds > 0) {
306
          try {
307
            Thread.sleep(retrySeconds * 1000);
308
          } 
309
          catch (InterruptedException ex) {
310
            ex.printStackTrace();
311
          }
312
        }
313
        
314
      }
315
    } while (responseCode == HttpURLConnection.HTTP_UNAVAILABLE);
316
    
317
    String contentEncoding = con.getHeaderField("Content-Encoding");
318
    //logger.debug("contentEncoding=" + contentEncoding);
319
    if ("compress".equals(contentEncoding)) {
320
      ZipInputStream zis = new ZipInputStream(con.getInputStream());
321
      zis.getNextEntry();
322
      in = zis;
323
    } 
324
    else if ("gzip".equals(contentEncoding)) {
325
      in = new GZIPInputStream(con.getInputStream());
326
    } 
327
    else if ("deflate".equals(contentEncoding)) {
328
      in = new InflaterInputStream(con.getInputStream());
329
    } 
330
    else {
331
      in = con.getInputStream();
332
    }
333

    
334
    InputSource data = new InputSource(in);
335

    
336
    Thread t = Thread.currentThread();
337
    DocumentBuilder builder = builderMap.get(t);
338
    
339
    if (builder == null) {
340
      builder = documentBuilderFactory.newDocumentBuilder();
341
      builderMap.put(t, builder);
342
    }
343
    
344
    document = builder.parse(data);
345

    
346
    String singleString = getSingleString("/*/@xsi:schemaLocation");
347
    StringTokenizer tokenizer = new StringTokenizer(singleString, " ");
348
    StringBuffer sb = new StringBuffer();
349
    
350
    while (tokenizer.hasMoreTokens()) {
351
      if (sb.length() > 0) sb.append(" ");
352
      sb.append(tokenizer.nextToken());
353
    }
354
    
355
    String schemaLocationStr = sb.toString();
356
    this.schemaLocation = schemaLocationStr;
357
  }
358

    
359

    
360
  /**
361
   * Transform the document content to a string and return it.
362
   * 
363
   * @return returnString - the string that results from transforming the
364
   *                        document
365
   */
366
  public String toString() {
367
    Document document = getDocument();
368
    Source source = new DOMSource(document);
369
    StringWriter stringWriter = new StringWriter();
370
    Result result = new StreamResult(stringWriter);
371
    
372
    try {
373
      Transformer idTransformer = transformerFactory.newTransformer();
374
      idTransformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
375
      idTransformer.transform(source, result);
376
      String returnString = stringWriter.toString();
377
      return returnString;
378
    } 
379
    catch (TransformerException e) {
380
      return e.getMessage();
381
    }
382
  }
383
  
384
}
(2-2/8)