Revision 6873
Added by ben leinfelder almost 13 years ago
src/edu/ucsb/nceas/metacat/dataone/SystemMetadataFactory.java | ||
---|---|---|
29 | 29 |
import java.io.InputStream; |
30 | 30 |
import java.math.BigInteger; |
31 | 31 |
import java.net.URL; |
32 |
import java.net.URLConnection; |
|
32 | 33 |
import java.security.NoSuchAlgorithmException; |
33 | 34 |
import java.sql.SQLException; |
34 | 35 |
import java.util.ArrayList; |
... | ... | |
357 | 358 |
try { |
358 | 359 |
// download the data from the URL |
359 | 360 |
URL dataURL = new URL(dataDocUrl); |
360 |
dataObject = dataURL.openStream(); |
|
361 |
// TODO: check for valid content |
|
361 |
URLConnection dataConnection = dataURL.openConnection(); |
|
362 |
|
|
363 |
// default is to download the data |
|
364 |
dataObject = dataConnection.getInputStream(); |
|
365 |
|
|
366 |
String detectedContentType = dataConnection.getContentType(); |
|
367 |
logMetacat.info("Detected content type: " + detectedContentType); |
|
368 |
|
|
369 |
if (detectedContentType != null) { |
|
370 |
// seems to be HTML from the remote location |
|
371 |
if (detectedContentType.contains("html")) { |
|
372 |
// if we are not expecting it, we skip it |
|
373 |
if (!dataDocMimeType.contains("html")) { |
|
374 |
// set to null so we don't download it |
|
375 |
dataObject = null; |
|
376 |
logMetacat.warn("Skipping remote resource, unexpected HTML content type at: " + dataDocUrl); |
|
377 |
} |
|
378 |
} |
|
379 |
|
|
380 |
// TODO: any other special processing (csv, images, etc)? |
|
381 |
} else { |
|
382 |
// if we don't know what it is, should we skip it? |
|
383 |
dataObject = null; |
|
384 |
logMetacat.warn("Skipping remote resource, unknown content type at: " + dataDocUrl); |
|
385 |
} |
|
386 |
|
|
362 | 387 |
} catch (Exception e) { |
363 | 388 |
// error with the download |
364 | 389 |
logMetacat.warn("Error downloading remote data. " + e.getMessage()); |
Also available in: Unified diff
Do not download and save remote data resources that are served as HTML when HTML is not the expected content type (e.g., login or info/splash pages returned in place of the data content).
http://bugzilla.ecoinformatics.org/show_bug.cgi?id=5522