Revision 9018
Added by ben leinfelder about 10 years ago
metacat-index/src/main/java/edu/ucsb/nceas/metacat/index/resourcemap/ResourceMapSubprocessor.java | ||
---|---|---|
16 | 16 |
package edu.ucsb.nceas.metacat.index.resourcemap; |
17 | 17 |
|
18 | 18 |
import java.io.IOException; |
19 |
import java.io.InputStream; |
|
19 | 20 |
import java.net.MalformedURLException; |
20 | 21 |
import java.util.ArrayList; |
21 | 22 |
import java.util.Date; |
... | ... | |
36 | 37 |
import org.apache.solr.common.params.SolrParams; |
37 | 38 |
import org.apache.solr.schema.IndexSchema; |
38 | 39 |
import org.apache.solr.servlet.SolrRequestParsers; |
40 |
import org.dataone.cn.indexer.XPathDocumentParser; |
|
39 | 41 |
import org.dataone.cn.indexer.convert.SolrDateConverter; |
40 | 42 |
import org.dataone.cn.indexer.parser.AbstractDocumentSubprocessor; |
41 | 43 |
import org.dataone.cn.indexer.parser.IDocumentSubprocessor; |
... | ... | |
77 | 79 |
|
78 | 80 |
@Override |
79 | 81 |
public Map<String, SolrDoc> processDocument(String identifier, Map<String, SolrDoc> docs, |
80 |
Document doc) throws IOException, EncoderException, SAXException,
|
|
82 |
InputStream is) throws IOException, EncoderException, SAXException,
|
|
81 | 83 |
XPathExpressionException, ParserConfigurationException, SolrServerException, NotImplemented, NotFound, UnsupportedType, OREParserException, ResourceMapException { |
82 | 84 |
SolrDoc resourceMapDoc = docs.get(identifier); |
83 |
List<SolrDoc> processedDocs = processResourceMap(resourceMapDoc, doc); |
|
85 |
Document doc = XPathDocumentParser.generateXmlDocument(is); |
|
86 |
List<SolrDoc> processedDocs = processResourceMap(resourceMapDoc, doc ); |
|
84 | 87 |
Map<String, SolrDoc> processedDocsMap = new HashMap<String, SolrDoc>(); |
85 | 88 |
for (SolrDoc processedDoc : processedDocs) { |
86 | 89 |
processedDocsMap.put(processedDoc.getIdentifier(), processedDoc); |
metacat-index/src/main/java/edu/ucsb/nceas/metacat/index/SolrIndex.java | ||
---|---|---|
58 | 58 |
import org.apache.solr.schema.IndexSchema; |
59 | 59 |
import org.dataone.cn.indexer.XMLNamespaceConfig; |
60 | 60 |
import org.dataone.cn.indexer.convert.SolrDateConverter; |
61 |
import org.dataone.cn.indexer.parser.AbstractDocumentSubprocessor; |
|
61 | 62 |
import org.dataone.cn.indexer.parser.IDocumentSubprocessor; |
62 | 63 |
import org.dataone.cn.indexer.parser.SolrField; |
63 |
import org.dataone.cn.indexer.resourcemap.ResourceEntry; |
|
64 |
import org.dataone.cn.indexer.resourcemap.ResourceMap; |
|
65 |
import org.dataone.cn.indexer.resourcemap.ResourceMapFactory; |
|
66 | 64 |
import org.dataone.cn.indexer.solrhttp.SolrDoc; |
67 | 65 |
import org.dataone.cn.indexer.solrhttp.SolrElementField; |
68 | 66 |
import org.dataone.service.exceptions.NotFound; |
... | ... | |
156 | 154 |
*/ |
157 | 155 |
public void setSubprocessors(List<IDocumentSubprocessor> subprocessorList) { |
158 | 156 |
for (IDocumentSubprocessor subprocessor : subprocessorList) { |
159 |
subprocessor.initExpression(xpath); |
|
157 |
if (subprocessor instanceof AbstractDocumentSubprocessor) { |
|
158 |
((AbstractDocumentSubprocessor)subprocessor).initExpression(xpath); |
|
159 |
} |
|
160 | 160 |
} |
161 | 161 |
this.subprocessors = subprocessorList; |
162 | 162 |
} |
... | ... | |
197 | 197 |
SolrDoc indexDocument = new SolrDoc(sysSolrFields); |
198 | 198 |
Map<String, SolrDoc> docs = new HashMap<String, SolrDoc>(); |
199 | 199 |
docs.put(id, indexDocument); |
200 |
|
|
201 |
// get the format id for this object |
|
202 |
String formatId = indexDocument.getFirstFieldValue(SolrElementField.FIELD_OBJECTFORMAT); |
|
200 | 203 |
|
201 | 204 |
// Determine if subprocessors are available for this ID |
202 | 205 |
if (subprocessors != null) { |
203 |
// for each subprocessor loaded from the spring config |
|
204 |
for (IDocumentSubprocessor subprocessor : subprocessors) { |
|
205 |
// Does this subprocessor apply? |
|
206 |
if (subprocessor.canProcess(sysMetaDoc)) { |
|
207 |
// if so, then extract the additional information from the |
|
208 |
// document. |
|
209 |
try { |
|
210 |
// docObject = the resource map document or science |
|
211 |
// metadata document. |
|
212 |
// note that resource map processing touches all objects |
|
213 |
// referenced by the resource map. |
|
214 |
InputStream dataStream = new FileInputStream(objectPath); |
|
215 |
Document docObject = generateXmlDocument(dataStream); |
|
216 |
if (docObject == null) { |
|
217 |
throw new Exception("Could not load OBJECT for ID " + id ); |
|
218 |
} else { |
|
219 |
docs = subprocessor.processDocument(id, docs, docObject); |
|
220 |
} |
|
221 |
} catch (Exception e) { |
|
222 |
log.error(e.getMessage(), e); |
|
223 |
throw new SolrServerException(e.getMessage()); |
|
224 |
} |
|
225 |
} |
|
226 |
} |
|
206 |
// for each subprocessor loaded from the spring config |
|
207 |
for (IDocumentSubprocessor subprocessor : subprocessors) { |
|
208 |
// Does this subprocessor apply? |
|
209 |
if (subprocessor.canProcess(formatId)) { |
|
210 |
// if so, then extract the additional information from the |
|
211 |
// document. |
|
212 |
try { |
|
213 |
// docObject = the resource map document or science |
|
214 |
// metadata document. |
|
215 |
// note that resource map processing touches all objects |
|
216 |
// referenced by the resource map. |
|
217 |
FileInputStream dataStream = new FileInputStream(objectPath); |
|
218 |
if (!dataStream.getFD().valid()) { |
|
219 |
log.error("Could not load OBJECT file for ID,Path=" + id + ", " |
|
220 |
+ objectPath); |
|
221 |
//throw new Exception("Could not load OBJECT for ID " + id ); |
|
222 |
} else { |
|
223 |
docs = subprocessor.processDocument(id, docs, dataStream); |
|
224 |
} |
|
225 |
} catch (Exception e) { |
|
226 |
log.error(e.getMessage(), e); |
|
227 |
throw new SolrServerException(e.getMessage()); |
|
228 |
} |
|
229 |
} |
|
230 |
} |
|
227 | 231 |
} |
228 | 232 |
|
229 | 233 |
// TODO: in the XPathDocumentParser class in d1_cn_index_process module, |
metacat-index/src/main/java/edu/ucsb/nceas/metacat/index/annotation/RdfXmlSubprocessor.java | ||
---|---|---|
15 | 15 |
*/ |
16 | 16 |
package edu.ucsb.nceas.metacat.index.annotation; |
17 | 17 |
|
18 |
import java.io.ByteArrayInputStream; |
|
19 |
import java.io.ByteArrayOutputStream; |
|
20 | 18 |
import java.io.IOException; |
21 | 19 |
import java.io.InputStream; |
22 | 20 |
import java.net.MalformedURLException; |
... | ... | |
32 | 30 |
import java.util.Set; |
33 | 31 |
|
34 | 32 |
import javax.xml.parsers.ParserConfigurationException; |
35 |
import javax.xml.transform.Result; |
|
36 |
import javax.xml.transform.Source; |
|
37 |
import javax.xml.transform.TransformerConfigurationException; |
|
38 |
import javax.xml.transform.TransformerException; |
|
39 |
import javax.xml.transform.TransformerFactory; |
|
40 |
import javax.xml.transform.TransformerFactoryConfigurationError; |
|
41 |
import javax.xml.transform.dom.DOMSource; |
|
42 |
import javax.xml.transform.stream.StreamResult; |
|
43 | 33 |
|
44 | 34 |
import org.apache.commons.logging.Log; |
45 | 35 |
import org.apache.commons.logging.LogFactory; |
... | ... | |
65 | 55 |
import org.dataone.service.types.v1.util.AccessUtil; |
66 | 56 |
import org.dataone.service.types.v1.util.AuthUtils; |
67 | 57 |
import org.dataone.service.util.DateTimeMarshaller; |
68 |
import org.w3c.dom.Document; |
|
69 | 58 |
import org.xml.sax.SAXException; |
70 | 59 |
|
71 | 60 |
import com.hp.hpl.jena.ontology.OntModel; |
... | ... | |
103 | 92 |
} |
104 | 93 |
|
105 | 94 |
@Override |
106 |
public Map<String, SolrDoc> processDocument(String identifier, Map<String, SolrDoc> docs, Document doc) throws Exception {
|
|
95 |
public Map<String, SolrDoc> processDocument(String identifier, Map<String, SolrDoc> docs, InputStream is) throws Exception {
|
|
107 | 96 |
SolrDoc resourceMapDoc = docs.get(identifier); |
108 |
List<SolrDoc> processedDocs = process(resourceMapDoc, doc);
|
|
97 |
List<SolrDoc> processedDocs = process(resourceMapDoc, is);
|
|
109 | 98 |
Map<String, SolrDoc> processedDocsMap = new HashMap<String, SolrDoc>(); |
110 | 99 |
for (SolrDoc processedDoc : processedDocs) { |
111 | 100 |
processedDocsMap.put(processedDoc.getIdentifier(), processedDoc); |
... | ... | |
114 | 103 |
Map<String, SolrDoc> mergedDocuments = mergeDocs(docs, processedDocsMap); |
115 | 104 |
return mergedDocuments; |
116 | 105 |
} |
117 |
|
|
118 |
private InputStream toInputStream(Document doc) throws TransformerConfigurationException, TransformerException, TransformerFactoryConfigurationError { |
|
119 |
ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); |
|
120 |
Source xmlSource = new DOMSource(doc); |
|
121 |
Result outputTarget = new StreamResult(outputStream); |
|
122 |
TransformerFactory.newInstance().newTransformer().transform(xmlSource, outputTarget); |
|
123 |
InputStream is = new ByteArrayInputStream(outputStream.toByteArray()); |
|
124 |
return is; |
|
125 |
} |
|
126 | 106 |
|
127 |
private List<SolrDoc> process(SolrDoc indexDocument, Document rdfXmlDocument) throws Exception {
|
|
107 |
private List<SolrDoc> process(SolrDoc indexDocument, InputStream is) throws Exception {
|
|
128 | 108 |
|
129 | 109 |
// get the triplestore dataset |
130 | 110 |
Dataset dataset = TripleStoreService.getInstance().getDataset(); |
131 | 111 |
|
132 | 112 |
// read the annotation |
133 |
InputStream source = toInputStream(rdfXmlDocument); |
|
134 | 113 |
String indexDocId = indexDocument.getIdentifier(); |
135 | 114 |
String name = indexDocId; |
136 | 115 |
|
... | ... | |
144 | 123 |
boolean loaded = dataset.containsNamedModel(name); |
145 | 124 |
if (!loaded) { |
146 | 125 |
OntModel ontModel = ModelFactory.createOntologyModel(); |
147 |
ontModel.read(source, name);
|
|
126 |
ontModel.read(is, name);
|
|
148 | 127 |
dataset.addNamedModel(name, ontModel); |
149 | 128 |
} |
150 | 129 |
//dataset.getDefaultModel().add(ontModel); |
metacat-index/src/main/resources/application-context-eml200.xml | ||
---|---|---|
4 | 4 |
|
5 | 5 |
<bean id="eml200Subprocessor" |
6 | 6 |
class="org.dataone.cn.indexer.parser.ScienceMetadataDocumentSubprocessor"> |
7 |
<property name="matchDocument" |
|
8 |
value="/d200:systemMetadata/formatId[text() = 'eml://ecoinformatics.org/eml-2.0.0']"></property> |
|
7 |
<property name="matchDocuments"> |
|
8 |
<list> |
|
9 |
<value>eml://ecoinformatics.org/eml-2.0.0</value> |
|
10 |
</list> |
|
11 |
</property> |
|
9 | 12 |
<property name="fieldList"> |
10 | 13 |
<list> |
11 | 14 |
<ref bean="eml.abstract" /> |
metacat-index/src/main/resources/application-context-oa.xml | ||
---|---|---|
4 | 4 |
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd"> |
5 | 5 |
|
6 | 6 |
<bean id="rdfXmlSubprocessor" class="edu.ucsb.nceas.metacat.index.annotation.RdfXmlSubprocessor"> |
7 |
<property name="matchDocument" |
|
8 |
value="/d200:systemMetadata/formatId[text()='http://www.w3.org/TR/rdf-syntax-grammar'] | /d200:systemMetadata/formatId[text()='http://www.openarchives.org/ore/terms']" /> |
|
7 |
<property name="matchDocuments"> |
|
8 |
<list> |
|
9 |
<value>http://www.w3.org/TR/rdf-syntax-grammar</value> |
|
10 |
<value>http://www.openarchives.org/ore/terms</value> |
|
11 |
</list> |
|
12 |
</property> |
|
9 | 13 |
<property name="fieldList"> |
10 | 14 |
<list> |
11 | 15 |
<ref bean="annotation.standard" /> |
metacat-index/src/main/resources/application-context-eml201.xml | ||
---|---|---|
4 | 4 |
|
5 | 5 |
<bean id="eml201Subprocessor" |
6 | 6 |
class="org.dataone.cn.indexer.parser.ScienceMetadataDocumentSubprocessor"> |
7 |
<property name="matchDocument" |
|
8 |
value="/d200:systemMetadata/formatId[text() = 'eml://ecoinformatics.org/eml-2.0.1']"></property> |
|
7 |
<property name="matchDocuments"> |
|
8 |
<list> |
|
9 |
<value>eml://ecoinformatics.org/eml-2.0.1</value> |
|
10 |
</list> |
|
11 |
</property> |
|
9 | 12 |
<property name="fieldList"> |
10 | 13 |
<list> |
11 | 14 |
<ref bean="eml.abstract" /> |
metacat-index/src/main/resources/application-context-eml210.xml | ||
---|---|---|
8 | 8 |
|
9 | 9 |
<bean id="eml210Subprocessor" |
10 | 10 |
class="org.dataone.cn.indexer.parser.ScienceMetadataDocumentSubprocessor"> |
11 |
<property name="matchDocument" |
|
12 |
value="/d200:systemMetadata/formatId[text() = 'eml://ecoinformatics.org/eml-2.1.0']"></property> |
|
11 |
<property name="matchDocuments"> |
|
12 |
<list> |
|
13 |
<value>eml://ecoinformatics.org/eml-2.1.0</value> |
|
14 |
</list> |
|
15 |
</property> |
|
13 | 16 |
<property name="fieldList"> |
14 | 17 |
<list> |
15 | 18 |
<ref bean="eml.abstract" /> |
metacat-index/src/main/resources/application-context-eml211.xml | ||
---|---|---|
4 | 4 |
|
5 | 5 |
<bean id="eml211Subprocessor" |
6 | 6 |
class="org.dataone.cn.indexer.parser.ScienceMetadataDocumentSubprocessor"> |
7 |
<property name="matchDocument" |
|
8 |
value="/d200:systemMetadata/formatId[text() = 'eml://ecoinformatics.org/eml-2.1.1']"></property> |
|
7 |
<property name="matchDocuments"> |
|
8 |
<list> |
|
9 |
<value>eml://ecoinformatics.org/eml-2.1.1</value> |
|
10 |
</list> |
|
11 |
</property> |
|
9 | 12 |
<property name="fieldList"> |
10 | 13 |
<list> |
11 | 14 |
<ref bean="eml.abstract" /> |
metacat-index/src/main/resources/application-context-resource-map.xml | ||
---|---|---|
4 | 4 |
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd"> |
5 | 5 |
|
6 | 6 |
<bean id="resourceMapSubprocessor" class="edu.ucsb.nceas.metacat.index.resourcemap.ResourceMapSubprocessor"> |
7 |
<property name="matchDocument" |
|
8 |
value="/d200:systemMetadata/formatId[text()='http://www.openarchives.org/ore/terms']" /> |
|
7 |
<property name="matchDocuments"> |
|
8 |
<list> |
|
9 |
<value>http://www.openarchives.org/ore/terms</value> |
|
10 |
</list> |
|
11 |
</property> |
|
9 | 12 |
<!-- <property name="httpService" ref="httpService"></property> |
10 | 13 |
<property name="solrQueryUri" value="${solr.query.uri}"></property> --> |
11 | 14 |
</bean> |
metacat-index/src/main/resources/application-context-annotator.xml | ||
---|---|---|
5 | 5 |
|
6 | 6 |
<bean id="annotatorSubprocessor" class="org.dataone.cn.indexer.annotation.AnnotatorSubprocessor"> |
7 | 7 |
|
8 |
<!-- match any document type --> |
|
9 |
<property name="matchDocument" |
|
10 |
value="/d200:systemMetadata/formatId[text() != '']" /> |
|
8 |
<!-- match annotation documents --> |
|
9 |
<property name="matchDocuments"> |
|
10 |
<list> |
|
11 |
<value>http://docs.annotatorjs.org/en/v1.2.x/annotation-format.html</value> |
|
12 |
</list> |
|
13 |
</property> |
|
11 | 14 |
<property name="fieldList"> |
12 | 15 |
<list> |
13 | 16 |
<ref bean="annotation.expansion" /> |
Also available in: Unified diff
Update classes and context files that use cn-index-processor classes, allowing document subprocessors to be less tied to XML.