Project

General

Profile

« Previous | Next » 

Revision 9018

Update classes and context files that use cn-index-processor classes, allowing document subprocessors to be less tied to XML.

View differences:

metacat-index/src/main/java/edu/ucsb/nceas/metacat/index/resourcemap/ResourceMapSubprocessor.java
16 16
package edu.ucsb.nceas.metacat.index.resourcemap;
17 17

  
18 18
import java.io.IOException;
19
import java.io.InputStream;
19 20
import java.net.MalformedURLException;
20 21
import java.util.ArrayList;
21 22
import java.util.Date;
......
36 37
import org.apache.solr.common.params.SolrParams;
37 38
import org.apache.solr.schema.IndexSchema;
38 39
import org.apache.solr.servlet.SolrRequestParsers;
40
import org.dataone.cn.indexer.XPathDocumentParser;
39 41
import org.dataone.cn.indexer.convert.SolrDateConverter;
40 42
import org.dataone.cn.indexer.parser.AbstractDocumentSubprocessor;
41 43
import org.dataone.cn.indexer.parser.IDocumentSubprocessor;
......
77 79
          
78 80
    @Override
79 81
    public Map<String, SolrDoc> processDocument(String identifier, Map<String, SolrDoc> docs,
80
    Document doc) throws IOException, EncoderException, SAXException,
82
    InputStream is) throws IOException, EncoderException, SAXException,
81 83
    XPathExpressionException, ParserConfigurationException, SolrServerException, NotImplemented, NotFound, UnsupportedType, OREParserException, ResourceMapException {
82 84
        SolrDoc resourceMapDoc = docs.get(identifier);
83
        List<SolrDoc> processedDocs = processResourceMap(resourceMapDoc, doc);
85
        Document doc = XPathDocumentParser.generateXmlDocument(is);
86
		List<SolrDoc> processedDocs = processResourceMap(resourceMapDoc, doc );
84 87
        Map<String, SolrDoc> processedDocsMap = new HashMap<String, SolrDoc>();
85 88
        for (SolrDoc processedDoc : processedDocs) {
86 89
            processedDocsMap.put(processedDoc.getIdentifier(), processedDoc);
metacat-index/src/main/java/edu/ucsb/nceas/metacat/index/SolrIndex.java
58 58
import org.apache.solr.schema.IndexSchema;
59 59
import org.dataone.cn.indexer.XMLNamespaceConfig;
60 60
import org.dataone.cn.indexer.convert.SolrDateConverter;
61
import org.dataone.cn.indexer.parser.AbstractDocumentSubprocessor;
61 62
import org.dataone.cn.indexer.parser.IDocumentSubprocessor;
62 63
import org.dataone.cn.indexer.parser.SolrField;
63
import org.dataone.cn.indexer.resourcemap.ResourceEntry;
64
import org.dataone.cn.indexer.resourcemap.ResourceMap;
65
import org.dataone.cn.indexer.resourcemap.ResourceMapFactory;
66 64
import org.dataone.cn.indexer.solrhttp.SolrDoc;
67 65
import org.dataone.cn.indexer.solrhttp.SolrElementField;
68 66
import org.dataone.service.exceptions.NotFound;
......
156 154
     */
157 155
    public void setSubprocessors(List<IDocumentSubprocessor> subprocessorList) {
158 156
        for (IDocumentSubprocessor subprocessor : subprocessorList) {
159
            subprocessor.initExpression(xpath);
157
        	if (subprocessor instanceof AbstractDocumentSubprocessor) {
158
        		((AbstractDocumentSubprocessor)subprocessor).initExpression(xpath);
159
        	}
160 160
        }
161 161
        this.subprocessors = subprocessorList;
162 162
    }
......
197 197
        SolrDoc indexDocument = new SolrDoc(sysSolrFields);
198 198
        Map<String, SolrDoc> docs = new HashMap<String, SolrDoc>();
199 199
        docs.put(id, indexDocument);
200
        
201
        // get the format id for this object
202
        String formatId = indexDocument.getFirstFieldValue(SolrElementField.FIELD_OBJECTFORMAT);
200 203

  
201 204
        // Determine if subprocessors are available for this ID
202 205
        if (subprocessors != null) {
203
                    // for each subprocessor loaded from the spring config
204
                    for (IDocumentSubprocessor subprocessor : subprocessors) {
205
                        // Does this subprocessor apply?
206
                        if (subprocessor.canProcess(sysMetaDoc)) {
207
                            // if so, then extract the additional information from the
208
                            // document.
209
                            try {
210
                                // docObject = the resource map document or science
211
                                // metadata document.
212
                                // note that resource map processing touches all objects
213
                                // referenced by the resource map.
214
                            	InputStream dataStream = new FileInputStream(objectPath);
215
                                Document docObject = generateXmlDocument(dataStream);
216
                                if (docObject == null) {
217
                                    throw new Exception("Could not load OBJECT for ID " + id );
218
                                } else {
219
                                    docs = subprocessor.processDocument(id, docs, docObject);
220
                                }
221
                            } catch (Exception e) {
222
                                log.error(e.getMessage(), e);
223
                                throw new SolrServerException(e.getMessage());
224
                            }
225
                        }
226
                    }
206
	        // for each subprocessor loaded from the spring config
207
	        for (IDocumentSubprocessor subprocessor : subprocessors) {
208
	            // Does this subprocessor apply?
209
	            if (subprocessor.canProcess(formatId)) {
210
	                // if so, then extract the additional information from the
211
	                // document.
212
	                try {
213
	                    // docObject = the resource map document or science
214
	                    // metadata document.
215
	                    // note that resource map processing touches all objects
216
	                    // referenced by the resource map.
217
	                	FileInputStream dataStream = new FileInputStream(objectPath);
218
	                    if (!dataStream.getFD().valid()) {
219
	                    	log.error("Could not load OBJECT file for ID,Path=" + id + ", "
220
                                    + objectPath);
221
	                        //throw new Exception("Could not load OBJECT for ID " + id );
222
	                    } else {
223
	                        docs = subprocessor.processDocument(id, docs, dataStream);
224
	                    }
225
	                } catch (Exception e) {
226
	                    log.error(e.getMessage(), e);
227
	                    throw new SolrServerException(e.getMessage());
228
	                }
229
	            }
230
	        }
227 231
       }
228 232

  
229 233
       // TODO: in the XPathDocumentParser class in d1_cn_index_process module,
metacat-index/src/main/java/edu/ucsb/nceas/metacat/index/annotation/RdfXmlSubprocessor.java
15 15
 */
16 16
package edu.ucsb.nceas.metacat.index.annotation;
17 17

  
18
import java.io.ByteArrayInputStream;
19
import java.io.ByteArrayOutputStream;
20 18
import java.io.IOException;
21 19
import java.io.InputStream;
22 20
import java.net.MalformedURLException;
......
32 30
import java.util.Set;
33 31

  
34 32
import javax.xml.parsers.ParserConfigurationException;
35
import javax.xml.transform.Result;
36
import javax.xml.transform.Source;
37
import javax.xml.transform.TransformerConfigurationException;
38
import javax.xml.transform.TransformerException;
39
import javax.xml.transform.TransformerFactory;
40
import javax.xml.transform.TransformerFactoryConfigurationError;
41
import javax.xml.transform.dom.DOMSource;
42
import javax.xml.transform.stream.StreamResult;
43 33

  
44 34
import org.apache.commons.logging.Log;
45 35
import org.apache.commons.logging.LogFactory;
......
65 55
import org.dataone.service.types.v1.util.AccessUtil;
66 56
import org.dataone.service.types.v1.util.AuthUtils;
67 57
import org.dataone.service.util.DateTimeMarshaller;
68
import org.w3c.dom.Document;
69 58
import org.xml.sax.SAXException;
70 59

  
71 60
import com.hp.hpl.jena.ontology.OntModel;
......
103 92
    }
104 93
          
105 94
    @Override
106
    public Map<String, SolrDoc> processDocument(String identifier, Map<String, SolrDoc> docs, Document doc) throws Exception {
95
    public Map<String, SolrDoc> processDocument(String identifier, Map<String, SolrDoc> docs, InputStream is) throws Exception {
107 96
        SolrDoc resourceMapDoc = docs.get(identifier);
108
        List<SolrDoc> processedDocs = process(resourceMapDoc, doc);
97
        List<SolrDoc> processedDocs = process(resourceMapDoc, is);
109 98
        Map<String, SolrDoc> processedDocsMap = new HashMap<String, SolrDoc>();
110 99
        for (SolrDoc processedDoc : processedDocs) {
111 100
            processedDocsMap.put(processedDoc.getIdentifier(), processedDoc);
......
114 103
        Map<String, SolrDoc> mergedDocuments = mergeDocs(docs, processedDocsMap);
115 104
        return mergedDocuments;
116 105
    }
117

  
118
    private InputStream toInputStream(Document doc) throws TransformerConfigurationException, TransformerException, TransformerFactoryConfigurationError {
119
    	ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
120
    	Source xmlSource = new DOMSource(doc);
121
    	Result outputTarget = new StreamResult(outputStream);
122
    	TransformerFactory.newInstance().newTransformer().transform(xmlSource, outputTarget);
123
    	InputStream is = new ByteArrayInputStream(outputStream.toByteArray());
124
    	return is;
125
    }
126 106
    
127
    private List<SolrDoc> process(SolrDoc indexDocument, Document rdfXmlDocument) throws Exception {
107
    private List<SolrDoc> process(SolrDoc indexDocument, InputStream is) throws Exception {
128 108
    	
129 109
    	// get the triplestore dataset
130 110
		Dataset dataset = TripleStoreService.getInstance().getDataset();
131 111
		
132 112
    	// read the annotation
133
		InputStream source = toInputStream(rdfXmlDocument);
134 113
    	String indexDocId = indexDocument.getIdentifier();
135 114
    	String name = indexDocId;
136 115
    			
......
144 123
    	boolean loaded = dataset.containsNamedModel(name);
145 124
		if (!loaded) {
146 125
			OntModel ontModel = ModelFactory.createOntologyModel();
147
			ontModel.read(source, name);
126
			ontModel.read(is, name);
148 127
			dataset.addNamedModel(name, ontModel);
149 128
		}
150 129
		//dataset.getDefaultModel().add(ontModel);
metacat-index/src/main/resources/application-context-eml200.xml
4 4

  
5 5
 <bean id="eml200Subprocessor"
6 6
  class="org.dataone.cn.indexer.parser.ScienceMetadataDocumentSubprocessor">
7
  <property name="matchDocument"
8
   value="/d200:systemMetadata/formatId[text() = 'eml://ecoinformatics.org/eml-2.0.0']"></property>
7
  	<property name="matchDocuments">
8
		<list>
9
			<value>eml://ecoinformatics.org/eml-2.0.0</value>
10
		</list>
11
	</property>
9 12
  <property name="fieldList">
10 13
   <list>
11 14
    <ref bean="eml.abstract" />
metacat-index/src/main/resources/application-context-oa.xml
4 4
	xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd">
5 5

  
6 6
	<bean id="rdfXmlSubprocessor" class="edu.ucsb.nceas.metacat.index.annotation.RdfXmlSubprocessor">
7
		<property name="matchDocument"
8
			value="/d200:systemMetadata/formatId[text()='http://www.w3.org/TR/rdf-syntax-grammar'] | /d200:systemMetadata/formatId[text()='http://www.openarchives.org/ore/terms']" />
7
		<property name="matchDocuments">
8
			<list>
9
				<value>http://www.w3.org/TR/rdf-syntax-grammar</value>
10
				<value>http://www.openarchives.org/ore/terms</value>
11
			</list>
12
		</property>	
9 13
		<property name="fieldList">
10 14
			<list>
11 15
				<ref bean="annotation.standard" />
metacat-index/src/main/resources/application-context-eml201.xml
4 4

  
5 5
 <bean id="eml201Subprocessor"
6 6
  class="org.dataone.cn.indexer.parser.ScienceMetadataDocumentSubprocessor">
7
  <property name="matchDocument"
8
   value="/d200:systemMetadata/formatId[text() = 'eml://ecoinformatics.org/eml-2.0.1']"></property>
7
  	<property name="matchDocuments">
8
		<list>
9
			<value>eml://ecoinformatics.org/eml-2.0.1</value>
10
		</list>
11
	</property>
9 12
  <property name="fieldList">
10 13
   <list>
11 14
    <ref bean="eml.abstract" />
metacat-index/src/main/resources/application-context-eml210.xml
8 8
    					
9 9
 <bean id="eml210Subprocessor"
10 10
  class="org.dataone.cn.indexer.parser.ScienceMetadataDocumentSubprocessor">
11
  <property name="matchDocument"
12
   value="/d200:systemMetadata/formatId[text() = 'eml://ecoinformatics.org/eml-2.1.0']"></property>
11
  	<property name="matchDocuments">
12
		<list>
13
			<value>eml://ecoinformatics.org/eml-2.1.0</value>
14
		</list>
15
	</property>
13 16
  <property name="fieldList">
14 17
   <list>
15 18
    <ref bean="eml.abstract" />
metacat-index/src/main/resources/application-context-eml211.xml
4 4

  
5 5
 <bean id="eml211Subprocessor"
6 6
  class="org.dataone.cn.indexer.parser.ScienceMetadataDocumentSubprocessor">
7
  <property name="matchDocument"
8
   value="/d200:systemMetadata/formatId[text() = 'eml://ecoinformatics.org/eml-2.1.1']"></property>
7
  	<property name="matchDocuments">
8
		<list>
9
			<value>eml://ecoinformatics.org/eml-2.1.1</value>
10
		</list>
11
	</property>
9 12
  <property name="fieldList">
10 13
   <list>
11 14
    <ref bean="eml.abstract" />
metacat-index/src/main/resources/application-context-resource-map.xml
4 4
	xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd">
5 5

  
6 6
	<bean id="resourceMapSubprocessor" class="edu.ucsb.nceas.metacat.index.resourcemap.ResourceMapSubprocessor">
7
		<property name="matchDocument"
8
			value="/d200:systemMetadata/formatId[text()='http://www.openarchives.org/ore/terms']" />
7
		<property name="matchDocuments">
8
			<list>
9
				<value>http://www.openarchives.org/ore/terms</value>
10
			</list>
11
		</property>
9 12
		<!-- <property name="httpService" ref="httpService"></property>
10 13
		<property name="solrQueryUri" value="${solr.query.uri}"></property> -->
11 14
	</bean>
metacat-index/src/main/resources/application-context-annotator.xml
5 5

  
6 6
	<bean id="annotatorSubprocessor" class="org.dataone.cn.indexer.annotation.AnnotatorSubprocessor">
7 7
		
8
		<!-- match any document type -->
9
		<property name="matchDocument"
10
			value="/d200:systemMetadata/formatId[text() != '']" />
8
		<!-- match annotation documents -->
9
		<property name="matchDocuments">
10
			<list>
11
				<value>http://docs.annotatorjs.org/en/v1.2.x/annotation-format.html</value>
12
			</list>
13
		</property>
11 14
		<property name="fieldList">
12 15
			<list>
13 16
				<ref bean="annotation.expansion" />

Also available in: Unified diff