Revision 9420
Added by ben leinfelder about 9 years ago
metacat-index/src/main/java/edu/ucsb/nceas/metacat/index/annotation/MetacatAnnotatorSubprocessor.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index.annotation; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.io.InputStream; |
|
5 |
import java.util.Iterator; |
|
6 |
import java.util.List; |
|
7 |
import java.util.Map; |
|
8 |
|
|
9 |
import javax.xml.xpath.XPathExpressionException; |
|
10 |
|
|
11 |
import org.apache.commons.codec.EncoderException; |
|
12 |
import org.apache.commons.logging.Log; |
|
13 |
import org.apache.commons.logging.LogFactory; |
|
14 |
import org.dataone.cn.indexer.annotation.AnnotatorSubprocessor; |
|
15 |
import org.dataone.cn.indexer.solrhttp.SolrDoc; |
|
16 |
import org.dataone.cn.indexer.solrhttp.SolrElementField; |
|
17 |
|
|
18 |
import edu.ucsb.nceas.metacat.index.resourcemap.ResourceMapSubprocessor; |
|
19 |
|
|
20 |
public class MetacatAnnotatorSubprocessor extends AnnotatorSubprocessor { |
|
21 |
|
|
22 |
private static Log log = LogFactory.getLog(AnnotatorSubprocessor.class); |
|
23 |
|
|
24 |
|
|
25 |
@Override |
|
26 |
public Map<String, SolrDoc> processDocument(String annotationId, Map<String, SolrDoc> docs, |
|
27 |
InputStream is) throws Exception { |
|
28 |
|
|
29 |
// check for annotations, and add them if found |
|
30 |
SolrDoc annotations = parseAnnotation(is); |
|
31 |
if (annotations != null) { |
|
32 |
String referencedPid = annotations.getIdentifier(); |
|
33 |
SolrDoc referencedDoc = docs.get(referencedPid); |
|
34 |
|
|
35 |
// make sure we have a reference for the document we annotating |
|
36 |
if (referencedDoc == null) { |
|
37 |
try { |
|
38 |
referencedDoc = ResourceMapSubprocessor.getSolrDoc(referencedPid); |
|
39 |
} catch (Exception e) { |
|
40 |
log.error("Unable to retrieve solr document: " + referencedPid |
|
41 |
+ ". Exception attempting to communicate with solr server.", e); |
|
42 |
} |
|
43 |
|
|
44 |
if (referencedDoc == null) { |
|
45 |
referencedDoc = new SolrDoc(); |
|
46 |
} |
|
47 |
docs.put(referencedPid, referencedDoc); |
|
48 |
} |
|
49 |
|
|
50 |
// make sure we say we annotate the object |
|
51 |
SolrDoc annotationDoc = docs.get(annotationId); |
|
52 |
if (annotationDoc != null) { |
|
53 |
annotationDoc.addField(new SolrElementField(FIELD_ANNOTATES, referencedPid)); |
|
54 |
} |
|
55 |
|
|
56 |
// add the annotations to the referenced document |
|
57 |
Iterator<SolrElementField> annotationIter = annotations.getFieldList().iterator(); |
|
58 |
while (annotationIter.hasNext()) { |
|
59 |
SolrElementField annotation = annotationIter.next(); |
|
60 |
if (!this.getFieldsToMerge().contains(annotation.getName())) { |
|
61 |
log.debug("SKIPPING field (not in fieldsToMerge): " + annotation.getName()); |
|
62 |
continue; |
|
63 |
} |
|
64 |
referencedDoc.addField(annotation); |
|
65 |
log.debug("ADDING annotation to " + referencedPid + ": " + annotation.getName() |
|
66 |
+ "=" + annotation.getValue()); |
|
67 |
} |
|
68 |
} else { |
|
69 |
log.warn("Annotations were not found when parsing: " + annotationId); |
|
70 |
} |
|
71 |
// return the collection that we have augmented |
|
72 |
return docs; |
|
73 |
} |
|
74 |
|
|
75 |
/** |
|
76 |
* Merge updates with existing solr documents |
|
77 |
* |
|
78 |
* @param indexDocument |
|
79 |
* @return |
|
80 |
* @throws IOException |
|
81 |
* @throws EncoderException |
|
82 |
* @throws XPathExpressionException |
|
83 |
*/ |
|
84 |
public SolrDoc mergeWithIndexedDocument(SolrDoc indexDocument) throws IOException, |
|
85 |
EncoderException, XPathExpressionException { |
|
86 |
|
|
87 |
return mergeWithIndexedDocument(indexDocument, getFieldsToMerge()); |
|
88 |
} |
|
89 |
|
|
90 |
/** |
|
91 |
* Inspired by SubprocessorUtility method, but works with embedded solr server |
|
92 |
* @param indexDocument |
|
93 |
* @param fieldsToMerge |
|
94 |
* @return |
|
95 |
* @throws IOException |
|
96 |
* @throws EncoderException |
|
97 |
* @throws XPathExpressionException |
|
98 |
*/ |
|
99 |
private SolrDoc mergeWithIndexedDocument(SolrDoc indexDocument, List<String> fieldsToMerge) |
|
100 |
throws IOException, EncoderException, XPathExpressionException { |
|
101 |
|
|
102 |
log.debug("about to merge indexed document with new doc to insert for pid: " |
|
103 |
+ indexDocument.getIdentifier()); |
|
104 |
SolrDoc solrDoc = null; |
|
105 |
try { |
|
106 |
solrDoc = ResourceMapSubprocessor.getSolrDoc(indexDocument.getIdentifier()); |
|
107 |
} catch (Exception e) { |
|
108 |
log.error("Could not retrieve existing index document: " + indexDocument.getIdentifier(), e); |
|
109 |
} |
|
110 |
if (solrDoc != null) { |
|
111 |
log.debug("found existing doc to merge for pid: " + indexDocument.getIdentifier()); |
|
112 |
for (SolrElementField field : solrDoc.getFieldList()) { |
|
113 |
if (fieldsToMerge.contains(field.getName()) |
|
114 |
&& !indexDocument.hasFieldWithValue(field.getName(), field.getValue())) { |
|
115 |
indexDocument.addField(field); |
|
116 |
log.debug("merging field: " + field.getName() + " with value: " |
|
117 |
+ field.getValue()); |
|
118 |
} |
|
119 |
} |
|
120 |
} |
|
121 |
return indexDocument; |
|
122 |
} |
|
123 |
|
|
124 |
} |
|
0 | 125 |
metacat-index/src/main/resources/application-context-annotator.xml | ||
---|---|---|
3 | 3 |
xmlns:p="http://www.springframework.org/schema/p" |
4 | 4 |
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd"> |
5 | 5 |
|
6 |
<bean id="annotatorSubprocessor" class="org.dataone.cn.indexer.annotation.AnnotatorSubprocessor">
|
|
6 |
<bean id="annotatorSubprocessor" class="edu.ucsb.nceas.metacat.index.annotation.MetacatAnnotatorSubprocessor">
|
|
7 | 7 |
|
8 | 8 |
<!-- match annotation documents --> |
9 | 9 |
<property name="matchDocuments"> |
... | ... | |
14 | 14 |
<property name="fieldList"> |
15 | 15 |
<list> |
16 | 16 |
<ref bean="annotation.expansion" /> |
17 |
<ref bean="annotation.bioportal.expansion" /> |
|
18 |
<ref bean="annotation.esor.expansion" /> |
|
17 | 19 |
</list> |
18 | 20 |
</property> |
19 | 21 |
<property name="fieldsToMerge"> |
20 | 22 |
<list> |
21 | 23 |
<value>sem_annotation</value> |
24 |
<value>sem_annotation_bioportal_sm</value> |
|
25 |
<value>sem_annotation_esor_sm</value> |
|
22 | 26 |
<value>sem_annotates</value> |
23 | 27 |
<value>sem_annotated_by</value> |
24 | 28 |
</list> |
25 |
</property> |
|
29 |
</property> |
|
30 |
<property name="ontologyList"> |
|
31 |
<list> |
|
32 |
<value>http://purl.org/spar/datacite/</value> |
|
33 |
<value>http://purl.org/dc/terms/</value> |
|
34 |
<value>http://www.w3.org/2006/time</value> |
|
35 |
<value>http://purl.dataone.org/ontologies/observation/d1-ECSO.owl</value> |
|
36 |
<value>http://purl.dataone.org/ontologies/provenance/ProvONE/v1/owl/provone.owl</value> |
|
37 |
<value>http://purl.obolibrary.org/obo/envo.owl</value> |
|
38 |
<value>http://purl.obolibrary.org/obo/chebi.owl</value> |
|
39 |
</list> |
|
40 |
</property> |
|
26 | 41 |
</bean> |
27 | 42 |
|
28 | 43 |
<bean id="annotation.expansion" class="org.dataone.cn.indexer.annotation.SparqlField"> |
... | ... | |
35 | 50 |
PREFIX owl: <http://www.w3.org/2002/07/owl#> |
36 | 51 |
|
37 | 52 |
SELECT ?sem_annotation |
38 |
FROM <$GRAPH_NAME> |
|
39 | 53 |
WHERE { |
40 | 54 |
<$CONCEPT_URI> rdfs:subClassOf+ ?sem_annotation . |
41 | 55 |
} |
... | ... | |
44 | 58 |
</constructor-arg> |
45 | 59 |
<!--property name="multivalue" value="false" /--> |
46 | 60 |
</bean> |
61 |
|
|
62 |
<!-- bioportal-based annotation --> |
|
63 |
<!-- SparqlField named sem_annotation_bioportal_sm. Its query selects every class reachable from $CONCEPT_URI through one or more rdfs:subClassOf steps. NOTE(review): $CONCEPT_URI is presumably a placeholder substituted before the query runs; confirm in SparqlField. -->
<bean id="annotation.bioportal.expansion" class="org.dataone.cn.indexer.annotation.SparqlField"> |
|
64 |
<constructor-arg name="name" value="sem_annotation_bioportal_sm" /> |
|
65 |
<constructor-arg name="query"> |
|
66 |
<value> |
|
67 |
<![CDATA[ |
|
68 |
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> |
|
69 |
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> |
|
70 |
PREFIX owl: <http://www.w3.org/2002/07/owl#> |
|
71 |
|
|
72 |
SELECT ?sem_annotation_bioportal_sm |
|
73 |
WHERE { |
|
74 |
<$CONCEPT_URI> rdfs:subClassOf+ ?sem_annotation_bioportal_sm . |
|
75 |
} |
|
76 |
]]> |
|
77 |
</value> |
|
78 |
</constructor-arg> |
|
79 |
</bean> |
|
80 |
<!-- SparqlField named sem_annotation_bioportal_raw_sm. NOTE(review): the query uses rdf:about as a predicate; rdf:about is an RDF/XML syntax attribute and is normally not present as a property in parsed triples, so this pattern may never match. Confirm the intended property. This bean is also not referenced by the annotatorSubprocessor fieldList visible in this diff; verify whether it is used elsewhere. -->
<bean id="annotation.bioportal.raw" class="org.dataone.cn.indexer.annotation.SparqlField"> |
|
81 |
<constructor-arg name="name" value="sem_annotation_bioportal_raw_sm" /> |
|
82 |
<constructor-arg name="query"> |
|
83 |
<value> |
|
84 |
<![CDATA[ |
|
85 |
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> |
|
86 |
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> |
|
87 |
PREFIX owl: <http://www.w3.org/2002/07/owl#> |
|
88 |
|
|
89 |
SELECT ?sem_annotation_bioportal_raw_sm |
|
90 |
WHERE { |
|
91 |
<$CONCEPT_URI> rdf:about ?sem_annotation_bioportal_raw_sm . |
|
92 |
} |
|
93 |
]]> |
|
94 |
</value> |
|
95 |
</constructor-arg> |
|
96 |
</bean> |
|
97 |
|
|
98 |
<!-- esor-based annotation --> |
|
99 |
<!-- SparqlField named sem_annotation_esor_sm. Its query selects every class reachable from $CONCEPT_URI through one or more rdfs:subClassOf steps. NOTE(review): $CONCEPT_URI is presumably a placeholder substituted before the query runs; confirm in SparqlField. -->
<bean id="annotation.esor.expansion" class="org.dataone.cn.indexer.annotation.SparqlField"> |
|
100 |
<constructor-arg name="name" value="sem_annotation_esor_sm" /> |
|
101 |
<constructor-arg name="query"> |
|
102 |
<value> |
|
103 |
<![CDATA[ |
|
104 |
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> |
|
105 |
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> |
|
106 |
PREFIX owl: <http://www.w3.org/2002/07/owl#> |
|
107 |
|
|
108 |
SELECT ?sem_annotation_esor_sm |
|
109 |
WHERE { |
|
110 |
<$CONCEPT_URI> rdfs:subClassOf+ ?sem_annotation_esor_sm . |
|
111 |
} |
|
112 |
]]> |
|
113 |
</value> |
|
114 |
</constructor-arg> |
|
115 |
</bean> |
|
116 |
<!-- SparqlField named sem_annotation_esor_raw_sm. NOTE(review): the query uses rdf:about as a predicate; rdf:about is an RDF/XML syntax attribute and is normally not present as a property in parsed triples, so this pattern may never match. Confirm the intended property. This bean is also not referenced by the annotatorSubprocessor fieldList visible in this diff; verify whether it is used elsewhere. -->
<bean id="annotation.esor.raw" class="org.dataone.cn.indexer.annotation.SparqlField"> |
|
117 |
<constructor-arg name="name" value="sem_annotation_esor_raw_sm" /> |
|
118 |
<constructor-arg name="query"> |
|
119 |
<value> |
|
120 |
<![CDATA[ |
|
121 |
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> |
|
122 |
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> |
|
123 |
PREFIX owl: <http://www.w3.org/2002/07/owl#> |
|
124 |
|
|
125 |
SELECT ?sem_annotation_esor_raw_sm |
|
126 |
WHERE { |
|
127 |
<$CONCEPT_URI> rdf:about ?sem_annotation_esor_raw_sm . |
|
128 |
} |
|
129 |
]]> |
|
130 |
</value> |
|
131 |
</constructor-arg> |
|
132 |
</bean> |
|
47 | 133 |
|
48 | 134 |
</beans> |
Also available in: Unified diff
Subclass AnnotatorSubprocessor for use in metacat-index (uses the embedded Solr server and SolrJ for retrieving/merging existing documents).