/metacat-index/src/main/java/edu/ucsb/nceas/metacat/index/annotation/RdfXmlSubprocessor.java - Metacat - Ecoinformatics Redmine

metacat/metacat-index/src/main/java/edu/ucsb/nceas/metacat/index/annotation/RdfXmlSubprocessor.java @ 9028

       /**
        * This program is free software; you can redistribute it and/or modify
        * it under the terms of the GNU General Public License as published by
        * the Free Software Foundation; either version 2 of the License, or
        * (at your option) any later version.
        *
        * This program is distributed in the hope that it will be useful,
        * but WITHOUT ANY WARRANTY; without even the implied warranty of
        * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
        * GNU General Public License for more details.
        *
        * You should have received a copy of the GNU General Public License
        * along with this program; if not, write to the Free Software
        * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
        */
       package edu.ucsb.nceas.metacat.index.annotation;
       import java.io.IOException;
       import java.io.InputStream;
       import java.net.MalformedURLException;
       import java.net.URI;
       import java.util.ArrayList;
       import java.util.Arrays;
       import java.util.Date;
       import java.util.HashMap;
       import java.util.HashSet;
       import java.util.Iterator;
       import java.util.List;
       import java.util.Map;
       import java.util.Set;
       import javax.xml.parsers.ParserConfigurationException;
       import org.apache.commons.logging.Log;
       import org.apache.commons.logging.LogFactory;
       import org.apache.solr.client.solrj.SolrServer;
       import org.apache.solr.client.solrj.SolrServerException;
       import org.apache.solr.client.solrj.response.QueryResponse;
       import org.apache.solr.common.SolrDocument;
       import org.apache.solr.common.params.SolrParams;
       import org.apache.solr.schema.IndexSchema;
       import org.apache.solr.servlet.SolrRequestParsers;
       import org.dataone.cn.indexer.annotation.SparqlField;
       import org.dataone.cn.indexer.annotation.TripleStoreService;
       import org.dataone.cn.indexer.convert.SolrDateConverter;
       import org.dataone.cn.indexer.parser.IDocumentSubprocessor;
       import org.dataone.cn.indexer.parser.ISolrDataField;
       import org.dataone.cn.indexer.solrhttp.SolrDoc;
       import org.dataone.cn.indexer.solrhttp.SolrElementField;
       import org.dataone.service.exceptions.NotFound;
       import org.dataone.service.exceptions.UnsupportedType;
       import org.dataone.service.types.v1.Permission;
       import org.dataone.service.types.v1.Subject;
       import org.dataone.service.types.v1.util.AccessUtil;
       import org.dataone.service.types.v1.util.AuthUtils;
       import org.dataone.service.util.DateTimeMarshaller;
       import org.xml.sax.SAXException;
       import com.hp.hpl.jena.ontology.OntModel;
       import com.hp.hpl.jena.query.Dataset;
       import com.hp.hpl.jena.query.Query;
       import com.hp.hpl.jena.query.QueryExecution;
       import com.hp.hpl.jena.query.QueryExecutionFactory;
       import com.hp.hpl.jena.query.QueryFactory;
       import com.hp.hpl.jena.query.QuerySolution;
       import com.hp.hpl.jena.query.ResultSet;
       import com.hp.hpl.jena.rdf.model.ModelFactory;
       import com.hp.hpl.jena.tdb.TDBFactory;
       import edu.ucsb.nceas.metacat.common.SolrServerFactory;
       import edu.ucsb.nceas.metacat.common.query.SolrQueryServiceController;
       import edu.ucsb.nceas.metacat.index.DistributedMapsFactory;
       /**
        * A solr index parser for an RDF/XML file.
        * The solr doc of the RDF/XML object only has the system metadata information.
        * The solr docs of the science metadata doc and data file have the annotation information.
        */
       public class RdfXmlSubprocessor implements IDocumentSubprocessor {
           private static final String QUERY ="q=id:";
           private static Log log = LogFactory.getLog(RdfXmlSubprocessor.class);
           /**
            * If xpath returns true execute the processDocument Method
            */
           private List<String> matchDocuments = null;
           private List<ISolrDataField> fieldList = new ArrayList<ISolrDataField>();
           private static SolrServer solrServer =  null;
           static {
               try {
                   solrServer = SolrServerFactory.createSolrServer();
               } catch (Exception e) {
                   log.error("RdfXmlSubprocessor - can't generate the SolrServer since - "+e.getMessage());
+              }
+          }
           /**
            * Returns true if subprocessor should be run against object
+           *
            * @param formatId the the document to be processed
            * @return true if this processor can parse the formatId
            */
           public boolean canProcess(String formatId) {
               return matchDocuments.contains(formatId);
+          }
           public List<String> getMatchDocuments() {
               return matchDocuments;
+          }
           public void setMatchDocuments(List<String> matchDocuments) {
               this.matchDocuments = matchDocuments;
+          }
           public List<ISolrDataField> getFieldList() {
       		return fieldList;
+      	}
       	public void setFieldList(List<ISolrDataField> fieldList) {
       		this.fieldList = fieldList;
+      	}
       	@Override
           public Map<String, SolrDoc> processDocument(String identifier, Map<String, SolrDoc> docs, InputStream is) throws Exception {
               SolrDoc resourceMapDoc = docs.get(identifier);
               List<SolrDoc> processedDocs = process(resourceMapDoc, is);
               Map<String, SolrDoc> processedDocsMap = new HashMap<String, SolrDoc>();
               for (SolrDoc processedDoc : processedDocs) {
                   processedDocsMap.put(processedDoc.getIdentifier(), processedDoc);
+              }
               // make sure to merge any docs that are currently being processed
               Map<String, SolrDoc> mergedDocuments = mergeDocs(docs, processedDocsMap);
               return mergedDocuments;
+          }
           private List<SolrDoc> process(SolrDoc indexDocument, InputStream is) throws Exception {
           	// get the triplestore dataset
       		Dataset dataset = TripleStoreService.getInstance().getDataset();
           	// read the annotation
           	String indexDocId = indexDocument.getIdentifier();
           	String name = indexDocId;
           	//Check if the identifier is a valid URI and if not, make it one by prepending "http://"
           	URI nameURI = new URI(indexDocId);
           	String scheme = nameURI.getScheme();
           	if((scheme == null) || (scheme.isEmpty())){
           		name = "http://" + indexDocId.toLowerCase();
+          	}
           	boolean loaded = dataset.containsNamedModel(name);
       		if (!loaded) {
       			OntModel ontModel = ModelFactory.createOntologyModel();
       			ontModel.read(is, name);
       			dataset.addNamedModel(name, ontModel);
+      		}
       		//dataset.getDefaultModel().add(ontModel);
       		// process each field query
               Map<String, SolrDoc> documentsToIndex = new HashMap<String, SolrDoc>();
       		for (ISolrDataField field: this.fieldList) {
       			String q = null;
       			if (field instanceof SparqlField) {
       				q = ((SparqlField) field).getQuery();
       				q = q.replaceAll("\\$GRAPH_NAME", name);
       				Query query = QueryFactory.create(q);
       				QueryExecution qexec = QueryExecutionFactory.create(query, dataset);
       				ResultSet results = qexec.execSelect();
       				while (results.hasNext()) {
       					SolrDoc solrDoc = null;
       					QuerySolution solution = results.next();
       					System.out.println(solution.toString());
       					// find the index document we are trying to augment with the annotation
       					if (solution.contains("pid")) {
       						String id = solution.getLiteral("pid").getString();
       						// check if anyone with permissions on the annotation document has write permission on the document we are annotating
       						boolean statementAuthorized = false;
       						try {
       							HashMap<Subject, Set<Permission>> annotationPermissionMap = AccessUtil.getPermissionMap(DistributedMapsFactory.getSystemMetadata(indexDocId).getAccessPolicy());
       							annotationPermissionMap.put(DistributedMapsFactory.getSystemMetadata(indexDocId).getRightsHolder(), new HashSet<Permission>(Arrays.asList(Permission.CHANGE_PERMISSION)));
       							statementAuthorized = AuthUtils.isAuthorized(annotationPermissionMap.keySet(), Permission.WRITE, DistributedMapsFactory.getSystemMetadata(id));
       						} catch (Exception e) {
       							log.warn("Could not check for assertion permission on original pid: " + id, e);
+      						}
       						if (!statementAuthorized) {
       							continue;
+      						}
       						// otherwise carry on with the indexing
       						solrDoc = documentsToIndex.get(id);
       						if (solrDoc == null) {
       							solrDoc = new SolrDoc();
       							solrDoc.addField(new SolrElementField(SolrElementField.FIELD_ID, id));
       							documentsToIndex.put(id, solrDoc);
+      						}
+      					}
       					// add the field to the index document
       					if (solution.contains(field.getName())) {
       						String value = solution.get(field.getName()).toString();
       						SolrElementField f = new SolrElementField(field.getName(), value);
       						if (!solrDoc.hasFieldWithValue(f.getName(), f.getValue())) {
       							solrDoc.addField(f);
+      						}
+      					}
+      				}
+      			}
+      		}
       		// clean up the triple store
       		TDBFactory.release(dataset);
       		// merge the existing index with the new[er] values
               Map<String, SolrDoc> existingDocuments = getSolrDocs(documentsToIndex.keySet());
               Map<String, SolrDoc> mergedDocuments = mergeDocs(documentsToIndex, existingDocuments);
               mergedDocuments.put(indexDocument.getIdentifier(), indexDocument);
               return new ArrayList<SolrDoc>(mergedDocuments.values());
+          }
           private Map<String, SolrDoc> getSolrDocs(Set<String> ids) throws Exception {
               Map<String, SolrDoc> list = new HashMap<String, SolrDoc>();
               if (ids != null) {
                   for (String id : ids) {
                   	SolrDoc doc = getSolrDoc(id);
                       if (doc != null) {
                           list.put(id, doc);
+                      }
+                  }
+              }
               return list;
+          }
           private Map<String, SolrDoc> mergeDocs(Map<String, SolrDoc> pending, Map<String, SolrDoc> existing) throws Exception {
           	IndexSchema indexSchema = SolrQueryServiceController.getInstance().getSchema();
           	Map<String, SolrDoc> merged = new HashMap<String, SolrDoc>();
           	Iterator<String> pendingIter = pending.keySet().iterator();
           	while (pendingIter.hasNext()) {
           		String id = pendingIter.next();
           		SolrDoc pendingDoc = pending.get(id);
           		SolrDoc existingDoc = existing.get(id);
           		SolrDoc mergedDoc = new SolrDoc();
           		if (existingDoc != null) {
           			// merge the existing fields
           			for (SolrElementField field: existingDoc.getFieldList()) {
           				mergedDoc.addField(field);
+          			}
+          		}
           		// add the pending
           		for (SolrElementField field: pendingDoc.getFieldList()) {
           			if (field.getName().equals(SolrElementField.FIELD_ID) && mergedDoc.hasField(SolrElementField.FIELD_ID)) {
           				continue;
+          			}
           			// don't transfer the copyTo fields, otherwise there are errors
       				if (indexSchema.isCopyFieldTarget(indexSchema.getField(field.getName()))) {
       					continue;
+      				}
       				// only add if we don't already have it
       				if (!mergedDoc.hasFieldWithValue(field.getName(), field.getValue())) {
       					mergedDoc.addField(field);
+      				}
+      			}
           		// include in results
       			merged.put(id, mergedDoc);
+          	}
           	return merged;
+          }
       	/*
       	 * Get the SolrDoc for the specified id
       	 */
       	public static SolrDoc getSolrDoc(String id) throws SolrServerException, MalformedURLException, UnsupportedType, NotFound, ParserConfigurationException, IOException, SAXException {
       		SolrDoc doc = null;
       		if (solrServer != null) {
       			String query = QUERY + "\"" + id + "\"";
       			SolrParams solrParams = SolrRequestParsers.parseQueryString(query);
       			QueryResponse qr = solrServer.query(solrParams);
       			if (!qr.getResults().isEmpty()) {
       				doc = new SolrDoc();
       				SolrDocument orig = qr.getResults().get(0);
       				IndexSchema indexSchema = SolrQueryServiceController.getInstance().getSchema();
       				for (String fieldName : orig.getFieldNames()) {
       					// don't transfer the copyTo fields, otherwise there are errors
       					if (indexSchema.isCopyFieldTarget(indexSchema.getField(fieldName))) {
       						continue;
+      					}
       					for (Object value : orig.getFieldValues(fieldName)) {
       						String stringValue = value.toString();
       						// special handling for dates in ISO 8601
       						if (value instanceof Date) {
       							stringValue = DateTimeMarshaller.serializeDateToUTC((Date) value);
       							SolrDateConverter converter = new SolrDateConverter();
       							stringValue = converter.convert(stringValue);
+      						}
       						SolrElementField field = new SolrElementField(fieldName, stringValue);
       						log.debug("Adding field: " + fieldName);
       						doc.addField(field);
+      					}
+      				}
+      			}
+      		}
       		return doc;
+      	}
+      }

(1-1/1)

Project

General

Profile

Metacat