/ - Diff - Metacat - Ecoinformatics Redmine

« Previous | Next »

Revision 9058

Added by ben leinfelder about 10 years ago

Moved the RDF/XML subprocessor to the cn-index project.

     /**
      * This program is free software; you can redistribute it and/or modify
      * it under the terms of the GNU General Public License as published by
      * the Free Software Foundation; either version 2 of the License, or
      * (at your option) any later version.
+     *
      * This program is distributed in the hope that it will be useful,
      * but WITHOUT ANY WARRANTY; without even the implied warranty of
      * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      * GNU General Public License for more details.
+     *
      * You should have received a copy of the GNU General Public License
      * along with this program; if not, write to the Free Software
      * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
      */
     package edu.ucsb.nceas.metacat.index.annotation;
     import java.io.IOException;
     import java.io.InputStream;
     import java.net.MalformedURLException;
     import java.net.URI;
     import java.util.ArrayList;
     import java.util.Arrays;
     import java.util.Date;
     import java.util.HashMap;
     import java.util.HashSet;
     import java.util.Iterator;
     import java.util.List;
     import java.util.Map;
     import java.util.Set;
     import javax.xml.parsers.ParserConfigurationException;
     import javax.xml.xpath.XPathExpressionException;
     import org.apache.commons.codec.EncoderException;
     import org.apache.commons.logging.Log;
     import org.apache.commons.logging.LogFactory;
     import org.apache.solr.client.solrj.SolrServer;
     import org.apache.solr.client.solrj.SolrServerException;
     import org.apache.solr.client.solrj.response.QueryResponse;
     import org.apache.solr.common.SolrDocument;
     import org.apache.solr.common.params.SolrParams;
     import org.apache.solr.schema.IndexSchema;
     import org.apache.solr.servlet.SolrRequestParsers;
     import org.dataone.cn.indexer.annotation.SparqlField;
     import org.dataone.cn.indexer.annotation.TripleStoreService;
     import org.dataone.cn.indexer.convert.SolrDateConverter;
     import org.dataone.cn.indexer.parser.IDocumentSubprocessor;
     import org.dataone.cn.indexer.parser.ISolrDataField;
     import org.dataone.cn.indexer.solrhttp.SolrDoc;
     import org.dataone.cn.indexer.solrhttp.SolrElementField;
     import org.dataone.service.exceptions.NotFound;
     import org.dataone.service.exceptions.UnsupportedType;
     import org.dataone.service.types.v1.Permission;
     import org.dataone.service.types.v1.Subject;
     import org.dataone.service.types.v1.util.AccessUtil;
     import org.dataone.service.types.v1.util.AuthUtils;
     import org.dataone.service.util.DateTimeMarshaller;
     import org.xml.sax.SAXException;
     import com.hp.hpl.jena.ontology.OntModel;
     import com.hp.hpl.jena.query.Dataset;
     import com.hp.hpl.jena.query.Query;
     import com.hp.hpl.jena.query.QueryExecution;
     import com.hp.hpl.jena.query.QueryExecutionFactory;
     import com.hp.hpl.jena.query.QueryFactory;
     import com.hp.hpl.jena.query.QuerySolution;
     import com.hp.hpl.jena.query.ResultSet;
     import com.hp.hpl.jena.rdf.model.ModelFactory;
     import com.hp.hpl.jena.tdb.TDBFactory;
     import edu.ucsb.nceas.metacat.common.SolrServerFactory;
     import edu.ucsb.nceas.metacat.common.query.SolrQueryServiceController;
     import edu.ucsb.nceas.metacat.index.DistributedMapsFactory;
     /**
      * A Solr index parser for an RDF/XML annotation file.
      * The Solr doc of the RDF/XML object itself only carries the system metadata information.
      * The Solr docs of the annotated science metadata docs and data files receive the annotation information.
      */
     public class RdfXmlSubprocessor implements IDocumentSubprocessor {
         /** Query-string prefix used by getSolrDoc to look up an index document by id. */
         private static final String QUERY ="q=id:";
         private static final Log log = LogFactory.getLog(RdfXmlSubprocessor.class);
         /**
          * The format identifiers this subprocessor handles; canProcess checks
          * membership in this list. Null until setMatchDocuments is called.
          */
         private List<String> matchDocuments = null;
         /** The field definitions evaluated for each processed document. */
         private List<ISolrDataField> fieldList = new ArrayList<ISolrDataField>();
         /** Shared Solr server; remains null when it could not be created. */
         private static SolrServer solrServer =  null;
         static {
             try {
                 solrServer = SolrServerFactory.createSolrServer();
             } catch (Exception e) {
                 // Keep the class loadable without a server, but log the full
                 // exception so the root cause is not reduced to its message.
                 log.error("RdfXmlSubprocessor - can't generate the SolrServer since - "+e.getMessage(), e);
             }
         }
         /**
          * Returns true if subprocessor should be run against object
+         *
          * @param formatId the the document to be processed
          * @return true if this processor can parse the formatId
          */
         public boolean canProcess(String formatId) {
             return matchDocuments.contains(formatId);
+        }
         public List<String> getMatchDocuments() {
             return matchDocuments;
+        }
         public void setMatchDocuments(List<String> matchDocuments) {
             this.matchDocuments = matchDocuments;
+        }
         public List<ISolrDataField> getFieldList() {
     		return fieldList;
+    	}
     	public void setFieldList(List<ISolrDataField> fieldList) {
     		this.fieldList = fieldList;
+    	}
     	@Override
         public Map<String, SolrDoc> processDocument(String identifier, Map<String, SolrDoc> docs, InputStream is) throws Exception {
             SolrDoc resourceMapDoc = docs.get(identifier);
             List<SolrDoc> processedDocs = process(resourceMapDoc, is);
             Map<String, SolrDoc> processedDocsMap = new HashMap<String, SolrDoc>();
             for (SolrDoc processedDoc : processedDocs) {
                 processedDocsMap.put(processedDoc.getIdentifier(), processedDoc);
+            }
             // make sure to merge any docs that are currently being processed
             Map<String, SolrDoc> mergedDocuments = mergeDocs(docs, processedDocsMap);
             return mergedDocuments;
+        }
         private List<SolrDoc> process(SolrDoc indexDocument, InputStream is) throws Exception {
         	// get the triplestore dataset
     		Dataset dataset = TripleStoreService.getInstance().getDataset();
         	// read the annotation
         	String indexDocId = indexDocument.getIdentifier();
         	String name = indexDocId;
         	//Check if the identifier is a valid URI and if not, make it one by prepending "http://"
         	URI nameURI = new URI(indexDocId);
         	String scheme = nameURI.getScheme();
         	if((scheme == null) || (scheme.isEmpty())){
         		name = "http://" + indexDocId.toLowerCase();
+        	}
         	boolean loaded = dataset.containsNamedModel(name);
     		if (!loaded) {
     			OntModel ontModel = ModelFactory.createOntologyModel();
     			ontModel.read(is, name);
     			dataset.addNamedModel(name, ontModel);
+    		}
     		//dataset.getDefaultModel().add(ontModel);
     		// process each field query
             Map<String, SolrDoc> documentsToIndex = new HashMap<String, SolrDoc>();
     		for (ISolrDataField field: this.fieldList) {
     			String q = null;
     			if (field instanceof SparqlField) {
     				q = ((SparqlField) field).getQuery();
     				q = q.replaceAll("\\$GRAPH_NAME", name);
     				Query query = QueryFactory.create(q);
     				QueryExecution qexec = QueryExecutionFactory.create(query, dataset);
     				ResultSet results = qexec.execSelect();
     				while (results.hasNext()) {
     					SolrDoc solrDoc = null;
     					QuerySolution solution = results.next();
     					System.out.println(solution.toString());
     					// find the index document we are trying to augment with the annotation
     					if (solution.contains("pid")) {
     						String id = solution.getLiteral("pid").getString();
     						// check if anyone with permissions on the annotation document has write permission on the document we are annotating
     						boolean statementAuthorized = false;
     						try {
     							HashMap<Subject, Set<Permission>> annotationPermissionMap = AccessUtil.getPermissionMap(DistributedMapsFactory.getSystemMetadata(indexDocId).getAccessPolicy());
     							annotationPermissionMap.put(DistributedMapsFactory.getSystemMetadata(indexDocId).getRightsHolder(), new HashSet<Permission>(Arrays.asList(Permission.CHANGE_PERMISSION)));
     							statementAuthorized = AuthUtils.isAuthorized(annotationPermissionMap.keySet(), Permission.WRITE, DistributedMapsFactory.getSystemMetadata(id));
     						} catch (Exception e) {
     							log.warn("Could not check for assertion permission on original pid: " + id, e);
+    						}
     						if (!statementAuthorized) {
     							continue;
+    						}
     						// otherwise carry on with the indexing
     						solrDoc = documentsToIndex.get(id);
     						if (solrDoc == null) {
     							solrDoc = new SolrDoc();
     							solrDoc.addField(new SolrElementField(SolrElementField.FIELD_ID, id));
     							documentsToIndex.put(id, solrDoc);
+    						}
+    					}
     					// add the field to the index document
     					if (solution.contains(field.getName())) {
     						String value = solution.get(field.getName()).toString();
     						SolrElementField f = new SolrElementField(field.getName(), value);
     						if (!solrDoc.hasFieldWithValue(f.getName(), f.getValue())) {
     							solrDoc.addField(f);
+    						}
+    					}
+    				}
+    			}
+    		}
     		// clean up the triple store
     		TDBFactory.release(dataset);
     		// merge the existing index with the new[er] values
             Map<String, SolrDoc> existingDocuments = getSolrDocs(documentsToIndex.keySet());
             Map<String, SolrDoc> mergedDocuments = mergeDocs(documentsToIndex, existingDocuments);
             mergedDocuments.put(indexDocument.getIdentifier(), indexDocument);
             return new ArrayList<SolrDoc>(mergedDocuments.values());
+        }
         private Map<String, SolrDoc> getSolrDocs(Set<String> ids) throws Exception {
             Map<String, SolrDoc> list = new HashMap<String, SolrDoc>();
             if (ids != null) {
                 for (String id : ids) {
                 	SolrDoc doc = getSolrDoc(id);
                     if (doc != null) {
                         list.put(id, doc);
+                    }
+                }
+            }
             return list;
+        }
         private Map<String, SolrDoc> mergeDocs(Map<String, SolrDoc> pending, Map<String, SolrDoc> existing) throws Exception {
         	IndexSchema indexSchema = SolrQueryServiceController.getInstance().getSchema();
         	Map<String, SolrDoc> merged = new HashMap<String, SolrDoc>();
         	Iterator<String> pendingIter = pending.keySet().iterator();
         	while (pendingIter.hasNext()) {
         		String id = pendingIter.next();
         		SolrDoc pendingDoc = pending.get(id);
         		SolrDoc existingDoc = existing.get(id);
         		SolrDoc mergedDoc = new SolrDoc();
         		if (existingDoc != null) {
         			// merge the existing fields
         			for (SolrElementField field: existingDoc.getFieldList()) {
         				mergedDoc.addField(field);
+        			}
+        		}
         		// add the pending
         		for (SolrElementField field: pendingDoc.getFieldList()) {
         			if (field.getName().equals(SolrElementField.FIELD_ID) && mergedDoc.hasField(SolrElementField.FIELD_ID)) {
         				continue;
+        			}
         			// don't transfer the copyTo fields, otherwise there are errors
     				if (indexSchema.isCopyFieldTarget(indexSchema.getField(field.getName()))) {
     					continue;
+    				}
     				// only add if we don't already have it
     				if (!mergedDoc.hasFieldWithValue(field.getName(), field.getValue())) {
     					mergedDoc.addField(field);
+    				}
+    			}
         		// include in results
     			merged.put(id, mergedDoc);
+        	}
         	return merged;
+        }
     	/*
     	 * Get the SolrDoc for the specified id
     	 */
     	public static SolrDoc getSolrDoc(String id) throws SolrServerException, MalformedURLException, UnsupportedType, NotFound, ParserConfigurationException, IOException, SAXException {
     		SolrDoc doc = null;
     		if (solrServer != null) {
     			String query = QUERY + "\"" + id + "\"";
     			SolrParams solrParams = SolrRequestParsers.parseQueryString(query);
     			QueryResponse qr = solrServer.query(solrParams);
     			if (!qr.getResults().isEmpty()) {
     				doc = new SolrDoc();
     				SolrDocument orig = qr.getResults().get(0);
     				IndexSchema indexSchema = SolrQueryServiceController.getInstance().getSchema();
     				for (String fieldName : orig.getFieldNames()) {
     					// don't transfer the copyTo fields, otherwise there are errors
     					if (indexSchema.isCopyFieldTarget(indexSchema.getField(fieldName))) {
     						continue;
+    					}
     					for (Object value : orig.getFieldValues(fieldName)) {
     						String stringValue = value.toString();
     						// special handling for dates in ISO 8601
     						if (value instanceof Date) {
     							stringValue = DateTimeMarshaller.serializeDateToUTC((Date) value);
     							SolrDateConverter converter = new SolrDateConverter();
     							stringValue = converter.convert(stringValue);
+    						}
     						SolrElementField field = new SolrElementField(fieldName, stringValue);
     						log.debug("Adding field: " + fieldName);
     						doc.addField(field);
+    					}
+    				}
+    			}
+    		}
     		return doc;
+    	}
     	@Override
     	public SolrDoc mergeWithIndexedDocument(SolrDoc indexDocument)
     			throws IOException, EncoderException, XPathExpressionException {
     		// TODO: actually perform merging
     		return indexDocument;
+    	}
+    }

Also available in: Unified diff

Project

General

Profile

Metacat

Revision 9058

Added by ben leinfelder about 10 years ago