Project

General

Profile

1
package edu.ucsb.nceas.metacat.annotation;
2

    
3
import java.io.InputStream;
4
import java.io.InputStreamReader;
5
import java.io.StringWriter;
6
import java.net.URL;
7
import java.net.URLEncoder;
8
import java.sql.PreparedStatement;
9
import java.sql.SQLException;
10
import java.util.ArrayList;
11
import java.util.Arrays;
12
import java.util.Iterator;
13
import java.util.List;
14
import java.util.Map;
15
import java.util.Vector;
16

    
17
import org.apache.log4j.Logger;
18
import org.apache.wicket.protocol.http.mock.MockHttpServletRequest;
19
import org.dataone.service.types.v1.Identifier;
20
import org.dataone.service.types.v1.Session;
21
import org.dataone.service.types.v1.Subject;
22
import org.ecoinformatics.datamanager.parser.Attribute;
23
import org.ecoinformatics.datamanager.parser.DataPackage;
24
import org.ecoinformatics.datamanager.parser.Entity;
25
import org.ecoinformatics.datamanager.parser.generic.DataPackageParserInterface;
26
import org.ecoinformatics.datamanager.parser.generic.Eml200DataPackageParser;
27
import org.w3c.dom.Document;
28
import org.w3c.dom.NodeList;
29

    
30
import com.hp.hpl.jena.ontology.AllValuesFromRestriction;
31
import com.hp.hpl.jena.ontology.Individual;
32
import com.hp.hpl.jena.ontology.ObjectProperty;
33
import com.hp.hpl.jena.ontology.OntClass;
34
import com.hp.hpl.jena.ontology.OntModel;
35
import com.hp.hpl.jena.ontology.Ontology;
36
import com.hp.hpl.jena.rdf.model.ModelFactory;
37
import com.hp.hpl.jena.rdf.model.Property;
38
import com.hp.hpl.jena.rdf.model.Resource;
39
import com.hp.hpl.jena.util.iterator.ExtendedIterator;
40

    
41
import edu.ucsb.nceas.metacat.DBUtil;
42
import edu.ucsb.nceas.metacat.DocumentImpl;
43
import edu.ucsb.nceas.metacat.IdentifierManager;
44
import edu.ucsb.nceas.metacat.McdbDocNotFoundException;
45
import edu.ucsb.nceas.metacat.database.DBConnection;
46
import edu.ucsb.nceas.metacat.database.DBConnectionPool;
47
import edu.ucsb.nceas.metacat.dataone.MNodeService;
48
import edu.ucsb.nceas.metacat.properties.PropertyService;
49
import edu.ucsb.nceas.metacat.replication.ReplicationService;
50
import edu.ucsb.nceas.metacat.util.DocumentUtil;
51
import edu.ucsb.nceas.utilities.SortedProperties;
52
import edu.ucsb.nceas.utilities.XMLUtilities;
53

    
54
public class DatapackageSummarizer {
55

    
56
	private static Logger logMetacat = Logger.getLogger(DatapackageSummarizer.class);
57
	
58
	public static String rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
59
	public static String rdfs = "http://www.w3.org/2000/01/rdf-schema#";
60
	public static String owl = "http://www.w3.org/2002/07/owl#";
61
	public static String oboe = "http://ecoinformatics.org/oboe/oboe.1.0/oboe.owl#";
62
	public static String oboe_core = "http://ecoinformatics.org/oboe/oboe.1.0/oboe-core.owl#";
63
	public static String oa = "http://www.w3.org/ns/oa#";
64
	public static String oa_source = "http://www.w3.org/ns/oa.rdf";
65
	public static String dcterms = "http://purl.org/dc/terms/";
66
	public static String dcterms_source = "http://dublincore.org/2012/06/14/dcterms.rdf";
67
	public static String foaf = "http://xmlns.com/foaf/0.1/";
68
	public static String foaf_source = "http://xmlns.com/foaf/spec/index.rdf";
69
    public static String prov = "http://www.w3.org/ns/prov#";
70
    public static String prov_source = "http://www.w3.org/ns/prov.owl";
71
    public static String cito =  "http://purl.org/spar/cito/";
72
    
73
    // for looking up concepts in BioPortal
74
    static final String REST_URL = "http://data.bioontology.org";
75
    static final String API_KEY = "24e4775e-54e0-11e0-9d7b-005056aa3316";
76
    
77
    // package visibility for testing only
78
    boolean randomize = false;
79

    
80
    /**
81
     * Generate annotation for given metadata identifier
82
     * @param metadataPid
83
     */
84
    public String generateAnnotation(Identifier metadataPid) throws Exception {
85
    	
86
    	DataPackage dataPackage = this.getDataPackage(metadataPid);
87
    	
88
		OntModel m = ModelFactory.createOntologyModel();
89
		Ontology ont = m.createOntology("http://annotation/" + metadataPid.getValue());
90
		
91
		// TODO: import the ontologies we use
92
		ont.addImport(m.createResource(oboe));
93
		m.addSubModel(ModelFactory.createOntologyModel().read(oboe));
94
		
95
		ont.addImport(m.createResource(oa));
96
		m.addSubModel(ModelFactory.createOntologyModel().read(oa_source));
97

    
98
		ont.addImport(m.createResource(dcterms));
99
		m.addSubModel(ModelFactory.createOntologyModel().read(dcterms_source));
100

    
101
		ont.addImport(m.createResource(foaf));
102
		m.addSubModel(ModelFactory.createOntologyModel().read(foaf_source));
103
		
104
		ont.addImport(m.createResource(prov));
105
		//m.addSubModel(ModelFactory.createOntologyModel().read(prov_source));
106

    
107
		ont.addImport(m.createResource(cito));
108
		
109
		// properties
110
		ObjectProperty hasBodyProperty = m.getObjectProperty(oa + "hasBody");
111
		ObjectProperty hasTargetProperty = m.getObjectProperty(oa + "hasTarget");
112
		ObjectProperty hasSourceProperty = m.getObjectProperty(oa + "hasSource");
113
		ObjectProperty hasSelectorProperty = m.getObjectProperty(oa + "hasSelector");
114
		ObjectProperty annotatedByProperty = m.getObjectProperty(oa + "annotatedBy");
115
		Property identifierProperty = m.getProperty(dcterms + "identifier");
116
		Property conformsToProperty = m.getProperty(dcterms + "conformsTo");
117
		Property nameProperty = m.getProperty(foaf + "name");
118
		Property rdfValue = m.getProperty(rdf + "value");
119
		
120
		ObjectProperty ofCharacteristic = m.getObjectProperty(oboe_core + "ofCharacteristic");
121
		ObjectProperty usesStandard = m.getObjectProperty(oboe_core + "usesStandard");
122

    
123
		// classes
124
		OntClass measurementClass =  m.getOntClass(oboe_core + "Measurement");
125
		OntClass characteristicClass = m.getOntClass(oboe_core + "Characteristic");
126
		OntClass standardClass =  m.getOntClass(oboe_core + "Standard");
127
		
128
		Resource annotationClass =  m.getOntClass(oa + "Annotation");
129
		Resource specificResourceClass =  m.getOntClass(oa + "SpecificResource");
130
		Resource fragmentSelectorClass =  m.getOntClass(oa + "FragmentSelector");
131
		Resource entityClass =  m.getResource(prov + "Entity");
132
		Resource personClass =  m.getResource(prov + "Person");
133
		
134
		int cnt = 1;
135

    
136
		// these apply to every attribute annotation
137
		Individual meta1 = m.createIndividual(ont.getURI() + "#meta" + cnt, entityClass);
138
		Individual p1 = m.createIndividual(ont.getURI() + "#person" + cnt, personClass);
139
		p1.addProperty(nameProperty, "Ben Leinfelder");
140
		meta1.addProperty(identifierProperty, metadataPid.getValue());
141

    
142
		// loop through the tables and attributes
143
		Entity[] entities = dataPackage.getEntityList();
144
		for (Entity entity: entities) {
145
			String entityName = entity.getName();
146
			logMetacat.debug("Entity name: " + entityName);
147
			Attribute[] attributes = entity.getAttributeList().getAttributes();
148
			int attributeCount = 1;
149
			for (Attribute attribute: attributes) {
150
				
151
				String attributeName = attribute.getName();
152
				String attributeLabel = attribute.getLabel();
153
				String attributeDefinition = attribute.getDefinition();
154
				String attributeType = attribute.getAttributeType();
155
				String attributeScale = attribute.getMeasurementScale();
156
				String attributeUnitType = attribute.getUnitType();
157
				String attributeUnit = attribute.getUnit();
158
				String attributeDomain = attribute.getDomain().getClass().getSimpleName();
159

    
160
				logMetacat.debug("Attribute name: " + attributeName);
161
				logMetacat.debug("Attribute label: " + attributeLabel);
162
				logMetacat.debug("Attribute definition: " + attributeDefinition);
163
				logMetacat.debug("Attribute type: " + attributeType);
164
				logMetacat.debug("Attribute scale: " + attributeScale);
165
				logMetacat.debug("Attribute unit type: " + attributeUnitType);
166
				logMetacat.debug("Attribute unit: " + attributeUnit);
167
				logMetacat.debug("Attribute domain: " + attributeDomain);
168
			
169
				// look up the characteristic or standard subclasses
170
				Resource standard = this.lookupStandard(standardClass, attribute);
171
				Resource characteristic = this.lookupCharacteristic(characteristicClass, attribute);
172
				
173
				if (standard != null || characteristic != null) {
174
					
175
					// instances
176
					Individual m1 = m.createIndividual(ont.getURI() + "#measurement" + cnt, measurementClass);
177
					Individual a1 = m.createIndividual(ont.getURI() + "#annotation" + cnt, annotationClass);
178
					Individual t1 = m.createIndividual(ont.getURI() + "#target" + cnt, specificResourceClass);
179
					String xpointer = "xpointer(/eml/dataSet/" + cnt + "/attributeList/" + attributeCount + ")";
180
					Individual s1 = m.createIndividual(ont.getURI() + "#" + xpointer, fragmentSelectorClass);
181
					s1.addLiteral(rdfValue, xpointer);
182
					s1.addProperty(conformsToProperty, "http://www.w3.org/TR/xptr/");
183
					
184
					// statements about the annotation
185
					a1.addProperty(hasBodyProperty, m1);
186
					a1.addProperty(hasTargetProperty, t1);
187
					t1.addProperty(hasSourceProperty, meta1);
188
					t1.addProperty(hasSelectorProperty, s1);
189
					a1.addProperty(annotatedByProperty, p1);
190
					
191
					// describe the measurement in terms of restrictions
192
					if (standard != null) {
193
						AllValuesFromRestriction avfr = m.createAllValuesFromRestriction(null, usesStandard, standard);
194
						m1.addOntClass(avfr);
195
					}
196
					if (characteristic != null) {
197
						AllValuesFromRestriction avfr = m.createAllValuesFromRestriction(null, ofCharacteristic, characteristic);
198
						m1.addOntClass(avfr);
199
					}
200
					cnt++;
201
				}
202
				
203
			}		
204
		}
205
		
206
		StringWriter sw = new StringWriter();
207
		// only write the base model
208
		//m.write(sw, "RDF/XML-ABBREV");
209
		m.write(sw, null);
210

    
211
		return sw.toString();
212
		
213
	}
214
	
215
	private Resource lookupStandard(OntClass standardClass, Attribute attribute) {
216
		// what's our unit?
217
		String unit = attribute.getUnit().toLowerCase();
218
		List<String> tokens = Arrays.asList(unit.split(" "));
219

    
220
		boolean found = false;
221
		ExtendedIterator iter = standardClass.listSubClasses(false);
222
		if (randomize) {
223
			List subclasses = iter.toList();
224
			int size = subclasses.size();
225
			Long index = new Long(Math.round(Math.floor((Math.random() * (size-1)))));
226
			OntClass subclass = (OntClass) subclasses.get( index.intValue() );
227
			return subclass;
228
		}
229
		while (iter.hasNext()) {
230
			OntClass subclass = (OntClass) iter.next();
231
			String subclassName = subclass.getLocalName().toLowerCase();
232
			logMetacat.debug("subclass: " + subclassName);
233
			if (tokens.contains(subclassName)) {
234
				found = true;
235
			}
236
			if (subclass.hasLabel(unit, null)) {
237
				found = true;
238
			}
239
			if (found) {
240
				return subclass;
241
			}
242
		}
243
		// try to look it up if we got this far
244
		return this.lookupRemoteAnnotationClass(standardClass, unit);
245
	}
246
	
247
	private Resource lookupCharacteristic(OntClass characteristicClass, Attribute attribute) {
248
		// what's our label?
249
		String label = attribute.getLabel().toLowerCase();
250
		List<String> tokens = Arrays.asList(label.split(" "));
251
		
252
		boolean found = false;
253
		// find something that matches
254
		ExtendedIterator iter = characteristicClass.listSubClasses();
255
		if (randomize) {
256
			List subclasses = iter.toList();
257
			int size = subclasses.size();
258
			Long index = new Long(Math.round(Math.floor((Math.random() * (size-1)))));
259
			OntClass subclass = (OntClass) subclasses.get( index.intValue() );
260
			return subclass;
261
		}
262
		while (iter.hasNext()) {
263
			OntClass subclass = (OntClass) iter.next();
264
			String subclassName = subclass.getLocalName().toLowerCase();
265
			logMetacat.debug("subclass: " + subclassName);
266
			if (tokens.contains(subclassName)) {
267
				found = true;
268
			}
269
			if (subclass.hasLabel(label, null)) {
270
				found = true;
271
			}
272
			if (found) {
273
				return subclass;
274
			}
275
		}
276
		
277
		// try to look it up if we got this far
278
		return this.lookupRemoteAnnotationClass(characteristicClass, attribute.getDefinition());
279
		
280
	}
281
	
282
	private Resource lookupRemoteAnnotationClass(OntClass superClass, String text) {
283
		
284
		
285
		try {
286
			
287
			String urlParameters = "apikey=" + API_KEY;
288
			urlParameters += "&format=xml";
289
//			urlParameters += "&ontologies=OBOE-SBC";
290
			urlParameters += "&ontologies=SWEET";
291
			urlParameters += "&text=" + URLEncoder.encode(text, "UTF-8");
292
			
293
			String url = REST_URL + "/annotator?" + urlParameters ;
294
			URL restURL = new URL(url);
295
			InputStream is = ReplicationService.getURLStream(restURL);
296
			Document doc = XMLUtilities.getXMLReaderAsDOMDocument(new InputStreamReader(is, "UTF-8"));
297
			NodeList classNodeList = XMLUtilities.getNodeListWithXPath(doc, "//annotation/annotatedClass/id");
298
			if (classNodeList != null && classNodeList.getLength() > 0) {
299
				String classURI = classNodeList.item(0).getFirstChild().getNodeValue();
300
				logMetacat.info("annotator suggested: " + classURI);
301
				Resource subclass = superClass.getModel().getResource(classURI);
302
				// TODO: check that it is a subclass of superClass?
303
				return subclass;
304
			}
305
		} catch (Exception e) {
306
			logMetacat.error("Could not lookup BioPortal annotation for text= " + text, e);
307
		}
308
		
309
		return null;
310
	}
311
	
312
	private DataPackage getDataPackage(Identifier pid) throws Exception {
313
		// for using the MN API as the MN itself
314
		MockHttpServletRequest request = new MockHttpServletRequest(null, null, null);
315
		Session session = new Session();
316
        Subject subject = MNodeService.getInstance(request).getCapabilities().getSubject(0);
317
        session.setSubject(subject);
318
		InputStream emlStream = MNodeService.getInstance(request).get(session, pid);
319

    
320
		// parse the metadata
321
		DataPackageParserInterface parser = new Eml200DataPackageParser();
322
		parser.parse(emlStream);
323
		DataPackage dataPackage = parser.getDataPackage();
324
		return dataPackage;
325
	}
326

    
327
	private void summarize(List<Identifier> identifiers) throws SQLException {
328
		
329
		DBConnection dbconn = null;
330

    
331
		try {
332
			dbconn = DBConnectionPool.getDBConnection("DatapackageSummarizer.summarize");
333
			
334
			PreparedStatement dropStatement = dbconn.prepareStatement("DROP TABLE IF EXISTS entity_summary");
335
			dropStatement.execute();
336
	
337
			PreparedStatement createStatement = dbconn.prepareStatement(
338
					"CREATE TABLE entity_summary (" +
339
					"guid text, " +
340
					"title text, " +
341
					"entity text," +
342
					"attributeName text," +
343
					"attributeLabel text," +
344
					"attributeDefinition text," +
345
					"attributeType text," +
346
					"attributeScale text," +
347
					"attributeUnitType text," +
348
					"attributeUnit text," +
349
					"attributeDomain text" +
350
					")");
351
			createStatement.execute();
352
			
353
			PreparedStatement insertStatement = dbconn.prepareStatement(
354
					"INSERT INTO entity_summary " +
355
					"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
356
			
357
			for (Identifier pid: identifiers) {
358
			
359
				logMetacat.debug("Parsing pid: " + pid.getValue());
360
				
361
				try {
362
					
363
					// get the package
364
					DataPackage dataPackage = this.getDataPackage(pid);
365
					String title = dataPackage.getTitle();
366
					logMetacat.debug("Title: " + title);
367
					
368
					Entity[] entities = dataPackage.getEntityList();
369
					if (entities != null) {
370
						for (Entity entity: entities) {
371
							String entityName = entity.getName();
372
							logMetacat.debug("Entity name: " + entityName);
373
							Attribute[] attributes = entity.getAttributeList().getAttributes();
374
							for (Attribute attribute: attributes) {
375
								String attributeName = attribute.getName();
376
								String attributeLabel = attribute.getLabel();
377
								String attributeDefinition = attribute.getDefinition();
378
								String attributeType = attribute.getAttributeType();
379
								String attributeScale = attribute.getMeasurementScale();
380
								String attributeUnitType = attribute.getUnitType();
381
								String attributeUnit = attribute.getUnit();
382
								String attributeDomain = attribute.getDomain().getClass().getSimpleName();
383
	
384
								logMetacat.debug("Attribute name: " + attributeName);
385
								logMetacat.debug("Attribute label: " + attributeLabel);
386
								logMetacat.debug("Attribute definition: " + attributeDefinition);
387
								logMetacat.debug("Attribute type: " + attributeType);
388
								logMetacat.debug("Attribute scale: " + attributeScale);
389
								logMetacat.debug("Attribute unit type: " + attributeUnitType);
390
								logMetacat.debug("Attribute unit: " + attributeUnit);
391
								logMetacat.debug("Attribute domain: " + attributeDomain);
392
								
393
								// set the values for this attribute
394
								insertStatement.setString(1, pid.getValue());
395
								insertStatement.setString(2, title);
396
								insertStatement.setString(3, entityName);
397
								insertStatement.setString(4, attributeName);
398
								insertStatement.setString(5, attributeLabel);
399
								insertStatement.setString(6, attributeDefinition);
400
								insertStatement.setString(7, attributeType);
401
								insertStatement.setString(8, attributeScale);
402
								insertStatement.setString(9, attributeUnitType);
403
								insertStatement.setString(10, attributeUnit);
404
								insertStatement.setString(11, attributeDomain);
405
								insertStatement.execute();
406
								
407
							}		
408
						}
409
					}
410
					
411
				} catch (Exception e) {
412
					logMetacat.warn("error parsing metadata for: " + pid.getValue(), e);
413
				}
414
			}
415
		} catch (SQLException sqle) {
416
			// just throw it
417
			throw sqle;
418
		} finally {
419
			if (dbconn != null) {
420
				DBConnectionPool.returnDBConnection(dbconn, 0);
421
				dbconn.close();
422
			}
423
		}
424
	}
425
	
426
	public static void main(String[] args) throws Exception {
427
		// set up the properties based on the test/deployed configuration of the workspace
428
			SortedProperties testProperties = new SortedProperties("test/test.properties");
429
			testProperties.load();
430
			String metacatContextDir = testProperties.getProperty("metacat.contextDir");
431
			PropertyService.getInstance(metacatContextDir + "/WEB-INF");
432
			
433
			testGenerate();
434
//			testSummary();
435
			System.exit(0);
436
	}
437
	
438
	public static void testGenerate() throws Exception {
439
		Identifier metadataPid = new Identifier();
440
		metadataPid.setValue("doi:10.5072/FK2445ZN4");
441
		DatapackageSummarizer ds = new DatapackageSummarizer();
442
		String rdfString = ds.generateAnnotation(metadataPid);
443
		logMetacat.info("RDF annotation: \n" + rdfString);
444
		
445
	}
446
	
447
	public static void testSummary() throws Exception {
448
		
449
		// summarize the packages
450
		DatapackageSummarizer ds = new DatapackageSummarizer();
451
		List<Identifier> identifiers = new ArrayList<Identifier>();
452
		Map<Integer, String> serverCodes = ReplicationService.getServerCodes();
453

    
454
		// select the metadata ids we want to summarize
455
		boolean includeReplicas = true;
456
		Iterator<Integer> codeIter = Arrays.asList(new Integer[] {1}).iterator();
457
		if (includeReplicas ) {
458
			codeIter = serverCodes.keySet().iterator();
459
		}
460
		
461
		Vector<String> idList = new Vector<String>();
462
		while (codeIter.hasNext()) {
463
			int serverLocation = codeIter.next();
464
			Vector<String> idList0 = DBUtil.getAllDocidsByType(DocumentImpl.EML2_0_0NAMESPACE, false, serverLocation);
465
			Vector<String> idList1 = DBUtil.getAllDocidsByType(DocumentImpl.EML2_0_1NAMESPACE, false, serverLocation);
466
			Vector<String> idList2 = DBUtil.getAllDocidsByType(DocumentImpl.EML2_1_0NAMESPACE, false, serverLocation);
467
			Vector<String> idList3 = DBUtil.getAllDocidsByType(DocumentImpl.EML2_1_1NAMESPACE, false, serverLocation);
468
			
469
			idList.addAll(idList0);
470
			idList.addAll(idList1);
471
			idList.addAll(idList2);
472
			idList.addAll(idList3);
473
		
474
		}
475
		
476
		// go through all the identifiers now
477
		for (String localId : idList) {
478
			try {
479
				String guid = IdentifierManager.getInstance().getGUID(
480
						DocumentUtil.getDocIdFromAccessionNumber(localId), 
481
						DocumentUtil.getRevisionFromAccessionNumber(localId));
482
				Identifier pid = new Identifier();
483
				pid.setValue(guid);
484
				identifiers.add(pid);
485
			} catch (McdbDocNotFoundException nfe) {
486
				// just skip it
487
				continue;
488
			}
489
		}
490
		ds.summarize(identifiers);
491
		System.exit(0);
492
	}
493
	
494
}
    (1-1/1)