Project

General

Profile

1
package edu.ucsb.nceas.metacat.annotation;
2

    
3
import java.io.InputStream;
4
import java.io.StringWriter;
5
import java.sql.PreparedStatement;
6
import java.sql.SQLException;
7
import java.util.ArrayList;
8
import java.util.Arrays;
9
import java.util.Iterator;
10
import java.util.List;
11
import java.util.Map;
12
import java.util.Vector;
13

    
14
import org.apache.log4j.Logger;
15
import org.apache.wicket.protocol.http.mock.MockHttpServletRequest;
16
import org.dataone.service.types.v1.Identifier;
17
import org.dataone.service.types.v1.Session;
18
import org.dataone.service.types.v1.Subject;
19
import org.ecoinformatics.datamanager.parser.Attribute;
20
import org.ecoinformatics.datamanager.parser.DataPackage;
21
import org.ecoinformatics.datamanager.parser.Entity;
22
import org.ecoinformatics.datamanager.parser.generic.DataPackageParserInterface;
23
import org.ecoinformatics.datamanager.parser.generic.Eml200DataPackageParser;
24

    
25
import com.hp.hpl.jena.ontology.AllValuesFromRestriction;
26
import com.hp.hpl.jena.ontology.Individual;
27
import com.hp.hpl.jena.ontology.ObjectProperty;
28
import com.hp.hpl.jena.ontology.OntClass;
29
import com.hp.hpl.jena.ontology.OntModel;
30
import com.hp.hpl.jena.ontology.Ontology;
31
import com.hp.hpl.jena.rdf.model.ModelFactory;
32
import com.hp.hpl.jena.rdf.model.Property;
33
import com.hp.hpl.jena.rdf.model.Resource;
34
import com.hp.hpl.jena.util.iterator.ExtendedIterator;
35

    
36
import edu.ucsb.nceas.metacat.DBUtil;
37
import edu.ucsb.nceas.metacat.DocumentImpl;
38
import edu.ucsb.nceas.metacat.IdentifierManager;
39
import edu.ucsb.nceas.metacat.McdbDocNotFoundException;
40
import edu.ucsb.nceas.metacat.database.DBConnection;
41
import edu.ucsb.nceas.metacat.database.DBConnectionPool;
42
import edu.ucsb.nceas.metacat.dataone.MNodeService;
43
import edu.ucsb.nceas.metacat.properties.PropertyService;
44
import edu.ucsb.nceas.metacat.replication.ReplicationService;
45
import edu.ucsb.nceas.metacat.util.DocumentUtil;
46
import edu.ucsb.nceas.utilities.SortedProperties;
47

    
48
public class DatapackageSummarizer {
49

    
50
	private static Logger logMetacat = Logger.getLogger(DatapackageSummarizer.class);
51
	
52
	public static String rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
53
	public static String rdfs = "http://www.w3.org/2000/01/rdf-schema#";
54
	public static String owl = "http://www.w3.org/2002/07/owl#";
55
	public static String oboe = "http://ecoinformatics.org/oboe/oboe.1.0/oboe.owl#";
56
	public static String oboe_core = "http://ecoinformatics.org/oboe/oboe.1.0/oboe-core.owl#";
57
	public static String oboe_characteristics = "http://ecoinformatics.org/oboe/oboe.1.0/oboe-characteristics.owl#";
58
	public static String oa = "http://www.w3.org/ns/oa#";
59
	public static String oa_source = "http://www.w3.org/ns/oa.rdf";
60
	public static String dcterms = "http://purl.org/dc/terms/";
61
	public static String dcterms_source = "http://dublincore.org/2012/06/14/dcterms.rdf";
62
	public static String foaf = "http://xmlns.com/foaf/0.1/";
63
	public static String foaf_source = "http://xmlns.com/foaf/spec/index.rdf";
64
    public static String prov = "http://www.w3.org/ns/prov#";
65
    public static String prov_source = "http://www.w3.org/ns/prov.owl";
66
    public static String cito =  "http://purl.org/spar/cito/";
67
    
68
	public static String OBOE_SBC = "OBOE-SBC";
69
    
70
    // package visibility for testing only
71
    boolean randomize = false;
72

    
73
    /**
74
     * Generate annotation for given metadata identifier
75
     * @param metadataPid
76
     */
77
    public String generateAnnotation(Identifier metadataPid) throws Exception {
78
    	
79
    	DataPackage dataPackage = this.getDataPackage(metadataPid);
80
    	
81
		OntModel m = ModelFactory.createOntologyModel();
82
		Ontology ont = m.createOntology("http://annotation/" + metadataPid.getValue());
83
		
84
		// TODO: import the ontologies we use
85
		ont.addImport(m.createResource(oboe));
86
		m.addSubModel(ModelFactory.createOntologyModel().read(oboe));
87
		
88
		ont.addImport(m.createResource(oa));
89
		m.addSubModel(ModelFactory.createOntologyModel().read(oa_source));
90

    
91
		ont.addImport(m.createResource(dcterms));
92
		m.addSubModel(ModelFactory.createOntologyModel().read(dcterms_source));
93

    
94
		ont.addImport(m.createResource(foaf));
95
		m.addSubModel(ModelFactory.createOntologyModel().read(foaf_source));
96
		
97
		ont.addImport(m.createResource(prov));
98
		//m.addSubModel(ModelFactory.createOntologyModel().read(prov_source));
99

    
100
		ont.addImport(m.createResource(cito));
101
		
102
		// properties
103
		ObjectProperty hasBodyProperty = m.getObjectProperty(oa + "hasBody");
104
		ObjectProperty hasTargetProperty = m.getObjectProperty(oa + "hasTarget");
105
		ObjectProperty hasSourceProperty = m.getObjectProperty(oa + "hasSource");
106
		ObjectProperty hasSelectorProperty = m.getObjectProperty(oa + "hasSelector");
107
		ObjectProperty annotatedByProperty = m.getObjectProperty(oa + "annotatedBy");
108
		Property identifierProperty = m.getProperty(dcterms + "identifier");
109
		Property conformsToProperty = m.getProperty(dcterms + "conformsTo");
110
		Property nameProperty = m.getProperty(foaf + "name");
111
		Property rdfValue = m.getProperty(rdf + "value");
112
		
113
		ObjectProperty ofCharacteristic = m.getObjectProperty(oboe_core + "ofCharacteristic");
114
		ObjectProperty usesStandard = m.getObjectProperty(oboe_core + "usesStandard");
115

    
116
		// classes
117
		OntClass measurementClass =  m.getOntClass(oboe_core + "Measurement");
118
		OntClass characteristicClass = m.getOntClass(oboe_core + "Characteristic");
119
		OntClass standardClass =  m.getOntClass(oboe_core + "Standard");
120
		
121
		Resource annotationClass =  m.getOntClass(oa + "Annotation");
122
		Resource specificResourceClass =  m.getOntClass(oa + "SpecificResource");
123
		Resource fragmentSelectorClass =  m.getOntClass(oa + "FragmentSelector");
124
		Resource entityClass =  m.getResource(prov + "Entity");
125
		Resource personClass =  m.getResource(prov + "Person");
126
		
127
		// these apply to every attribute annotation
128
		Individual meta1 = m.createIndividual(ont.getURI() + "#meta", entityClass);
129
		meta1.addProperty(identifierProperty, metadataPid.getValue());
130

    
131
		// who should we attribute the annotation to?
132
		Individual p1 = m.createIndividual(ont.getURI() + "#person", personClass);
133
		
134
		// add an orcid annotation if we can find one from their system
135
		List<String> creators = dataPackage.getCreators();
136
		//creators = Arrays.asList("Matthew Jones");
137
		if (creators != null && creators.size() > 0) {
138
			p1.addProperty(nameProperty, creators.get(0));
139
			String orcidId = OrcidService.lookupOrcid(null, null, creators.toArray(new String[0]));
140
			if (orcidId != null) {
141
				p1.addProperty(identifierProperty, orcidId);
142
			}
143
		}
144
		
145
		// loop through the tables and attributes
146
		int entityCount = 1;
147
		Entity[] entities = dataPackage.getEntityList();
148
		for (Entity entity: entities) {
149
			String entityName = entity.getName();
150
			logMetacat.debug("Entity name: " + entityName);
151
			Attribute[] attributes = entity.getAttributeList().getAttributes();
152
			int attributeCount = 1;
153
			for (Attribute attribute: attributes) {
154
				
155
				// for naming the individuals uniquely
156
				String cnt = entityCount + "_" + attributeCount;
157
				
158
				String attributeName = attribute.getName();
159
				String attributeLabel = attribute.getLabel();
160
				String attributeDefinition = attribute.getDefinition();
161
				String attributeType = attribute.getAttributeType();
162
				String attributeScale = attribute.getMeasurementScale();
163
				String attributeUnitType = attribute.getUnitType();
164
				String attributeUnit = attribute.getUnit();
165
				String attributeDomain = attribute.getDomain().getClass().getSimpleName();
166

    
167
				logMetacat.debug("Attribute name: " + attributeName);
168
				logMetacat.debug("Attribute label: " + attributeLabel);
169
				logMetacat.debug("Attribute definition: " + attributeDefinition);
170
				logMetacat.debug("Attribute type: " + attributeType);
171
				logMetacat.debug("Attribute scale: " + attributeScale);
172
				logMetacat.debug("Attribute unit type: " + attributeUnitType);
173
				logMetacat.debug("Attribute unit: " + attributeUnit);
174
				logMetacat.debug("Attribute domain: " + attributeDomain);
175
			
176
				// look up the characteristic or standard subclasses
177
				Resource standard = this.lookupStandard(standardClass, attribute);
178
				Resource characteristic = this.lookupCharacteristic(characteristicClass, attribute);
179
				
180
				if (standard != null || characteristic != null) {
181
					
182
					// instances
183
					Individual m1 = m.createIndividual(ont.getURI() + "#measurement" + cnt, measurementClass);
184
					Individual a1 = m.createIndividual(ont.getURI() + "#annotation" + cnt, annotationClass);
185
					Individual t1 = m.createIndividual(ont.getURI() + "#target" + cnt, specificResourceClass);
186
					String xpointer = "xpointer(/eml/dataSet/" + entityCount + "/attributeList/" + attributeCount + ")";
187
					Individual s1 = m.createIndividual(ont.getURI() + "#" + xpointer, fragmentSelectorClass);
188
					s1.addLiteral(rdfValue, xpointer);
189
					s1.addProperty(conformsToProperty, "http://www.w3.org/TR/xptr/");
190
					
191
					// statements about the annotation
192
					a1.addProperty(hasBodyProperty, m1);
193
					a1.addProperty(hasTargetProperty, t1);
194
					t1.addProperty(hasSourceProperty, meta1);
195
					t1.addProperty(hasSelectorProperty, s1);
196
					a1.addProperty(annotatedByProperty, p1);
197
					
198
					// describe the measurement in terms of restrictions
199
					if (standard != null) {
200
						AllValuesFromRestriction avfr = m.createAllValuesFromRestriction(null, usesStandard, standard);
201
						m1.addOntClass(avfr);
202
					}
203
					if (characteristic != null) {
204
						AllValuesFromRestriction avfr = m.createAllValuesFromRestriction(null, ofCharacteristic, characteristic);
205
						m1.addOntClass(avfr);
206
					}
207
				}
208
				attributeCount++;
209
				
210
			}
211
			entityCount++;
212
		}
213
		
214
		StringWriter sw = new StringWriter();
215
		// only write the base model
216
		//m.write(sw, "RDF/XML-ABBREV");
217
		m.write(sw, null);
218

    
219
		return sw.toString();
220
		
221
	}
222
	
223
	private Resource lookupStandard(OntClass standardClass, Attribute attribute) {
224
		// what's our unit?
225
		String unit = attribute.getUnit().toLowerCase();
226
		List<String> tokens = Arrays.asList(unit.split(" "));
227

    
228
		boolean found = false;
229
		ExtendedIterator iter = standardClass.listSubClasses(false);
230
		if (randomize) {
231
			List subclasses = iter.toList();
232
			int size = subclasses.size();
233
			Long index = new Long(Math.round(Math.floor((Math.random() * (size-1)))));
234
			OntClass subclass = (OntClass) subclasses.get( index.intValue() );
235
			return subclass;
236
		}
237
		while (iter.hasNext()) {
238
			OntClass subclass = (OntClass) iter.next();
239
			String subclassName = subclass.getLocalName().toLowerCase();
240
			logMetacat.debug("subclass: " + subclassName);
241
			if (tokens.contains(subclassName)) {
242
				found = true;
243
			}
244
			if (subclass.hasLabel(unit, null)) {
245
				found = true;
246
			}
247
			if (found) {
248
				return subclass;
249
			}
250
		}
251
		// try to look it up if we got this far
252
		return BioPortalService.lookupAnnotationClass(standardClass, unit, OBOE_SBC);
253
	}
254
	
255
	private Resource lookupCharacteristic(OntClass characteristicClass, Attribute attribute) {
256
		// what's our label?
257
		String label = attribute.getLabel().toLowerCase();
258
		List<String> tokens = Arrays.asList(label.split(" "));
259
		
260
		boolean found = false;
261
		// find something that matches
262
		ExtendedIterator iter = characteristicClass.listSubClasses();
263
		if (randomize) {
264
			List subclasses = iter.toList();
265
			int size = subclasses.size();
266
			Long index = new Long(Math.round(Math.floor((Math.random() * (size-1)))));
267
			OntClass subclass = (OntClass) subclasses.get( index.intValue() );
268
			return subclass;
269
		}
270
		while (iter.hasNext()) {
271
			OntClass subclass = (OntClass) iter.next();
272
			String subclassName = subclass.getLocalName().toLowerCase();
273
			logMetacat.debug("subclass: " + subclassName);
274
			if (tokens.contains(subclassName)) {
275
				found = true;
276
			}
277
			if (subclass.hasLabel(label, null)) {
278
				found = true;
279
			}
280
			if (found) {
281
				return subclass;
282
			}
283
		}
284
		
285
		// try to look it up if we got this far
286
		return BioPortalService.lookupAnnotationClass(characteristicClass, attribute.getDefinition(), OBOE_SBC);
287
		
288
	}
289
	
290
	private DataPackage getDataPackage(Identifier pid) throws Exception {
291
		// for using the MN API as the MN itself
292
		MockHttpServletRequest request = new MockHttpServletRequest(null, null, null);
293
		Session session = new Session();
294
        Subject subject = MNodeService.getInstance(request).getCapabilities().getSubject(0);
295
        session.setSubject(subject);
296
		InputStream emlStream = MNodeService.getInstance(request).get(session, pid);
297

    
298
		// parse the metadata
299
		DataPackageParserInterface parser = new Eml200DataPackageParser();
300
		parser.parse(emlStream);
301
		DataPackage dataPackage = parser.getDataPackage();
302
		return dataPackage;
303
	}
304

    
305
	private void summarize(List<Identifier> identifiers) throws SQLException {
306
		
307
		DBConnection dbconn = null;
308

    
309
		try {
310
			dbconn = DBConnectionPool.getDBConnection("DatapackageSummarizer.summarize");
311
			
312
			PreparedStatement dropStatement = dbconn.prepareStatement("DROP TABLE IF EXISTS entity_summary");
313
			dropStatement.execute();
314
	
315
			PreparedStatement createStatement = dbconn.prepareStatement(
316
					"CREATE TABLE entity_summary (" +
317
					"guid text, " +
318
					"title text, " +
319
					"entity text," +
320
					"attributeName text," +
321
					"attributeLabel text," +
322
					"attributeDefinition text," +
323
					"attributeType text," +
324
					"attributeScale text," +
325
					"attributeUnitType text," +
326
					"attributeUnit text," +
327
					"attributeDomain text" +
328
					")");
329
			createStatement.execute();
330
			
331
			PreparedStatement insertStatement = dbconn.prepareStatement(
332
					"INSERT INTO entity_summary " +
333
					"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
334
			
335
			for (Identifier pid: identifiers) {
336
			
337
				logMetacat.debug("Parsing pid: " + pid.getValue());
338
				
339
				try {
340
					
341
					// get the package
342
					DataPackage dataPackage = this.getDataPackage(pid);
343
					String title = dataPackage.getTitle();
344
					logMetacat.debug("Title: " + title);
345
					
346
					Entity[] entities = dataPackage.getEntityList();
347
					if (entities != null) {
348
						for (Entity entity: entities) {
349
							String entityName = entity.getName();
350
							logMetacat.debug("Entity name: " + entityName);
351
							Attribute[] attributes = entity.getAttributeList().getAttributes();
352
							for (Attribute attribute: attributes) {
353
								String attributeName = attribute.getName();
354
								String attributeLabel = attribute.getLabel();
355
								String attributeDefinition = attribute.getDefinition();
356
								String attributeType = attribute.getAttributeType();
357
								String attributeScale = attribute.getMeasurementScale();
358
								String attributeUnitType = attribute.getUnitType();
359
								String attributeUnit = attribute.getUnit();
360
								String attributeDomain = attribute.getDomain().getClass().getSimpleName();
361
	
362
								logMetacat.debug("Attribute name: " + attributeName);
363
								logMetacat.debug("Attribute label: " + attributeLabel);
364
								logMetacat.debug("Attribute definition: " + attributeDefinition);
365
								logMetacat.debug("Attribute type: " + attributeType);
366
								logMetacat.debug("Attribute scale: " + attributeScale);
367
								logMetacat.debug("Attribute unit type: " + attributeUnitType);
368
								logMetacat.debug("Attribute unit: " + attributeUnit);
369
								logMetacat.debug("Attribute domain: " + attributeDomain);
370
								
371
								// set the values for this attribute
372
								insertStatement.setString(1, pid.getValue());
373
								insertStatement.setString(2, title);
374
								insertStatement.setString(3, entityName);
375
								insertStatement.setString(4, attributeName);
376
								insertStatement.setString(5, attributeLabel);
377
								insertStatement.setString(6, attributeDefinition);
378
								insertStatement.setString(7, attributeType);
379
								insertStatement.setString(8, attributeScale);
380
								insertStatement.setString(9, attributeUnitType);
381
								insertStatement.setString(10, attributeUnit);
382
								insertStatement.setString(11, attributeDomain);
383
								insertStatement.execute();
384
								
385
							}		
386
						}
387
					}
388
					
389
				} catch (Exception e) {
390
					logMetacat.warn("error parsing metadata for: " + pid.getValue(), e);
391
				}
392
			}
393
		} catch (SQLException sqle) {
394
			// just throw it
395
			throw sqle;
396
		} finally {
397
			if (dbconn != null) {
398
				DBConnectionPool.returnDBConnection(dbconn, 0);
399
				dbconn.close();
400
			}
401
		}
402
	}
403
	
404
	public static void main(String[] args) throws Exception {
405
		// set up the properties based on the test/deployed configuration of the workspace
406
			SortedProperties testProperties = new SortedProperties("test/test.properties");
407
			testProperties.load();
408
			String metacatContextDir = testProperties.getProperty("metacat.contextDir");
409
			PropertyService.getInstance(metacatContextDir + "/WEB-INF");
410
			
411
			testGenerate();
412
//			testSummary();
413
			System.exit(0);
414
	}
415
	
416
	public static void testGenerate() throws Exception {
417
		Identifier metadataPid = new Identifier();
418
		metadataPid.setValue("doi:10.5072/FK2445ZN4");
419
		DatapackageSummarizer ds = new DatapackageSummarizer();
420
		String rdfString = ds.generateAnnotation(metadataPid);
421
		logMetacat.info("RDF annotation: \n" + rdfString);
422
		
423
	}
424
	
425
	public static void testSummary() throws Exception {
426
		
427
		// summarize the packages
428
		DatapackageSummarizer ds = new DatapackageSummarizer();
429
		List<Identifier> identifiers = new ArrayList<Identifier>();
430
		Map<Integer, String> serverCodes = ReplicationService.getServerCodes();
431

    
432
		// select the metadata ids we want to summarize
433
		boolean includeReplicas = true;
434
		Iterator<Integer> codeIter = Arrays.asList(new Integer[] {1}).iterator();
435
		if (includeReplicas ) {
436
			codeIter = serverCodes.keySet().iterator();
437
		}
438
		
439
		Vector<String> idList = new Vector<String>();
440
		while (codeIter.hasNext()) {
441
			int serverLocation = codeIter.next();
442
			Vector<String> idList0 = DBUtil.getAllDocidsByType(DocumentImpl.EML2_0_0NAMESPACE, false, serverLocation);
443
			Vector<String> idList1 = DBUtil.getAllDocidsByType(DocumentImpl.EML2_0_1NAMESPACE, false, serverLocation);
444
			Vector<String> idList2 = DBUtil.getAllDocidsByType(DocumentImpl.EML2_1_0NAMESPACE, false, serverLocation);
445
			Vector<String> idList3 = DBUtil.getAllDocidsByType(DocumentImpl.EML2_1_1NAMESPACE, false, serverLocation);
446
			
447
			idList.addAll(idList0);
448
			idList.addAll(idList1);
449
			idList.addAll(idList2);
450
			idList.addAll(idList3);
451
		
452
		}
453
		
454
		// go through all the identifiers now
455
		for (String localId : idList) {
456
			try {
457
				String guid = IdentifierManager.getInstance().getGUID(
458
						DocumentUtil.getDocIdFromAccessionNumber(localId), 
459
						DocumentUtil.getRevisionFromAccessionNumber(localId));
460
				Identifier pid = new Identifier();
461
				pid.setValue(guid);
462
				identifiers.add(pid);
463
			} catch (McdbDocNotFoundException nfe) {
464
				// just skip it
465
				continue;
466
			}
467
		}
468
		ds.summarize(identifiers);
469
		System.exit(0);
470
	}
471
	
472
}
(2-2/3)