Revision 7608
Added by Jing Tao over 11 years ago
src/edu/ucsb/nceas/metacat/index/GenericIndex.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
|
|
3 |
import java.io.IOException; |
|
4 |
import java.util.Map; |
|
5 |
|
|
6 |
/**
 * Minimal contract for a document index: tie a document ID to a set of named,
 * multi-valued text fields, drop that association again, and search using
 * whatever query language the backing implementation understands.
 */
public interface GenericIndex {

    /**
     * Adds a document to the index.
     *
     * @param docID  identifier of the document
     * @param fields field name mapped to the values recorded under that name
     * @throws IOException if the backing index reports an I/O failure
     */
    void insert(String docID, Map<String, String[]> fields) throws IOException;

    /**
     * Replaces or refreshes the index entry for a document.
     *
     * @param docID  identifier of the document
     * @param fields field name mapped to the values recorded under that name
     * @throws IOException if the backing index reports an I/O failure
     */
    void update(String docID, Map<String, String[]> fields) throws IOException;

    /**
     * Drops a document from the index.
     *
     * @param docID identifier of the document
     * @throws IOException if the backing index reports an I/O failure
     */
    void remove(String docID) throws IOException;

    /**
     * Runs a query, expressed in the implementation's own query language.
     *
     * @param q the query text
     * @return the results as strings (the implementations in this package
     *         return document IDs)
     */
    String[] query(String q);
}
src/edu/ucsb/nceas/metacat/index/FieldSpec.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
|
|
3 |
import org.w3c.dom.Document; |
|
4 |
import sun.beans.editors.ByteEditor; |
|
5 |
|
|
6 |
import java.io.ByteArrayInputStream; |
|
7 |
import java.io.InputStream; |
|
8 |
import java.io.Reader; |
|
9 |
|
|
10 |
/**
 * Base type for one named index field. Concrete subclasses decide how the
 * values for the field are pulled out of a document.
 */
public abstract class FieldSpec {

    /** Name of the field; fixed at construction. */
    public final String name;

    /**
     * @param name the field name, stored exactly as given (no validation)
     */
    public FieldSpec(String name) {
        this.name = name;
    }

    /**
     * Reads a document from the supplied character stream and returns the
     * values found for this field.
     *
     * @param s source of the document text
     * @return the extracted values
     */
    public abstract String[] extract(final Reader s);
}
src/edu/ucsb/nceas/metacat/index/MetacatIndex.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
|
|
3 |
//import edu.ucsb.nceas.metacat.util.SystemUtil; |
|
4 |
|
|
5 |
import java.io.*; |
|
6 |
import java.util.*; |
|
7 |
|
|
8 |
/* |
|
9 |
* |
|
10 |
*/ |
|
11 |
|
|
12 |
public class MetacatIndex { |
|
13 |
// singleton, though it doesn't really need to be -- simplifies configuration a bit |
|
14 |
private static MetacatIndex instance = null; |
|
15 |
|
|
16 |
public MetacatIndex getInstance() { |
|
17 |
if (instance != null) { |
|
18 |
instance = new MetacatIndex(); |
|
19 |
} |
|
20 |
return instance; |
|
21 |
// return new MetacatIndex(); |
|
22 |
} |
|
23 |
|
|
24 |
public static final String MCIDFIELD = "metacat-id"; |
|
25 |
public static final String MCIndexName = "index"; |
|
26 |
protected GenericIndex index = null; |
|
27 |
protected String dataPath = null; |
|
28 |
protected List<FieldSpec> fields; |
|
29 |
// These are separated to more easily replicate the exact behavior of the DataONE indexer |
|
30 |
// but probably this is unnecessary and they can simply be agglomerated |
|
31 |
protected List<FieldSpec> d1SysFields, d1EmlFields, d1DryadFields, d1FgdcFields; |
|
32 |
|
|
33 |
private MetacatIndex() { |
|
34 |
this.dataPath = "/Users/brendan/metacat/"; //PropertyService.getProperty("application.datafilepath"); |
|
35 |
this.index = new SolrjIndex("http://localhost:8983"); //PropertyService.getProperty(""); |
|
36 |
this.d1SysFields = FieldDefReader.read(new File(this.dataPath + "d1sys")); |
|
37 |
this.d1EmlFields = FieldDefReader.read(new File(this.dataPath + "d1eml")); |
|
38 |
this.d1DryadFields = FieldDefReader.read(new File(this.dataPath + "d1dryad")); |
|
39 |
this.d1FgdcFields = FieldDefReader.read(new File(this.dataPath + "d1fgdc")); |
|
40 |
readMCIndexPaths(); |
|
41 |
} |
|
42 |
|
|
43 |
protected void readMCIndexPaths() { |
|
44 |
List<String> paths = new ArrayList<String>();//SystemUtil.getPathsforIndexing(); //nullable? |
|
45 |
for (String p : paths) { |
|
46 |
this.fields.add(new XpathIndexField("mcidx_" + p, "//" + p)); |
|
47 |
} |
|
48 |
} |
|
49 |
|
|
50 |
public void update(String docID, Reader doc) { |
|
51 |
Map<String, String[]> idx = new HashMap<String, String[]>(); |
|
52 |
// this stuff is pretty gross, but it's done to match exactly the behavior of the D1 |
|
53 |
// index processor. Probably isn't necessary and should be replaced by a generalized |
|
54 |
// dispatch by document/data type. |
|
55 |
if (DocType.isSysmeta(doc)) { |
|
56 |
idx.putAll(getFields(this.d1SysFields, doc)); |
|
57 |
if (DocType.isSyseml(doc)) { |
|
58 |
idx.putAll(getFields(this.d1EmlFields, doc)); |
|
59 |
} else if (DocType.isSysdryad(doc)) { |
|
60 |
idx.putAll(getFields(this.d1DryadFields, doc)); |
|
61 |
} else if (DocType.isSysfgdc(doc)) { |
|
62 |
idx.putAll(getFields(this.d1FgdcFields, doc)); |
|
63 |
} |
|
64 |
} else { |
|
65 |
if (DocType.isEml(doc)) { |
|
66 |
idx.putAll(getFields(this.d1EmlFields, doc)); |
|
67 |
} |
|
68 |
idx.putAll(getFields(this.fields, doc)); |
|
69 |
} |
|
70 |
try { |
|
71 |
index.update(docID, idx); |
|
72 |
} catch (IOException e) { |
|
73 |
// TODO: logs etc |
|
74 |
e.printStackTrace(); |
|
75 |
} |
|
76 |
} |
|
77 |
|
|
78 |
public void remove(String docID) { |
|
79 |
try { |
|
80 |
index.remove(docID); |
|
81 |
} catch (IOException e) { |
|
82 |
// TODO: logs etc |
|
83 |
e.printStackTrace(); |
|
84 |
} |
|
85 |
} |
|
86 |
|
|
87 |
public List<String> retrieve (String query) { |
|
88 |
String result[] = index.query(query); |
|
89 |
return new ArrayList<String>(Arrays.asList(result)); |
|
90 |
} |
|
91 |
|
|
92 |
protected Map<String, String[]> getFields(List<FieldSpec> fields, Reader doc) { |
|
93 |
Map<String, String[]> idx = new HashMap<String, String[]>(); |
|
94 |
for (FieldSpec fs : fields) { |
|
95 |
idx.put(fs.name, fs.extract(doc)); |
|
96 |
} |
|
97 |
return idx; |
|
98 |
} |
|
99 |
} |
src/edu/ucsb/nceas/metacat/index/FieldDefReader.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
|
|
3 |
import java.io.BufferedReader; |
|
4 |
import java.io.File; |
|
5 |
import java.io.FileReader; |
|
6 |
import java.io.IOException; |
|
7 |
import java.util.ArrayList; |
|
8 |
import java.util.List; |
|
9 |
|
|
10 |
/** |
|
11 |
* Read field definitions from a file. Replacement for Spring-based config used in DataONE. |
|
12 |
* Presently uses csv-style text which is primitive and fragile; should probably be some |
|
13 |
* sort of xml to be properly javaee-esque. It's possible to use Spring configuration info |
|
14 |
* directly, but the parsing and evaluation involved is non-trivial, even though the data |
|
15 |
* involved is really quite simple. Thus at present our config info has been manually |
|
16 |
* generated in this format, based on the DataONE indexer's definitions. |
|
17 |
* |
|
18 |
* Oh Java, when will you get a decent object syntax? |
|
19 |
*/ |
|
20 |
public class FieldDefReader { |
|
21 |
public static List<FieldSpec> read(File f) { |
|
22 |
ArrayList<FieldSpec> specs = new ArrayList<FieldSpec>(20); |
|
23 |
try { |
|
24 |
BufferedReader r = new BufferedReader(new FileReader(f)); |
|
25 |
String s; |
|
26 |
while ((s = r.readLine()) != null) { |
|
27 |
String[] def = s.split(" "); |
|
28 |
D1IndexField.DataFormat df = D1IndexField.DataFormat.SINGLE; |
|
29 |
if (def[2].equals("set")) df = D1IndexField.DataFormat.SET; |
|
30 |
else if (def[2].equals("multi")) df = D1IndexField.DataFormat.MULTISET; |
|
31 |
D1IndexField.Conversion dc = D1IndexField.Conversion.NONE; |
|
32 |
if (def[3].equals("date")) dc = D1IndexField.Conversion.DATE; |
|
33 |
else if (def[3].equals("fgdc")) dc = D1IndexField.Conversion.FGDCDATE; |
|
34 |
else if (def[3].equals("lat")) dc = D1IndexField.Conversion.LATITUDE; |
|
35 |
else if (def[3].equals("lon")) dc = D1IndexField.Conversion.LONGITUDE; |
|
36 |
specs.add(new D1IndexField(def[0], def[1], df, dc)); |
|
37 |
} |
|
38 |
r.close(); |
|
39 |
} catch (IOException e) { |
|
40 |
// TODO: logs |
|
41 |
e.printStackTrace(); |
|
42 |
} |
|
43 |
return specs; |
|
44 |
} |
|
45 |
} |
src/edu/ucsb/nceas/metacat/index/LuceneIndex.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
|
|
3 |
import org.apache.lucene.analysis.standard.StandardAnalyzer; |
|
4 |
import org.apache.lucene.document.Document; |
|
5 |
import org.apache.lucene.document.Field; |
|
6 |
import org.apache.lucene.document.StringField; |
|
7 |
import org.apache.lucene.document.TextField; |
|
8 |
import org.apache.lucene.index.*; |
|
9 |
import org.apache.lucene.queryparser.classic.ParseException; |
|
10 |
import org.apache.lucene.queryparser.classic.QueryParser; |
|
11 |
import org.apache.lucene.search.IndexSearcher; |
|
12 |
import org.apache.lucene.search.ScoreDoc; |
|
13 |
import org.apache.lucene.search.TopDocs; |
|
14 |
import org.apache.lucene.store.Directory; |
|
15 |
import org.apache.lucene.store.FSDirectory; |
|
16 |
import org.apache.lucene.util.Version; |
|
17 |
|
|
18 |
import java.io.File; |
|
19 |
import java.io.IOException; |
|
20 |
import java.util.Map; |
|
21 |
|
|
22 |
public class LuceneIndex implements GenericIndex { |
|
23 |
protected final StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_41); |
|
24 |
protected final IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_41, analyzer); |
|
25 |
protected final QueryParser parser = new QueryParser(Version.LUCENE_41, MetacatIndex.MCIDFIELD, analyzer); |
|
26 |
protected IndexWriter writer = null; |
|
27 |
protected DirectoryReader reader = null; |
|
28 |
protected Directory index = null; |
|
29 |
|
|
30 |
public LuceneIndex(File fsIndex) { |
|
31 |
try { |
|
32 |
this.index = FSDirectory.open(fsIndex); |
|
33 |
this.cfg.setOpenMode(IndexWriterConfig.OpenMode.APPEND); |
|
34 |
this.writer = new IndexWriter(this.index, this.cfg); |
|
35 |
this.reader = DirectoryReader.open(this.index); |
|
36 |
} catch (Exception e) { |
|
37 |
e.printStackTrace(); |
|
38 |
} |
|
39 |
} |
|
40 |
|
|
41 |
public void insert(String docID, Map<String, String[]> fields) throws IOException { |
|
42 |
this.update(docID, fields); |
|
43 |
} |
|
44 |
|
|
45 |
public void update(String docID, Map<String, String[]> fields) throws IOException { |
|
46 |
Document d = new Document(); |
|
47 |
d.add(new StringField(MetacatIndex.MCIDFIELD, docID, Field.Store.YES)); |
|
48 |
for (String k : fields.keySet()) { |
|
49 |
for (String v : fields.get(k)) { |
|
50 |
d.add(new TextField(k, v, Field.Store.NO)); |
|
51 |
} |
|
52 |
} |
|
53 |
this.writer.addDocument(d); |
|
54 |
this.writer.commit(); |
|
55 |
} |
|
56 |
|
|
57 |
public void remove(String docID) throws IOException { |
|
58 |
this.writer.deleteDocuments(new Term(MetacatIndex.MCIDFIELD, docID)); |
|
59 |
this.writer.commit(); |
|
60 |
} |
|
61 |
|
|
62 |
public String[] query(String q) { |
|
63 |
try { |
|
64 |
DirectoryReader newReader = DirectoryReader.openIfChanged(this.reader); |
|
65 |
if (newReader != null) { // not sure if this is right... |
|
66 |
this.reader.close(); |
|
67 |
this.reader = newReader; |
|
68 |
} |
|
69 |
IndexSearcher searcher = new IndexSearcher(this.reader); |
|
70 |
TopDocs docs = searcher.search(this.parser.parse(q), 100); |
|
71 |
String docIDs[] = new String[docs.scoreDocs.length]; |
|
72 |
int i = 0; |
|
73 |
for (ScoreDoc sd : docs.scoreDocs) { |
|
74 |
Document d = searcher.doc(sd.doc); |
|
75 |
docIDs[i++] = d.getField(MetacatIndex.MCIDFIELD).toString(); |
|
76 |
} |
|
77 |
return docIDs; |
|
78 |
} catch (IOException e) { |
|
79 |
e.printStackTrace(); |
|
80 |
} catch (ParseException e) { |
|
81 |
e.printStackTrace(); |
|
82 |
} |
|
83 |
return new String[0]; |
|
84 |
} |
|
85 |
} |
src/edu/ucsb/nceas/metacat/index/MCIndexDocDef.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
|
|
3 |
import java.util.*; |
|
4 |
|
|
5 |
/* |
|
6 |
* Schema for document indexing. The Lucene/SOLR model of a 'document' is a bag of named |
|
7 |
* text objects. |
|
8 |
*/ |
|
9 |
public abstract class MCIndexDocDef { |
|
10 |
protected Set<FieldSpec> fields; |
|
11 |
|
|
12 |
// do we really want this here? I guess this is the whole question..starting to think not. |
|
13 |
public abstract boolean applies(); |
|
14 |
|
|
15 |
|
|
16 |
public MCIndexDocDef(Collection<FieldSpec> fields) { |
|
17 |
this.fields = new HashSet<FieldSpec>(fields); |
|
18 |
} |
|
19 |
|
|
20 |
public void add(FieldSpec f) { |
|
21 |
this.fields.add(f); |
|
22 |
} |
|
23 |
|
|
24 |
public void add(Collection<FieldSpec> f) { |
|
25 |
this.fields.addAll(f); |
|
26 |
} |
|
27 |
|
|
28 |
public void remove(FieldSpec f) { |
|
29 |
this.fields.remove(f); |
|
30 |
} |
|
31 |
|
|
32 |
public void remove(Collection<FieldSpec> f) { |
|
33 |
this.fields.removeAll(f); |
|
34 |
} |
|
35 |
} |
src/edu/ucsb/nceas/metacat/index/MCXmlNamespace.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
|
|
3 |
import javax.xml.XMLConstants; |
|
4 |
import javax.xml.namespace.NamespaceContext; |
|
5 |
import java.util.HashMap; |
|
6 |
import java.util.Iterator; |
|
7 |
import java.util.Map; |
|
8 |
|
|
9 |
/**
 * NamespaceContext that binds the fixed set of prefixes used by this package's
 * XPath expressions (EML versions, DataONE types, ORE, Dublin Core, FOAF).
 */
public class MCXmlNamespace implements NamespaceContext {
    // Namespace prefixes for use in XPath expressions
    // The literal values assigned here should be distinct but are not significant -- they
    // may equally be "a" "b" "c" etc as long as the symbolic names are used consistently
    public static final String E200 = "e200";
    public static final String E211 = "e211";
    public static final String E210 = "e210";
    public static final String E201 = "e201";
    public static final String D1 = "d1";
    public static final String ORE = "ore";
    public static final String DC = "dc";
    public static final String DCT = "dct";
    public static final String FOAF = "foaf";

    /** Prefix to namespace URI; filled once in the static initializer. */
    private static final Map<String, String> prefixes;
    static {
        prefixes = new HashMap<String, String>();
        prefixes.put(E200, "eml://ecoinformatics.org/eml-2.0.0");
        prefixes.put(E201, "eml://ecoinformatics.org/eml-2.0.1");
        prefixes.put(E210, "eml://ecoinformatics.org/eml-2.1.0");
        prefixes.put(E211, "eml://ecoinformatics.org/eml-2.1.1");
        prefixes.put(D1, "http://ns.dataone.org/service/types/v1");
        prefixes.put(ORE, "http://www.openarchives.org/ore/terms/");
        prefixes.put(DC, "http://purl.org/dc/elements/1.1/");
        prefixes.put(DCT, "http://purl.org/dc/terms/");
        prefixes.put(FOAF, "http://xmlns.com/foaf/0.1/");
    }

    /**
     * Resolves a prefix to its namespace URI.
     *
     * @param prefix the prefix to look up
     * @return the bound URI; the reserved URIs for "xml" and "xmlns"; or
     *         {@link XMLConstants#NULL_NS_URI} for an unbound prefix
     * @throws IllegalArgumentException if prefix is null, as the
     *         NamespaceContext contract requires
     */
    @Override
    public String getNamespaceURI(String prefix) {
        if (prefix == null) {
            throw new IllegalArgumentException("prefix must not be null");
        }
        if (XMLConstants.XML_NS_PREFIX.equals(prefix)) {
            return XMLConstants.XML_NS_URI;
        }
        if (XMLConstants.XMLNS_ATTRIBUTE.equals(prefix)) {
            return XMLConstants.XMLNS_ATTRIBUTE_NS_URI;
        }
        String ns = prefixes.get(prefix);
        return (ns != null) ? ns : XMLConstants.NULL_NS_URI;
    }

    /**
     * Reverse lookup of a namespace URI.
     *
     * @param namespaceURI the URI to look up
     * @return a prefix bound to the URI, or null if none is bound
     * @throws IllegalArgumentException if namespaceURI is null
     */
    @Override
    public String getPrefix(String namespaceURI) {
        Iterator<?> found = getPrefixes(namespaceURI);
        return found.hasNext() ? (String) found.next() : null;
    }

    /**
     * Lists every prefix bound to a namespace URI.
     *
     * The return type stays raw so the override compiles against both the
     * pre-generics and the generified declaration of this interface method.
     *
     * @param namespaceURI the URI to look up
     * @return a read-only iterator over the matching prefixes (possibly empty)
     * @throws IllegalArgumentException if namespaceURI is null
     */
    @Override
    @SuppressWarnings("rawtypes")
    public Iterator getPrefixes(String namespaceURI) {
        if (namespaceURI == null) {
            throw new IllegalArgumentException("namespaceURI must not be null");
        }
        Map<String, String> matches = new HashMap<String, String>();
        if (XMLConstants.XML_NS_URI.equals(namespaceURI)) {
            matches.put(XMLConstants.XML_NS_PREFIX, namespaceURI);
        } else if (XMLConstants.XMLNS_ATTRIBUTE_NS_URI.equals(namespaceURI)) {
            matches.put(XMLConstants.XMLNS_ATTRIBUTE, namespaceURI);
        } else {
            for (Map.Entry<String, String> binding : prefixes.entrySet()) {
                if (binding.getValue().equals(namespaceURI)) {
                    matches.put(binding.getKey(), namespaceURI);
                }
            }
        }
        // Wrapped so Iterator.remove() is rejected, as the contract requires.
        return java.util.Collections.unmodifiableSet(matches.keySet()).iterator();
    }
}
|
52 |
|
src/edu/ucsb/nceas/metacat/index/DocType.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
|
|
3 |
import org.xml.sax.InputSource; |
|
4 |
|
|
5 |
import javax.xml.xpath.*; |
|
6 |
import java.io.Reader; |
|
7 |
import java.util.ArrayList; |
|
8 |
import java.util.List; |
|
9 |
|
|
10 |
/** |
|
11 |
* Identify document classes for indexing. Replaces equivalent Spring-based configuration |
|
12 |
* used by DataONE indexer. It's not clear that we actually need to perfectly mimic this |
|
13 |
* stuff, but for the now we shall. Note that the replicated DataONE classifications |
|
14 |
* apply only to DataONE System Metadata documents. Additional classifiers have been |
|
15 |
* added for plain EML etc. |
|
16 |
* |
|
17 |
* There are better ways to configure this stuff, but this is in effect a direct transcript |
|
18 |
* of the DataONE material. Automatic extraction from Spring configuration is doable, but |
|
19 |
* not trivial. |
|
20 |
*/ |
|
21 |
public class DocType { |
|
22 |
// |
|
23 |
private static final XPath xpath = XPathFactory.newInstance().newXPath(); |
|
24 |
private static final List<XPathExpression> d1sys = new ArrayList<XPathExpression>(1); |
|
25 |
private static final List<XPathExpression> d1eml = new ArrayList<XPathExpression>(4); |
|
26 |
private static final List<XPathExpression> d1dryad = new ArrayList<XPathExpression>(1); |
|
27 |
private static final List<XPathExpression> d1fgdc = new ArrayList<XPathExpression>(3); |
|
28 |
private static final List<XPathExpression> eml = new ArrayList<XPathExpression>(4); |
|
29 |
static { |
|
30 |
xpath.setNamespaceContext(new MCXmlNamespace()); |
|
31 |
|
|
32 |
try { |
|
33 |
d1sys.add(xpath.compile("/" + MCXmlNamespace.D1 + ":systemMetadata")); |
|
34 |
|
|
35 |
d1eml.add(xpath.compile("/" + MCXmlNamespace.D1 + ":systemMetadata/formatId[text() = 'eml://ecoinformatics.org/eml-2.0.0']")); |
|
36 |
d1eml.add(xpath.compile("/" + MCXmlNamespace.D1 + ":systemMetadata/formatId[text() = 'eml://ecoinformatics.org/eml-2.0.1']")); |
|
37 |
d1eml.add(xpath.compile("/" + MCXmlNamespace.D1 + ":systemMetadata/formatId[text() = 'eml://ecoinformatics.org/eml-2.1.1']")); |
|
38 |
d1eml.add(xpath.compile("/" + MCXmlNamespace.D1 + ":systemMetadata/formatId[text() = 'eml://ecoinformatics.org/eml-2.1.1']")); |
|
39 |
|
|
40 |
d1dryad.add(xpath.compile("/" + MCXmlNamespace.D1 + ":systemMetadata/formatId[text() = 'http://purl.org/dryad/terms/']")); |
|
41 |
|
|
42 |
d1fgdc.add(xpath.compile("/" + MCXmlNamespace.D1 + ":systemMetadata/formatId[text() = 'FGDC-STD-001-1998']")); |
|
43 |
d1fgdc.add(xpath.compile("/" + MCXmlNamespace.D1 + ":systemMetadata/formatId[text() = 'FGDC-STD-001.1-1999']")); |
|
44 |
d1fgdc.add(xpath.compile("/" + MCXmlNamespace.D1 + ":systemMetadata/formatId[text() = 'FGDC-STD-001.2-1999']")); |
|
45 |
|
|
46 |
eml.add(xpath.compile("/" + MCXmlNamespace.E200 + ":eml")); |
|
47 |
eml.add(xpath.compile("/" + MCXmlNamespace.E201 + ":eml")); |
|
48 |
eml.add(xpath.compile("/" + MCXmlNamespace.E210 + ":eml")); |
|
49 |
eml.add(xpath.compile("/" + MCXmlNamespace.E211 + ":eml")); |
|
50 |
} catch (XPathExpressionException e) { |
|
51 |
// TODO: logs |
|
52 |
e.printStackTrace(); |
|
53 |
} |
|
54 |
} |
|
55 |
public static boolean isSysmeta(Reader in) { |
|
56 |
return check(d1sys, in); |
|
57 |
} |
|
58 |
public static boolean isSyseml(Reader in) { |
|
59 |
return check(d1eml, in); |
|
60 |
} |
|
61 |
public static boolean isSysdryad(Reader in) { |
|
62 |
return check(d1dryad, in); |
|
63 |
} |
|
64 |
public static boolean isSysfgdc(Reader in) { |
|
65 |
return check(d1fgdc, in); |
|
66 |
} |
|
67 |
public static boolean isEml(Reader in) { |
|
68 |
return check(eml, in); |
|
69 |
} |
|
70 |
|
|
71 |
public static boolean check(List<XPathExpression> exprs, Reader in) { |
|
72 |
InputSource src = new InputSource(in); |
|
73 |
try { |
|
74 |
for (XPathExpression x : exprs) { |
|
75 |
Boolean match = (Boolean) x.evaluate(src, XPathConstants.BOOLEAN); |
|
76 |
if (match != null && match.booleanValue()) { |
|
77 |
return true; |
|
78 |
} |
|
79 |
} |
|
80 |
} catch (XPathExpressionException e) { |
|
81 |
} |
|
82 |
return false; |
|
83 |
} |
|
84 |
} |
src/edu/ucsb/nceas/metacat/index/D1IndexField.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
import org.dataone.cn.indexer.convert.*; |
|
3 |
import org.dataone.cn.indexer.parser.SolrField; |
|
4 |
import org.dataone.cn.indexer.solrhttp.SolrElementField; |
|
5 |
import org.w3c.dom.Document; |
|
6 |
import org.xml.sax.InputSource; |
|
7 |
import org.xml.sax.SAXException; |
|
8 |
|
|
9 |
import javax.xml.parsers.DocumentBuilder; |
|
10 |
import javax.xml.parsers.DocumentBuilderFactory; |
|
11 |
import javax.xml.parsers.ParserConfigurationException; |
|
12 |
import javax.xml.xpath.XPath; |
|
13 |
import javax.xml.xpath.XPathExpressionException; |
|
14 |
import javax.xml.xpath.XPathFactory; |
|
15 |
import java.io.IOException; |
|
16 |
import java.io.Reader; |
|
17 |
import java.util.List; |
|
18 |
|
|
19 |
/* |
|
20 |
* Wrapper to use DataONE indexer's field-processing code for extraction. |
|
21 |
* |
|
22 |
*/ |
|
23 |
|
|
24 |
public class D1IndexField extends FieldSpec { |
|
25 |
// man Java enums are awful |
|
26 |
public enum DataFormat { SINGLE, SET, MULTISET } |
|
27 |
public enum Conversion { NONE, DATE, FGDCDATE, LATITUDE, LONGITUDE, FORMAT } |
|
28 |
|
|
29 |
private static DocumentBuilder docBuilder = null; |
|
30 |
private static final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); |
|
31 |
private static final XPath xpath = XPathFactory.newInstance().newXPath(); |
|
32 |
static { |
|
33 |
dbf.setNamespaceAware(true); |
|
34 |
|
|
35 |
try { |
|
36 |
docBuilder = dbf.newDocumentBuilder(); |
|
37 |
} catch (ParserConfigurationException e) { |
|
38 |
e.printStackTrace(); |
|
39 |
} |
|
40 |
xpath.setNamespaceContext(new MCXmlNamespace()); |
|
41 |
} |
|
42 |
|
|
43 |
private SolrField d1SolrField; |
|
44 |
|
|
45 |
public D1IndexField(String name, String xp, DataFormat d, Conversion c) { |
|
46 |
super(name); |
|
47 |
|
|
48 |
IConverter conv = null; |
|
49 |
switch (c) { |
|
50 |
case NONE: conv = null; break; |
|
51 |
case DATE: conv = new SolrDateConverter(); break; |
|
52 |
case FGDCDATE: conv = new FgdcDateConverter(); break; |
|
53 |
case LATITUDE: conv = new SolrLatitudeConverter(); break; |
|
54 |
case LONGITUDE: conv = new SolrLongitudeConverter(); break; |
|
55 |
case FORMAT: conv = new FormatIdToFormatTypeConverter(); break; |
|
56 |
} |
|
57 |
this.d1SolrField = new SolrField(name, xp, d != DataFormat.SINGLE, conv); |
|
58 |
if (d == DataFormat.SINGLE) { |
|
59 |
this.d1SolrField.setCombineNodes(true); |
|
60 |
} |
|
61 |
else if (d == DataFormat.SET) { |
|
62 |
this.d1SolrField.setDedupe(true); |
|
63 |
} |
|
64 |
|
|
65 |
this.d1SolrField.initExpression(D1IndexField.xpath); |
|
66 |
} |
|
67 |
|
|
68 |
@Override |
|
69 |
public String[] extract(final Reader in) { |
|
70 |
List<SolrElementField> fieldList = null; |
|
71 |
try { |
|
72 |
// TODO: encodings |
|
73 |
Document doc = docBuilder.parse(new InputSource(in)); |
|
74 |
fieldList = this.solrFields(doc); |
|
75 |
} catch (Exception e) { |
|
76 |
e.printStackTrace(); |
|
77 |
} |
|
78 |
if (fieldList != null) { |
|
79 |
String[] fields = new String[fieldList.size()]; |
|
80 |
int i = 0; |
|
81 |
for (SolrElementField f : fieldList) { |
|
82 |
fields[i++] = f.getValue(); |
|
83 |
} |
|
84 |
return fields; |
|
85 |
} else { |
|
86 |
return new String[0]; |
|
87 |
} |
|
88 |
} |
|
89 |
|
|
90 |
// convenience method for use with DataONE SolrDoc |
|
91 |
public List<SolrElementField> solrFields(final Document doc) |
|
92 |
throws XPathExpressionException, IOException, ParserConfigurationException, SAXException { |
|
93 |
return this.d1SolrField.processField(doc); |
|
94 |
} |
|
95 |
} |
src/edu/ucsb/nceas/metacat/index/Embedded.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
|
|
3 |
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; |
|
4 |
import org.apache.solr.core.CoreContainer; |
|
5 |
|
|
6 |
public class Embedded extends SolrjIndex { |
|
7 |
Embedded() { |
|
8 |
super(""); |
|
9 |
// this is just for experiment; proper configuration will be different |
|
10 |
System.setProperty("solr.solr.home", "/Users/brendan/solrhome/"); |
|
11 |
CoreContainer.Initializer init = new CoreContainer.Initializer(); |
|
12 |
CoreContainer c = init.initialize(); |
|
13 |
solr = new EmbeddedSolrServer(c, "mc-core"); |
|
14 |
} |
|
15 |
} |
src/edu/ucsb/nceas/metacat/index/XpathIndexField.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
|
|
3 |
import java.io.Reader; |
|
4 |
|
|
5 |
import edu.ucsb.nceas.metacat.index.D1IndexField.DataFormat; |
|
6 |
import edu.ucsb.nceas.metacat.index.D1IndexField.Conversion; |
|
7 |
|
|
8 |
/* |
|
9 |
* Simple xpath-based indexing. Accepts DOM-parseable data, extracts text data from all |
|
10 |
* nodes selected by an XPath expression concatenated into a single Lucene/SOLR field |
|
11 |
* body. Equivalent to the DataONE indexer's MergeSolrField bean. |
|
12 |
* |
|
13 |
* Currently just wraps DataONE SolrField code. |
|
14 |
*/ |
|
15 |
|
|
16 |
public class XpathIndexField extends FieldSpec { |
|
17 |
private static final String textSelector = "text()"; |
|
18 |
public final String xpath; |
|
19 |
|
|
20 |
public XpathIndexField(String name, String xp) { |
|
21 |
super(name); |
|
22 |
this.xpath = xp; |
|
23 |
} |
|
24 |
|
|
25 |
@Override |
|
26 |
public String[] extract(Reader in) { |
|
27 |
String s = this.xpath; |
|
28 |
if (!this.xpath.endsWith(textSelector)) { |
|
29 |
if (this.xpath.charAt(this.xpath.length()-1) != '/') { |
|
30 |
s = s + "/"; |
|
31 |
} |
|
32 |
s = s + textSelector; |
|
33 |
} |
|
34 |
|
|
35 |
D1IndexField field = new D1IndexField(this.name, s, DataFormat.SINGLE, Conversion.NONE); |
|
36 |
return field.extract(in); |
|
37 |
} |
|
38 |
} |
src/edu/ucsb/nceas/metacat/index/D1Index.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
import org.springframework.http.client.HttpComponentsClientHttpRequestFactory; |
|
3 |
|
|
4 |
import org.apache.http.impl.client.DefaultHttpClient; |
|
5 |
import org.dataone.cn.indexer.solrhttp.HTTPService; |
|
6 |
import org.dataone.cn.indexer.solrhttp.SolrDoc; |
|
7 |
import org.dataone.cn.indexer.solrhttp.SolrElementAdd; |
|
8 |
import org.dataone.cn.indexer.solrhttp.SolrElementField; |
|
9 |
|
|
10 |
import java.io.IOException; |
|
11 |
import java.util.ArrayList; |
|
12 |
import java.util.List; |
|
13 |
import java.util.Map; |
|
14 |
|
|
15 |
public class D1Index implements GenericIndex { |
|
16 |
protected final DefaultHttpClient client = new DefaultHttpClient(); |
|
17 |
protected final HTTPService solrSvc = |
|
18 |
new HTTPService(new HttpComponentsClientHttpRequestFactory(this.client)); |
|
19 |
protected String uri = ""; |
|
20 |
|
|
21 |
D1Index(String uri) { |
|
22 |
this.uri = uri; |
|
23 |
} |
|
24 |
|
|
25 |
public void insert(String docID, Map<String, String[]> fields) throws IOException { |
|
26 |
this.update(docID, fields); |
|
27 |
} |
|
28 |
|
|
29 |
public void update(String docID, Map<String, String[]> fields) throws IOException { |
|
30 |
SolrDoc doc = new SolrDoc(); |
|
31 |
// this works for our purposes, but violates DataONE expectations |
|
32 |
doc.addField(new SolrElementField(SolrElementField.FIELD_ID, docID)); |
|
33 |
for (String k : fields.keySet()) { |
|
34 |
for (String v : fields.get(k)) { |
|
35 |
doc.addField(new SolrElementField(k, v)); |
|
36 |
} |
|
37 |
} |
|
38 |
List<SolrDoc> docList = new ArrayList<SolrDoc>(1); |
|
39 |
docList.add(doc); |
|
40 |
solrSvc.sendUpdate(this.uri, new SolrElementAdd(docList)); |
|
41 |
} |
|
42 |
|
|
43 |
public void remove(String docID) { |
|
44 |
solrSvc.sendSolrDelete(docID); |
|
45 |
} |
|
46 |
|
|
47 |
public String[] query(String q) throws IndexOutOfBoundsException { |
|
48 |
// the indexer's HttpService class doesn't provide general querying -- it's |
|
49 |
// an indexer only. Query handling is done by separate code in a different |
|
50 |
// part of the D1 architecture. That code is rather tightly bound to other |
|
51 |
// D1 elements and consequently challenging to integrate with Metacat. |
|
52 |
// For this and other reasons I feel that the other index interfaces are |
|
53 |
// a better choice for Metacat, so as yet this remains incomplete. It is |
|
54 |
// however possible to continue down this path. |
|
55 |
return new String[0]; |
|
56 |
} |
|
57 |
} |
src/edu/ucsb/nceas/metacat/index/SolrjIndex.java | ||
---|---|---|
1 |
package edu.ucsb.nceas.metacat.index; |
|
2 |
|
|
3 |
import org.apache.solr.client.solrj.SolrQuery; |
|
4 |
import org.apache.solr.client.solrj.SolrServer; |
|
5 |
import org.apache.solr.client.solrj.SolrServerException; |
|
6 |
import org.apache.solr.client.solrj.impl.HttpSolrServer; |
|
7 |
import org.apache.solr.client.solrj.response.QueryResponse; |
|
8 |
import org.apache.solr.common.SolrDocument; |
|
9 |
import org.apache.solr.common.SolrDocumentList; |
|
10 |
import org.apache.solr.common.SolrInputDocument; |
|
11 |
|
|
12 |
import java.io.IOException; |
|
13 |
import java.util.Map; |
|
14 |
|
|
15 |
public class SolrjIndex implements GenericIndex { |
|
16 |
protected SolrServer solr = null; |
|
17 |
|
|
18 |
SolrjIndex(String uri) { |
|
19 |
this.solr = new HttpSolrServer(uri); |
|
20 |
} |
|
21 |
|
|
22 |
public void insert(String docID, Map<String, String[]> fields) throws IOException { |
|
23 |
this.update(docID, fields); |
|
24 |
} |
|
25 |
|
|
26 |
public void update(String docID, Map<String, String[]> fields) throws IOException { |
|
27 |
SolrInputDocument doc = new SolrInputDocument(); |
|
28 |
doc.addField(MetacatIndex.MCIDFIELD, docID); |
|
29 |
for (String k : fields.keySet()) { |
|
30 |
for (String v : fields.get(k)) { |
|
31 |
doc.addField(k, v); |
|
32 |
} |
|
33 |
} |
|
34 |
try { |
|
35 |
this.solr.add(doc); |
|
36 |
} catch (SolrServerException e) { |
|
37 |
e.printStackTrace(); |
|
38 |
} |
|
39 |
} |
|
40 |
|
|
41 |
public void remove(String docID) throws IOException { |
|
42 |
try { |
|
43 |
solr.deleteByQuery(MetacatIndex.MCIDFIELD + ":" + docID); |
|
44 |
} catch (SolrServerException e) { |
|
45 |
// TODO: handling |
|
46 |
e.printStackTrace(); |
|
47 |
} |
|
48 |
} |
|
49 |
|
|
50 |
public String[] query(String q) { |
|
51 |
SolrQuery sq = new SolrQuery(q); |
|
52 |
QueryResponse rsp = null; |
|
53 |
try { |
|
54 |
rsp = solr.query(sq); |
|
55 |
} catch (SolrServerException e) { |
|
56 |
// TODO: handling |
|
57 |
e.printStackTrace(); |
|
58 |
} |
|
59 |
SolrDocumentList docs = rsp.getResults(); |
|
60 |
String[] docIDs = new String[docs.size()]; |
|
61 |
int i = 0; |
|
62 |
for (SolrDocument d : docs) { |
|
63 |
docIDs[i++] = d.getFieldValue(MetacatIndex.MCIDFIELD).toString(); |
|
64 |
} |
|
65 |
return docIDs; |
|
66 |
} |
|
67 |
} |
Also available in: Unified diff
Remove those obsolete index classes.