Project

General

Profile

« Previous | Next » 

Revision 7546

Added by Jing Tao over 11 years ago

Add insert method for SolrIndex.

View differences:

metacat-index/src/main/java/edu/ucsb/nceas/metacat/index/SolrIndex.java
26 26
 */
27 27
package edu.ucsb.nceas.metacat.index;
28 28

  
29
import java.io.IOException;
30
import java.io.InputStream;
31
import java.util.ArrayList;
32
import java.util.HashMap;
33
import java.util.Iterator;
29 34
import java.util.List;
35
import java.util.Map;
36
import java.util.Set;
30 37

  
31 38
import javax.xml.parsers.DocumentBuilder;
32 39
import javax.xml.parsers.DocumentBuilderFactory;
40
import javax.xml.parsers.ParserConfigurationException;
33 41
import javax.xml.xpath.XPath;
42
import javax.xml.xpath.XPathExpressionException;
34 43
import javax.xml.xpath.XPathFactory;
35 44

  
45
import org.apache.commons.codec.EncoderException;
46
import org.apache.commons.io.output.ByteArrayOutputStream;
47
import org.apache.commons.logging.Log;
48
import org.apache.commons.logging.LogFactory;
36 49
import org.apache.solr.client.solrj.SolrServer;
50
import org.apache.solr.client.solrj.SolrServerException;
51
import org.apache.solr.common.SolrInputDocument;
37 52
import org.dataone.cn.indexer.XMLNamespaceConfig;
53
import org.dataone.cn.indexer.XPathDocumentParser;
38 54
import org.dataone.cn.indexer.parser.IDocumentSubprocessor;
55
import org.dataone.cn.indexer.parser.SolrField;
56
import org.dataone.cn.indexer.solrhttp.SolrDoc;
57
import org.dataone.cn.indexer.solrhttp.SolrElementAdd;
58
import org.dataone.cn.indexer.solrhttp.SolrElementField;
59
import org.w3c.dom.Document;
60
import org.xml.sax.SAXException;
39 61

  
40 62
/**
41 63
 * A class that inserts, updates and removes indexes on a SOLR server
......
43 65
 *
44 66
 */
45 67
public class SolrIndex {
68
    
69
    //private static final String INPUT_ENCODING = "UTF-8";
70
    
46 71
    private List<IDocumentSubprocessor> subprocessors = null;
47 72
    private SolrServer solrServer = null;
48 73
    private XMLNamespaceConfig xmlNamespaceConfig = null;
74
    private List<SolrField> sysmetaSolrFields = null;
49 75

  
50 76
    private static DocumentBuilderFactory documentBuilderFactory = null;
51 77
    private static DocumentBuilder builder = null;
52 78

  
53 79
    private static XPathFactory xpathFactory = null;
54 80
    private static XPath xpath = null;
81
    Log log = LogFactory.getLog(SolrIndex.class);
55 82
    
83
    static {
84
        documentBuilderFactory = DocumentBuilderFactory.newInstance();
85
        documentBuilderFactory.setNamespaceAware(true);
86
        try {
87
            builder = documentBuilderFactory.newDocumentBuilder();
88
        } catch (ParserConfigurationException e) {
89
            e.printStackTrace();
90
        }
91
        xpathFactory = XPathFactory.newInstance();
92
        xpath = xpathFactory.newXPath();
93
    }
94
    
56 95
    /**
57 96
     * Constructor
58 97
     */
59
    public SolrIndex() {
60
        
98
    public SolrIndex(XMLNamespaceConfig xmlNamespaceConfig, List<SolrField> sysmetaSolrFields)
            throws XPathExpressionException, ParserConfigurationException {
        // Store both collaborators first; init() reads them from the fields.
        this.sysmetaSolrFields = sysmetaSolrFields;
        this.xmlNamespaceConfig = xmlNamespaceConfig;
        init();
    }
62 104
    
105
    private void init() throws ParserConfigurationException, XPathExpressionException {
106
        xpath.setNamespaceContext(xmlNamespaceConfig);
107
        initExpressions();
108
    }
109

  
110
    private void initExpressions() throws XPathExpressionException {
111
        for (SolrField field : sysmetaSolrFields) {
112
            field.initExpression(xpath);
113
        }
114

  
115
    }
63 116
    
117
    
64 118
    /**
65 119
     * Get the list of the Subprocessors in this index.
66 120
     * @return the list of the Subprocessors.
......
74 128
     * @param subprocessorList  the list will be set.
75 129
     */
76 130
    public void setSubprocessors(List<IDocumentSubprocessor> subprocessorList) {
77
        /*for (IDocumentSubprocessor subprocessor : subprocessorList) {
131
        for (IDocumentSubprocessor subprocessor : subprocessorList) {
78 132
            subprocessor.initExpression(xpath);
79
        }*/
133
        }
80 134
        this.subprocessors = subprocessorList;
81 135
    }
136
    
137
    /**
138
     * Generate the index for the given information
139
     * @param id
140
     * @param systemMetaDataStream
141
     * @param dataStream
142
     * @return
143
     * @throws IOException
144
     * @throws SAXException
145
     * @throws ParserConfigurationException
146
     * @throws XPathExpressionException
147
     * @throws EncoderException
148
     */
149
    private Map<String, SolrDoc> process(String id, InputStream systemMetaDataStream, InputStream dataStream)
150
                    throws IOException, SAXException, ParserConfigurationException,
151
                    XPathExpressionException{
152

  
153
        // Load the System Metadata document
154
        Document sysMetaDoc = generateXmlDocument(systemMetaDataStream);
155
        if (sysMetaDoc == null) {
156
            log.error("Could not load System metadata for ID: " + id);
157
            return null;
158
        }
159

  
160
        // Extract the field values from the System Metadata
161
        List<SolrElementField> sysSolrFields = processSysmetaFields(sysMetaDoc, id);
162
        SolrDoc indexDocument = new SolrDoc(sysSolrFields);
163
        Map<String, SolrDoc> docs = new HashMap<String, SolrDoc>();
164
        docs.put(id, indexDocument);
165

  
166
        // Determine if subprocessors are available for this ID
167
        if (subprocessors != null) {
168
                    // for each subprocessor loaded from the spring config
169
                    for (IDocumentSubprocessor subprocessor : subprocessors) {
170
                        // Does this subprocessor apply?
171
                        if (subprocessor.canProcess(sysMetaDoc)) {
172
                            // if so, then extract the additional information from the
173
                            // document.
174
                            try {
175
                                // docObject = the resource map document or science
176
                                // metadata document.
177
                                // note that resource map processing touches all objects
178
                                // referenced by the resource map.
179
                                Document docObject = generateXmlDocument(dataStream);
180
                                if (docObject == null) {
181
                                    log.error("Could not load OBJECT for ID " + id );
182
                                } else {
183
                                    docs = subprocessor.processDocument(id, docs, docObject);
184
                                }
185
                            } catch (Exception e) {
186
                                log.error(e.getStackTrace().toString());
187
                            }
188
                        }
189
                    }
190
       }
191

  
192
       // TODO: in the XPathDocumentParser class in d1_cn_index_process module,
193
       // merge is only for resource map. We need more work here.
194
       for (SolrDoc mergeDoc : docs.values()) {
195
           if (!mergeDoc.isMerged()) {
196
                 //mergeWithIndexedDocument(mergeDoc);
197
           }
198
       }
199

  
200
       //SolrElementAdd addCommand = getAddCommand(new ArrayList<SolrDoc>(docs.values()));
201
               
202
       return docs;
203
    }
204
    
205
    /*
206
     * Generate a Document from the InputStream
207
     */
208
    private Document generateXmlDocument(InputStream smdStream) throws SAXException {
209
        Document doc = null;
210

  
211
        try {
212
            doc = builder.parse(smdStream);
213
        } catch (IOException e) {
214
            log.error(e.getMessage(), e);
215
        }
216

  
217
        return doc;
218
    }
219
    
220
    /*
221
     * Index the fields of the system metadata
222
     */
223
    private List<SolrElementField> processSysmetaFields(Document doc, String identifier) {
224

  
225
        List<SolrElementField> fieldList = new ArrayList<SolrElementField>();
226
        // solrFields is the list of fields defined in the application context
227
       
228
        for (SolrField field : sysmetaSolrFields) {
229
            try {
230
                // the field.getFields method can return a single value or
231
                // multiple values for multi-valued fields
232
                // or can return multiple SOLR document fields.
233
                fieldList.addAll(field.getFields(doc, identifier));
234
            } catch (Exception e) {
235
                e.printStackTrace();
236
            }
237
        }
238
        return fieldList;
239

  
240
    }
241
    
242
    /**
243
     * Generate indexes for a newly inserted document.
244
     * @param pid  the id of this document
245
     * @param systemMetadata  the system metadata associated with the data object
246
     * @param data  the data object itself
247
     * @throws SolrServerException 
248
     */
249
    public void insert(String pid, InputStream systemMetadata, InputStream data) 
250
                    throws IOException, SAXException, ParserConfigurationException,
251
                    XPathExpressionException, SolrServerException {
252
        Map<String, SolrDoc> docs = process(pid, systemMetadata, data);
253
        
254
        //transform the Map to the SolrInputDocument which can be used by the solr server
255
        if(docs != null) {
256
            Set<String> ids = docs.keySet();
257
            for(String id : ids) {
258
                SolrInputDocument solrDoc = new SolrInputDocument();
259
                if(id != null) {
260
                    SolrDoc doc = docs.get(id);
261
                    if(doc != null) {
262
                        List<SolrElementField> list = doc.getFieldList();
263
                        if(list != null) {
264
                            Iterator<SolrElementField> iterator = list.iterator();
265
                            while (iterator.hasNext()) {
266
                                SolrElementField field = iterator.next();
267
                                if(field != null) {
268
                                    String value = field.getValue();
269
                                    String name = field.getName();
270
                                    solrDoc.addField(name, value);
271
                                }
272
                            }
273
                        }
274
                    }
275
                }
276
                if(!solrDoc.isEmpty()) {
277
                    solrServer.add(solrDoc);
278
                }
279
            }
280
        }
281
    }
82 282
}
metacat-index/src/main/resources/index-processor-context.xml
40 40
  <constructor-arg>
41 41
   <list>
42 42
    <bean class="edu.ucsb.nceas.metacat.index.SolrIndex">
43
     <!--  <constructor-arg name="fields" ref="xpath_system_metadata_100">
43
     <constructor-arg name="sysmetaSolrFields" ref="xpath_system_metadata_100">
44 44
     </constructor-arg>
45 45
     <constructor-arg name="xmlNamespaceConfig" ref="xmlNamespaceConfig" />
46
     <property name="solrBaseUri" value="${solr.base.uri}" />
46
     <!--<property name="solrBaseUri" value="${solr.base.uri}" />
47 47
     <property name="httpService" ref="httpService" /> -->
48 48
     <property name="subprocessors">
49 49
      <list>

Also available in: Unified diff