Project

General

Profile

1
/**
 *  '$RCSfile$'
 *    Purpose: A class that inserts, updates and removes the index
 *             entries of documents on a SOLR server
 *  Copyright: 2000 Regents of the University of California and the
 *             National Center for Ecological Analysis and Synthesis
 *    Authors: Jivka Bojilova, Matt Jones
 *
 *   '$Author: leinfelder $'
 *     '$Date: 2011-11-02 20:40:12 -0700 (Wed, 02 Nov 2011) $'
 * '$Revision: 6595 $'
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
27
package edu.ucsb.nceas.metacat.index;
28

    
29
import java.io.ByteArrayInputStream;
30
import java.io.IOException;
31
import java.io.InputStream;
32
import java.util.ArrayList;
33
import java.util.HashMap;
34
import java.util.Iterator;
35
import java.util.List;
36
import java.util.Map;
37
import java.util.Set;
38

    
39
import javax.xml.parsers.DocumentBuilder;
40
import javax.xml.parsers.DocumentBuilderFactory;
41
import javax.xml.parsers.ParserConfigurationException;
42
import javax.xml.xpath.XPath;
43
import javax.xml.xpath.XPathExpressionException;
44
import javax.xml.xpath.XPathFactory;
45

    
46
import org.apache.commons.codec.EncoderException;
47
import org.apache.commons.io.output.ByteArrayOutputStream;
48
import org.apache.commons.logging.Log;
49
import org.apache.commons.logging.LogFactory;
50
import org.apache.solr.client.solrj.SolrQuery;
51
import org.apache.solr.client.solrj.SolrServer;
52
import org.apache.solr.client.solrj.SolrServerException;
53
import org.apache.solr.client.solrj.response.QueryResponse;
54
import org.apache.solr.client.solrj.response.UpdateResponse;
55
import org.apache.solr.common.SolrDocument;
56
import org.apache.solr.common.SolrDocumentList;
57
import org.apache.solr.common.SolrInputDocument;
58
import org.apache.solr.common.util.NamedList;
59
import org.dataone.cn.indexer.XMLNamespaceConfig;
60
import org.dataone.cn.indexer.parser.IDocumentSubprocessor;
61
import org.dataone.cn.indexer.parser.SolrField;
62
import org.dataone.cn.indexer.solrhttp.SolrDoc;
63
import org.dataone.cn.indexer.solrhttp.SolrElementField;
64
import org.dataone.service.types.v1.Identifier;
65
import org.dataone.service.types.v1.SystemMetadata;
66
import org.dataone.service.util.TypeMarshaller;
67
import org.jibx.runtime.JiBXException;
68
import org.w3c.dom.Document;
69
import org.w3c.dom.NameList;
70
import org.xml.sax.SAXException;
71

    
72
/**
73
 * A class does insert, update and remove indexes to a SOLR server
74
 * @author tao
75
 *
76
 */
77
public class SolrIndex {
78
            
79
    public static final String ID = "id";
80
    private static final String IDQUERY = ID+":*";
81
    private List<IDocumentSubprocessor> subprocessors = null;
82
    private SolrServer solrServer = null;
83
    private XMLNamespaceConfig xmlNamespaceConfig = null;
84
    private List<SolrField> sysmetaSolrFields = null;
85

    
86
    private static DocumentBuilderFactory documentBuilderFactory = null;
87
    private static DocumentBuilder builder = null;
88

    
89
    private static XPathFactory xpathFactory = null;
90
    private static XPath xpath = null;
91
    Log log = LogFactory.getLog(SolrIndex.class);
92
    
93
    static {
94
        documentBuilderFactory = DocumentBuilderFactory.newInstance();
95
        documentBuilderFactory.setNamespaceAware(true);
96
        try {
97
            builder = documentBuilderFactory.newDocumentBuilder();
98
        } catch (ParserConfigurationException e) {
99
            e.printStackTrace();
100
        }
101
        xpathFactory = XPathFactory.newInstance();
102
        xpath = xpathFactory.newXPath();
103
    }
104
    
105
    /**
106
     * Constructor
107
     * @throws SAXException 
108
     * @throws IOException 
109
     */
110
    public SolrIndex(List<SolrField> sysmetaSolrFields, XMLNamespaceConfig xmlNamespaceConfig)
111
                    throws XPathExpressionException, ParserConfigurationException, IOException, SAXException {
112
         this.xmlNamespaceConfig = xmlNamespaceConfig;
113
         this.sysmetaSolrFields = sysmetaSolrFields;
114
         init();
115
    }
116
    
117
    private void init() throws ParserConfigurationException, XPathExpressionException {
118
        xpath.setNamespaceContext(xmlNamespaceConfig);
119
        initExpressions();
120
    }
121

    
122
    private void initExpressions() throws XPathExpressionException {
123
        for (SolrField field : sysmetaSolrFields) {
124
            field.initExpression(xpath);
125
        }
126

    
127
    }
128
    
129
    
130
    /**
131
     * Get the list of the Subprocessors in this index.
132
     * @return the list of the Subprocessors.
133
     */
134
    public List<IDocumentSubprocessor> getSubprocessors() {
135
        return subprocessors;
136
    }
137

    
138
    /**
139
     * Set the list of Subprocessors.
140
     * @param subprocessorList  the list will be set.
141
     */
142
    public void setSubprocessors(List<IDocumentSubprocessor> subprocessorList) {
143
        for (IDocumentSubprocessor subprocessor : subprocessorList) {
144
            subprocessor.initExpression(xpath);
145
        }
146
        this.subprocessors = subprocessorList;
147
    }
148
    
149
    /**
150
     * Generate the index for the given information
151
     * @param id
152
     * @param systemMetadata
153
     * @param dataStream
154
     * @return
155
     * @throws IOException
156
     * @throws SAXException
157
     * @throws ParserConfigurationException
158
     * @throws XPathExpressionException
159
     * @throws JiBXException 
160
     * @throws EncoderException
161
     */
162
    private Map<String, SolrDoc> process(String id, SystemMetadata systemMetadata, InputStream dataStream)
163
                    throws IOException, SAXException, ParserConfigurationException,
164
                    XPathExpressionException, JiBXException{
165

    
166
        // Load the System Metadata document
167
        ByteArrayOutputStream systemMetadataOutputStream = new ByteArrayOutputStream();
168
        TypeMarshaller.marshalTypeToOutputStream(systemMetadata, systemMetadataOutputStream);
169
        ByteArrayInputStream systemMetadataStream = new ByteArrayInputStream(systemMetadataOutputStream.toByteArray());
170
        Document sysMetaDoc = generateXmlDocument(systemMetadataStream);
171
        if (sysMetaDoc == null) {
172
            log.error("Could not load System metadata for ID: " + id);
173
            return null;
174
        }
175

    
176
        // Extract the field values from the System Metadata
177
        List<SolrElementField> sysSolrFields = processSysmetaFields(sysMetaDoc, id);
178
        SolrDoc indexDocument = new SolrDoc(sysSolrFields);
179
        Map<String, SolrDoc> docs = new HashMap<String, SolrDoc>();
180
        docs.put(id, indexDocument);
181

    
182
        // Determine if subprocessors are available for this ID
183
        if (subprocessors != null) {
184
                    // for each subprocessor loaded from the spring config
185
                    for (IDocumentSubprocessor subprocessor : subprocessors) {
186
                        // Does this subprocessor apply?
187
                        if (subprocessor.canProcess(sysMetaDoc)) {
188
                            // if so, then extract the additional information from the
189
                            // document.
190
                            try {
191
                                // docObject = the resource map document or science
192
                                // metadata document.
193
                                // note that resource map processing touches all objects
194
                                // referenced by the resource map.
195
                                Document docObject = generateXmlDocument(dataStream);
196
                                if (docObject == null) {
197
                                    log.error("Could not load OBJECT for ID " + id );
198
                                } else {
199
                                    docs = subprocessor.processDocument(id, docs, docObject);
200
                                }
201
                            } catch (Exception e) {
202
                                log.error(e.getStackTrace().toString());
203
                            }
204
                        }
205
                    }
206
       }
207

    
208
       // TODO: in the XPathDocumentParser class in d1_cn_index_process module,
209
       // merge is only for resource map. We need more work here.
210
       for (SolrDoc mergeDoc : docs.values()) {
211
           if (!mergeDoc.isMerged()) {
212
                 //mergeWithIndexedDocument(mergeDoc);
213
           }
214
       }
215

    
216
       //SolrElementAdd addCommand = getAddCommand(new ArrayList<SolrDoc>(docs.values()));
217
               
218
       return docs;
219
    }
220
    
221
    /*
222
     * Generate a Document from the InputStream
223
     */
224
    private Document generateXmlDocument(InputStream smdStream) throws SAXException {
225
        Document doc = null;
226

    
227
        try {
228
            doc = builder.parse(smdStream);
229
        } catch (IOException e) {
230
            log.error(e.getMessage(), e);
231
        }
232

    
233
        return doc;
234
    }
235
    
236
    /*
237
     * Index the fields of the system metadata
238
     */
239
    private List<SolrElementField> processSysmetaFields(Document doc, String identifier) {
240

    
241
        List<SolrElementField> fieldList = new ArrayList<SolrElementField>();
242
        // solrFields is the list of fields defined in the application context
243
       
244
        for (SolrField field : sysmetaSolrFields) {
245
            try {
246
                // the field.getFields method can return a single value or
247
                // multiple values for multi-valued fields
248
                // or can return multiple SOLR document fields.
249
                fieldList.addAll(field.getFields(doc, identifier));
250
            } catch (Exception e) {
251
                e.printStackTrace();
252
            }
253
        }
254
        return fieldList;
255

    
256
    }
257
    
258
    /**
259
     * Check the parameters of the insert or update methods.
260
     * @param pid
261
     * @param systemMetadata
262
     * @param data
263
     * @throws SolrServerException
264
     */
265
    private void checkParams(String pid, SystemMetadata systemMetadata, InputStream data) throws SolrServerException {
266
        if(pid == null || pid.trim().equals("")) {
267
            throw new SolrServerException("The identifier of the indexed document should not be null or blank.");
268
        }
269
        if(systemMetadata == null) {
270
            throw new SolrServerException("The system metadata of the indexed document should not be null.");
271
        }
272
        if(data == null) {
273
            throw new SolrServerException("The indexed document itself should not be null.");
274
        }
275
    }
276
    
277
    /**
278
     * Insert the indexes for a document.
279
     * @param pid  the id of this document
280
     * @param systemMetadata  the system metadata associated with the data object
281
     * @param data  the data object itself
282
     * @throws SolrServerException 
283
     * @throws JiBXException 
284
     */
285
    private synchronized void insert(String pid, SystemMetadata systemMetadata, InputStream data) 
286
                    throws IOException, SAXException, ParserConfigurationException,
287
                    XPathExpressionException, SolrServerException, JiBXException {
288
        checkParams(pid, systemMetadata, data);
289
        Map<String, SolrDoc> docs = process(pid, systemMetadata, data);
290
        
291
        //transform the Map to the SolrInputDocument which can be used by the solr server
292
        if(docs != null) {
293
            Set<String> ids = docs.keySet();
294
            for(String id : ids) {
295
                SolrInputDocument solrDoc = new SolrInputDocument();
296
                if(id != null) {
297
                    SolrDoc doc = docs.get(id);
298
                    if(doc != null) {
299
                        List<SolrElementField> list = doc.getFieldList();
300
                        if(list != null) {
301
                            //solrDoc.addField(METACATPIDFIELD, pid);
302
                            Iterator<SolrElementField> iterator = list.iterator();
303
                            while (iterator.hasNext()) {
304
                                SolrElementField field = iterator.next();
305
                                if(field != null) {
306
                                    String value = field.getValue();
307
                                    String name = field.getName();
308
                                    //System.out.println("add name/value pair - "+name+"/"+value);
309
                                    solrDoc.addField(name, value);
310
                                }
311
                            }
312
                        }
313
                    }
314
                }
315
                if(!solrDoc.isEmpty()) {
316
                    UpdateResponse response = solrServer.add(solrDoc);
317
                    solrServer.commit();
318
                    //System.out.println("=================the response is:\n"+response.toString());
319
                }
320
            }
321
        }
322
    }
323
    
324
    /**
325
     * Update the solr index. This method handles the three scenarios:
326
     * 1. Archive (or delete) - if the the system metadata shows the value of the archive is true,
327
     *    remove the index for the document and its previous versions if it has.
328
     * 2. Update an existing doc - if the the system metadata shows the value of the archive is false and it has an obsoletes,
329
     *    remove the index for the previous version(s) and generate new index for the doc.
330
     * 3. Add a new doc - if the system metadata shows the value of the archive is false and it hasn't an obsoletes, generate the
331
     *    index for the doc.
332
     * @param pid  the id of the document
333
     * @param obsoleteIds  the chain of the obsoletes by this id
334
     * @param systemMetadata  the system metadata associated with the data object
335
     * @param data  the data object itself
336
     * @throws SolrServerException 
337
     * @throws JiBXException 
338
     */
339
    public void update(String pid, List<String> obsoleteIds, SystemMetadata systemMetadata, InputStream data) 
340
                    throws IOException, SAXException, ParserConfigurationException,
341
                    XPathExpressionException, SolrServerException, JiBXException {
342
        checkParams(pid, systemMetadata, data);
343
        boolean isArchive = systemMetadata.getArchived();
344
        if(isArchive) {
345
            //archive(delete)
346
            Identifier obsolete = systemMetadata.getObsoletes();
347
            if(obsolete != null) {
348
                removeObsoletesChain(obsolete.getValue(), obsoleteIds);
349
            }
350
            remove(pid);
351
        } else {
352
            Identifier obsolete = systemMetadata.getObsoletes();
353
            if(obsolete != null) {
354
                removeObsoletesChain(obsolete.getValue(), obsoleteIds);
355
            }
356
            //generate index for either add or update.
357
            insert(pid, systemMetadata, data);
358
        }
359
    }
360
    
361
    
362
    private void removeObsoletesChain(String obsoleteId, List<String> obsoleteIdChain) throws SolrServerException, IOException {
363
        if(obsoleteId != null && !obsoleteId.trim().equals("")) {
364
            if(obsoleteIdChain == null || obsoleteIdChain.isEmpty()) {
365
                throw new SolrServerException("SolrIndex.removeObsoletesChain - The obsoletes chain can't be null or empty since the system metadata already has the obsoletes element."); 
366
            }
367
            if(!obsoleteIdChain.contains(obsoleteId)) {
368
                throw new SolrServerException("SolrIndex.removeObsoletesChain - The obsoletes elment in the system metadata is not in the obsoleteId chain"); 
369
            }
370
            remove(obsoleteIdChain);
371
        } else {
372
            throw new SolrServerException("SolrIndex.removeObsoletesChain - The obsolete id should be null."); 
373
        }  
374
    }
375
    
376
    /**
377
     * Remove all the indexes associated with the pids in the list.
378
     * @param pidList
379
     * @throws IOException
380
     * @throws SolrServerException
381
     */
382
    private void remove(List<String> pidList) throws IOException, SolrServerException {
383
        if(pidList != null) {
384
            for(String id : pidList) {
385
                remove(id);
386
            }
387
        }
388
    }
389
 
390
    /**
391
     * Remove the indexed associated with specified pid.
392
     * @param pid  the pid which the indexes are associated with
393
     * @throws IOException
394
     * @throws SolrServerException
395
     */
396
    public void remove(String pid) throws IOException, SolrServerException {
397
        if(pid != null && !pid.trim().equals("")) {
398
            solrServer.deleteById(pid);
399
            solrServer.commit();
400
        }
401
       
402
       
403
    }
404

    
405
    /**
406
     * Get the solrServer
407
     * @return
408
     */
409
    public SolrServer getSolrServer() {
410
        return solrServer;
411
    }
412

    
413
    /**
414
     * Set the solrServer. 
415
     * @param solrServer
416
     */
417
    public void setSolrServer(SolrServer solrServer) {
418
        this.solrServer = solrServer;
419
    }
420
    
421
    /**
422
     * Get all indexed ids in the solr server. 
423
     * @return an empty list if there is no index.
424
     * @throws SolrServerException
425
     */
426
    public List<String> getSolrIds() throws SolrServerException {
427
        List<String> list = new ArrayList<String>();
428
        SolrQuery query = new SolrQuery(IDQUERY); 
429
        query.setRows(Integer.MAX_VALUE); 
430
        query.setFields(ID); 
431
        QueryResponse response = solrServer.query(query); 
432
        SolrDocumentList docs = response.getResults();
433
        if(docs != null) {
434
            for(SolrDocument doc :docs) {
435
                String identifier = (String)doc.getFieldValue(ID);
436
                //System.out.println("======================== "+identifier);
437
                list.add(identifier);
438
            }
439
        }
440
        return list;
441
    }
442
}
(4-4/5)