Project

General

Profile

1
/**
2
 *  Copyright: 2013 Regents of the University of California and the
3
 *             National Center for Ecological Analysis and Synthesis
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 */
19
package edu.ucsb.nceas.metacat.index;
20

    
21
import java.io.ByteArrayInputStream;
22
import java.io.FileNotFoundException;
23
import java.io.IOException;
24
import java.io.InputStream;
25
import java.util.ArrayList;
26
import java.util.Calendar;
27
import java.util.HashMap;
28
import java.util.Iterator;
29
import java.util.List;
30
import java.util.Map;
31
import java.util.Set;
32

    
33
import javax.xml.parsers.DocumentBuilder;
34
import javax.xml.parsers.DocumentBuilderFactory;
35
import javax.xml.parsers.ParserConfigurationException;
36
import javax.xml.xpath.XPath;
37
import javax.xml.xpath.XPathExpressionException;
38
import javax.xml.xpath.XPathFactory;
39

    
40
import org.apache.commons.codec.EncoderException;
41
import org.apache.commons.io.output.ByteArrayOutputStream;
42
import org.apache.commons.lang.StringUtils;
43
import org.apache.commons.logging.Log;
44
import org.apache.commons.logging.LogFactory;
45
import org.apache.solr.client.solrj.SolrQuery;
46
import org.apache.solr.client.solrj.SolrServer;
47
import org.apache.solr.client.solrj.SolrServerException;
48
import org.apache.solr.client.solrj.response.QueryResponse;
49
import org.apache.solr.client.solrj.response.UpdateResponse;
50
import org.apache.solr.common.SolrDocument;
51
import org.apache.solr.common.SolrDocumentList;
52
import org.apache.solr.common.SolrInputDocument;
53
import org.dataone.cn.indexer.XMLNamespaceConfig;
54
import org.dataone.cn.indexer.parser.IDocumentSubprocessor;
55
import org.dataone.cn.indexer.parser.SolrField;
56
import org.dataone.cn.indexer.resourcemap.ResourceEntry;
57
import org.dataone.cn.indexer.resourcemap.ResourceMap;
58
import org.dataone.cn.indexer.solrhttp.SolrDoc;
59
import org.dataone.cn.indexer.solrhttp.SolrElementField;
60
import org.dataone.service.exceptions.NotFound;
61
import org.dataone.service.exceptions.NotImplemented;
62
import org.dataone.service.exceptions.ServiceFailure;
63
import org.dataone.service.exceptions.UnsupportedType;
64
import org.dataone.service.types.v1.Event;
65
import org.dataone.service.types.v1.Identifier;
66
import org.dataone.service.types.v1.SystemMetadata;
67
import org.dataone.service.util.TypeMarshaller;
68
import org.dspace.foresite.OREParserException;
69
import org.jibx.runtime.JiBXException;
70
import org.w3c.dom.Document;
71
import org.xml.sax.SAXException;
72

    
73
import edu.ucsb.nceas.metacat.common.index.event.IndexEvent;
74
import edu.ucsb.nceas.metacat.index.event.EventlogFactory;
75
import edu.ucsb.nceas.metacat.index.resourcemap.ResourceMapSubprocessor;
76

    
77
/**
78
 *  A class that inserts, updates and removes indexes on a SOLR server
79
 * @author tao
80
 *
81
 */
82
public class SolrIndex {
83
            
84
    public static final String ID = "id";
85
    private static final String IDQUERY = ID+":*";
86
    private List<IDocumentSubprocessor> subprocessors = null;
87
    private SolrServer solrServer = null;
88
    private XMLNamespaceConfig xmlNamespaceConfig = null;
89
    private List<SolrField> sysmetaSolrFields = null;
90

    
91
    private static DocumentBuilderFactory documentBuilderFactory = null;
92
    private static DocumentBuilder builder = null;
93

    
94
    private static XPathFactory xpathFactory = null;
95
    private static XPath xpath = null;
96
    Log log = LogFactory.getLog(SolrIndex.class);
97
    
98
    static {
99
        documentBuilderFactory = DocumentBuilderFactory.newInstance();
100
        documentBuilderFactory.setNamespaceAware(true);
101
        try {
102
            builder = documentBuilderFactory.newDocumentBuilder();
103
        } catch (ParserConfigurationException e) {
104
            e.printStackTrace();
105
        }
106
        xpathFactory = XPathFactory.newInstance();
107
        xpath = xpathFactory.newXPath();
108
    }
109
    
110
    /**
     * Creates an index object for the given namespace configuration and
     * system metadata field definitions, then initializes the XPath
     * expressions of those fields.
     * @param xmlNamespaceConfig  the namespace context installed on the XPath object
     * @param sysmetaSolrFields  the fields extracted from the system metadata
     * @throws XPathExpressionException if the initialization of an expression fails
     * @throws ParserConfigurationException declared by the initialization step
     * @throws IOException declared for callers; not thrown by the code in this constructor
     * @throws SAXException declared for callers; not thrown by the code in this constructor
     */
    public SolrIndex(XMLNamespaceConfig xmlNamespaceConfig, List<SolrField> sysmetaSolrFields)
                    throws XPathExpressionException, ParserConfigurationException, IOException, SAXException {
        this.sysmetaSolrFields = sysmetaSolrFields;
        this.xmlNamespaceConfig = xmlNamespaceConfig;
        init();
    }
121
    
122
    private void init() throws ParserConfigurationException, XPathExpressionException {
123
        xpath.setNamespaceContext(xmlNamespaceConfig);
124
        initExpressions();
125
    }
126

    
127
    private void initExpressions() throws XPathExpressionException {
128
        for (SolrField field : sysmetaSolrFields) {
129
            field.initExpression(xpath);
130
        }
131

    
132
    }
133
    
134
    
135
    /**
136
     * Get the list of the Subprocessors in this index.
137
     * @return the list of the Subprocessors.
138
     */
139
    public List<IDocumentSubprocessor> getSubprocessors() {
140
        return subprocessors;
141
    }
142

    
143
    /**
144
     * Set the list of Subprocessors.
145
     * @param subprocessorList  the list will be set.
146
     */
147
    public void setSubprocessors(List<IDocumentSubprocessor> subprocessorList) {
148
        for (IDocumentSubprocessor subprocessor : subprocessorList) {
149
            subprocessor.initExpression(xpath);
150
        }
151
        this.subprocessors = subprocessorList;
152
    }
153
    
154
    /**
155
     * Generate the index for the given information
156
     * @param id
157
     * @param systemMetadata
158
     * @param dataStream
159
     * @return
160
     * @throws IOException
161
     * @throws SAXException
162
     * @throws ParserConfigurationException
163
     * @throws XPathExpressionException
164
     * @throws JiBXException 
165
     * @throws SolrServerException 
166
     * @throws EncoderException
167
     * @throws UnsupportedType 
168
     * @throws NotFound 
169
     * @throws NotImplemented 
170
     */
171
    private Map<String, SolrDoc> process(String id, SystemMetadata systemMetadata, InputStream dataStream)
172
                    throws IOException, SAXException, ParserConfigurationException,
173
                    XPathExpressionException, JiBXException, EncoderException, SolrServerException, NotImplemented, NotFound, UnsupportedType{
174

    
175
        // Load the System Metadata document
176
        ByteArrayOutputStream systemMetadataOutputStream = new ByteArrayOutputStream();
177
        TypeMarshaller.marshalTypeToOutputStream(systemMetadata, systemMetadataOutputStream);
178
        ByteArrayInputStream systemMetadataStream = new ByteArrayInputStream(systemMetadataOutputStream.toByteArray());
179
        Document sysMetaDoc = generateXmlDocument(systemMetadataStream);
180
        if (sysMetaDoc == null) {
181
            log.error("Could not load System metadata for ID: " + id);
182
            return null;
183
        }
184

    
185
        // Extract the field values from the System Metadata
186
        List<SolrElementField> sysSolrFields = processSysmetaFields(sysMetaDoc, id);
187
        SolrDoc indexDocument = new SolrDoc(sysSolrFields);
188
        Map<String, SolrDoc> docs = new HashMap<String, SolrDoc>();
189
        docs.put(id, indexDocument);
190

    
191
        // Determine if subprocessors are available for this ID
192
        if (subprocessors != null) {
193
                    // for each subprocessor loaded from the spring config
194
                    for (IDocumentSubprocessor subprocessor : subprocessors) {
195
                        // Does this subprocessor apply?
196
                        if (subprocessor.canProcess(sysMetaDoc)) {
197
                            // if so, then extract the additional information from the
198
                            // document.
199
                            try {
200
                                // docObject = the resource map document or science
201
                                // metadata document.
202
                                // note that resource map processing touches all objects
203
                                // referenced by the resource map.
204
                                Document docObject = generateXmlDocument(dataStream);
205
                                if (docObject == null) {
206
                                    throw new Exception("Could not load OBJECT for ID " + id );
207
                                } else {
208
                                    docs = subprocessor.processDocument(id, docs, docObject);
209
                                }
210
                            } catch (Exception e) {
211
                                log.error(e.getStackTrace().toString());
212
                                throw new SolrServerException(e.getMessage());
213
                            }
214
                        }
215
                    }
216
       }
217

    
218
       // TODO: in the XPathDocumentParser class in d1_cn_index_process module,
219
       // merge is only for resource map. We need more work here.
220
       for (SolrDoc mergeDoc : docs.values()) {
221
           if (!mergeDoc.isMerged()) {
222
                 mergeWithIndexedDocument(mergeDoc);
223
           }
224
       }
225

    
226
       //SolrElementAdd addCommand = getAddCommand(new ArrayList<SolrDoc>(docs.values()));
227
               
228
       return docs;
229
    }
230
    
231
    /**
232
     * Merge updates with existing solr documents
233
     * 
234
     * This method appears to re-set the data package field data into the
235
     * document about to be updated in the solr index. Since packaging
236
     * information is derived from the package document (resource map), this
237
     * information is not present when processing a document contained in a data
238
     * package. This method replaces those values from the existing solr index
239
     * record for the document being processed. -- sroseboo, 1-18-12
240
     * 
241
     * @param indexDocument
242
     * @return
243
     * @throws IOException
244
     * @throws EncoderException
245
     * @throws XPathExpressionException
246
     * @throws SAXException 
247
     * @throws ParserConfigurationException 
248
     * @throws SolrServerException 
249
     * @throws UnsupportedType 
250
     * @throws NotFound 
251
     * @throws NotImplemented 
252
     */
253
    // TODO:combine merge function with resourcemap merge function
254

    
255
    private SolrDoc mergeWithIndexedDocument(SolrDoc indexDocument) throws IOException,
256
            EncoderException, XPathExpressionException, SolrServerException, ParserConfigurationException, SAXException, NotImplemented, NotFound, UnsupportedType {
257
        List<String> ids = new ArrayList<String>();
258
        ids.add(indexDocument.getIdentifier());
259
        List<SolrDoc> indexedDocuments = ResourceMapSubprocessor.getSolrDocs(ids);
260
        SolrDoc indexedDocument = indexedDocuments == null || indexedDocuments.size() <= 0 ? null
261
                : indexedDocuments.get(0);
262
        if (indexedDocument == null || indexedDocument.getFieldList().size() <= 0) {
263
            return indexDocument;
264
        } else {
265
            for (SolrElementField field : indexedDocument.getFieldList()) {
266
                if ((field.getName().equals(SolrElementField.FIELD_ISDOCUMENTEDBY)
267
                        || field.getName().equals(SolrElementField.FIELD_DOCUMENTS) || field
268
                        .getName().equals(SolrElementField.FIELD_RESOURCEMAP))
269
                        && !indexDocument.hasFieldWithValue(field.getName(), field.getValue())) {
270
                    indexDocument.addField(field);
271
                }
272
            }
273

    
274
            indexDocument.setMerged(true);
275
            return indexDocument;
276
        }
277
    }
278
    
279
    /*
280
     * Generate a Document from the InputStream
281
     */
282
    private Document generateXmlDocument(InputStream smdStream) throws SAXException {
283
        Document doc = null;
284

    
285
        try {
286
            doc = builder.parse(smdStream);
287
        } catch (IOException e) {
288
            log.error(e.getMessage(), e);
289
        }
290

    
291
        return doc;
292
    }
293
    
294
    /*
295
     * Index the fields of the system metadata
296
     */
297
    private List<SolrElementField> processSysmetaFields(Document doc, String identifier) {
298

    
299
        List<SolrElementField> fieldList = new ArrayList<SolrElementField>();
300
        // solrFields is the list of fields defined in the application context
301
       
302
        for (SolrField field : sysmetaSolrFields) {
303
            try {
304
                // the field.getFields method can return a single value or
305
                // multiple values for multi-valued fields
306
                // or can return multiple SOLR document fields.
307
                fieldList.addAll(field.getFields(doc, identifier));
308
            } catch (Exception e) {
309
                e.printStackTrace();
310
            }
311
        }
312
        return fieldList;
313

    
314
    }
315
    
316
    /**
317
     * Check the parameters of the insert or update methods.
318
     * @param pid
319
     * @param systemMetadata
320
     * @param data
321
     * @throws SolrServerException
322
     */
323
    private void checkParams(String pid, SystemMetadata systemMetadata, InputStream data) throws SolrServerException {
324
        if(pid == null || pid.trim().equals("")) {
325
            throw new SolrServerException("The identifier of the indexed document should not be null or blank.");
326
        }
327
        if(systemMetadata == null) {
328
            throw new SolrServerException("The system metadata of the indexed document should not be null.");
329
        }
330
        if(data == null) {
331
            throw new SolrServerException("The indexed document itself should not be null.");
332
        }
333
    }
334
    
335
    /**
336
     * Insert the indexes for a document.
337
     * @param pid  the id of this document
338
     * @param systemMetadata  the system metadata associated with the data object
339
     * @param data  the data object itself
340
     * @throws SolrServerException 
341
     * @throws JiBXException 
342
     * @throws EncoderException 
343
     * @throws UnsupportedType 
344
     * @throws NotFound 
345
     * @throws NotImplemented 
346
     */
347
    private synchronized void insert(String pid, SystemMetadata systemMetadata, InputStream data) 
348
                    throws IOException, SAXException, ParserConfigurationException,
349
                    XPathExpressionException, SolrServerException, JiBXException, EncoderException, NotImplemented, NotFound, UnsupportedType {
350
        checkParams(pid, systemMetadata, data);
351
        Map<String, SolrDoc> docs = process(pid, systemMetadata, data);
352
        
353
        //transform the Map to the SolrInputDocument which can be used by the solr server
354
        if(docs != null) {
355
            Set<String> ids = docs.keySet();
356
            for(String id : ids) {
357
                if(id != null) {
358
                    SolrDoc doc = docs.get(id);
359
                    insertToIndex(doc);
360
                }
361
                
362
            }
363
        }
364
    }
365
    
366
    /*
367
     * Insert a SolrDoc to the solr server.
368
     */
369
    private synchronized void insertToIndex(SolrDoc doc) throws SolrServerException, IOException {
370
        if(doc != null ) {
371
            SolrInputDocument solrDoc = new SolrInputDocument();
372
            List<SolrElementField> list = doc.getFieldList();
373
            if(list != null) {
374
                //solrDoc.addField(METACATPIDFIELD, pid);
375
                Iterator<SolrElementField> iterator = list.iterator();
376
                while (iterator.hasNext()) {
377
                    SolrElementField field = iterator.next();
378
                    if(field != null) {
379
                        String value = field.getValue();
380
                        String name = field.getName();
381
                        //System.out.println("add name/value pair - "+name+"/"+value);
382
                        solrDoc.addField(name, value);
383
                    }
384
                }
385
            }
386
            if(!solrDoc.isEmpty()) {
387
                /*IndexEvent event = new IndexEvent();
388
                event.setDate(Calendar.getInstance().getTime());
389
                Identifier pid = new Identifier();
390
                pid.setValue(doc.getIdentifier());
391
                event.setIdentifier(pid);*/
392
                try {
393
                    UpdateResponse response = solrServer.add(solrDoc);
394
                    solrServer.commit();
395
                    /*event.setType(IndexEvent.SUCCESSINSERT);
396
                    event.setDescription("Successfully insert the solr index for the id "+pid.getValue());
397
                    try {
398
                        EventlogFactory.createIndexEventLog().write(event);
399
                    } catch (Exception e) {
400
                        log.error("SolrIndex.insertToIndex - IndexEventLog can't log the index inserting event :"+e.getMessage());
401
                    }*/
402
                } catch (SolrServerException e) {
403
                    /*event.setAction(Event.CREATE);
404
                    event.setDescription("Failed to insert the solr index for the id "+pid.getValue()+" since "+e.getMessage());
405
                    try {
406
                        EventlogFactory.createIndexEventLog().write(event);
407
                    } catch (Exception ee) {
408
                        log.error("SolrIndex.insertToIndex - IndexEventLog can't log the index inserting event :"+ee.getMessage());
409
                    }*/
410
                    throw e;
411
                } catch (IOException e) {
412
                    /*event.setAction(Event.CREATE);
413
                    event.setDescription("Failed to insert the solr index for the id "+pid.getValue()+" since "+e.getMessage());
414
                    try {
415
                        EventlogFactory.createIndexEventLog().write(event);
416
                    } catch (Exception ee) {
417
                        log.error("SolrIndex.insertToIndex - IndexEventLog can't log the index inserting event :"+ee.getMessage());
418
                    }*/
419
                    throw e;
420
                    
421
                }
422
                //System.out.println("=================the response is:\n"+response.toString());
423
            }
424
        }
425
    }
426
    
427
    /**
428
     * Update the solr index. This method handles the three scenarios:
429
     * 1. Update an existing doc - if the the system metadata shows the value of the archive is false and it has an obsoletes,
430
     *    remove the index for the previous version(s) and generate new index for the doc.
431
     * 2. Add a new doc - if the system metadata shows the value of the archive is false and it hasn't an obsoletes, generate the
432
     *    index for the doc.
433
     * @param pid  the id of the document
434
     * @param obsoleteIds  the chain of the obsoletes by this id
435
     * @param systemMetadata  the system metadata associated with the data object
436
     * @param data  the data object itself
437
     * @throws SolrServerException 
438
     * @throws JiBXException 
439
     * @throws EncoderException 
440
     * @throws UnsupportedType 
441
     * @throws NotFound 
442
     * @throws NotImplemented 
443
     * @throws ServiceFailure 
444
     * @throws OREParserException 
445
     */
446
    public void update(String pid, SystemMetadata systemMetadata, InputStream data) 
447
                    throws IOException, SAXException, ParserConfigurationException,
448
                    XPathExpressionException, SolrServerException, JiBXException, EncoderException, NotImplemented, NotFound, UnsupportedType, ServiceFailure, OREParserException {
449
        checkParams(pid, systemMetadata, data);
450
        //generate index for either add or update.
451
        insert(pid, systemMetadata, data);
452
        log.info("============================= update index for the identifier "+pid);
453
       
454
    }
455
    
456
    
457
    
458
   
459

    
460
    /*
461
     * Is the pid a resource map
462
     */
463
    private boolean isDataPackage(String pid) throws FileNotFoundException, ServiceFailure {
464
        boolean isDataPackage = false;
465
        SystemMetadata sysmeta = DistributedMapsFactory.getSystemMetadata(pid);
466
        if(sysmeta != null) {
467
            isDataPackage = IndexGenerator.isResourceMap(sysmeta.getFormatId());
468
        }
469
        return isDataPackage;
470
    }
471

    
472
    private boolean isPartOfDataPackage(String pid) throws XPathExpressionException, NotImplemented, NotFound, UnsupportedType, SolrServerException, IOException, ParserConfigurationException, SAXException {
473
        SolrDoc dataPackageIndexDoc = ResourceMapSubprocessor.getSolrDoc(pid);
474
        if (dataPackageIndexDoc != null) {
475
            String resourceMapId = dataPackageIndexDoc
476
                    .getFirstFieldValue(SolrElementField.FIELD_RESOURCEMAP);
477
            return StringUtils.isNotEmpty(resourceMapId);
478
        } else {
479
            return false;
480
        }
481
    }
482

    
483
    /**
484
     * Get the solrServer
485
     * @return
486
     */
487
    public SolrServer getSolrServer() {
488
        return solrServer;
489
    }
490

    
491
    /**
492
     * Set the solrServer. 
493
     * @param solrServer
494
     */
495
    public void setSolrServer(SolrServer solrServer) {
496
        this.solrServer = solrServer;
497
    }
498
    
499
    /**
500
     * Get all indexed ids in the solr server. 
501
     * @return an empty list if there is no index.
502
     * @throws SolrServerException
503
     */
504
    public List<String> getSolrIds() throws SolrServerException {
505
        List<String> list = new ArrayList<String>();
506
        SolrQuery query = new SolrQuery(IDQUERY); 
507
        query.setRows(Integer.MAX_VALUE); 
508
        query.setFields(ID); 
509
        QueryResponse response = solrServer.query(query); 
510
        SolrDocumentList docs = response.getResults();
511
        if(docs != null) {
512
            for(SolrDocument doc :docs) {
513
                String identifier = (String)doc.getFieldValue(ID);
514
                //System.out.println("======================== "+identifier);
515
                list.add(identifier);
516
            }
517
        }
518
        return list;
519
    }
520
}
(5-5/6)