Project

General

Profile

1
/**
2
 *  '$RCSfile$'
3
 *    Purpose: A class that inserts, updates and removes the index
4
 *             entries of documents on a SOLR server
5
 *  Copyright: 2000 Regents of the University of California and the
6
 *             National Center for Ecological Analysis and Synthesis
7
 *    Authors: Jivka Bojilova, Matt Jones
8
 *
9
 *   '$Author: leinfelder $'
10
 *     '$Date: 2011-11-02 20:40:12 -0700 (Wed, 02 Nov 2011) $'
11
 * '$Revision: 6595 $'
12
 *
13
 * This program is free software; you can redistribute it and/or modify
14
 * it under the terms of the GNU General Public License as published by
15
 * the Free Software Foundation; either version 2 of the License, or
16
 * (at your option) any later version.
17
 *
18
 * This program is distributed in the hope that it will be useful,
19
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
 * GNU General Public License for more details.
22
 *
23
 * You should have received a copy of the GNU General Public License
24
 * along with this program; if not, write to the Free Software
25
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
26
 */
27
package edu.ucsb.nceas.metacat.index;
28

    
29
import java.io.ByteArrayInputStream;
30
import java.io.IOException;
31
import java.io.InputStream;
32
import java.util.ArrayList;
33
import java.util.HashMap;
34
import java.util.Iterator;
35
import java.util.List;
36
import java.util.Map;
37
import java.util.Set;
38

    
39
import javax.xml.parsers.DocumentBuilder;
40
import javax.xml.parsers.DocumentBuilderFactory;
41
import javax.xml.parsers.ParserConfigurationException;
42
import javax.xml.xpath.XPath;
43
import javax.xml.xpath.XPathExpressionException;
44
import javax.xml.xpath.XPathFactory;
45

    
46
import org.apache.commons.codec.EncoderException;
47
import org.apache.commons.io.output.ByteArrayOutputStream;
48
import org.apache.commons.logging.Log;
49
import org.apache.commons.logging.LogFactory;
50
import org.apache.solr.client.solrj.SolrQuery;
51
import org.apache.solr.client.solrj.SolrServer;
52
import org.apache.solr.client.solrj.SolrServerException;
53
import org.apache.solr.client.solrj.response.QueryResponse;
54
import org.apache.solr.client.solrj.response.UpdateResponse;
55
import org.apache.solr.common.SolrDocument;
56
import org.apache.solr.common.SolrDocumentList;
57
import org.apache.solr.common.SolrInputDocument;
58
import org.apache.solr.common.util.NamedList;
59
import org.dataone.cn.indexer.XMLNamespaceConfig;
60
import org.dataone.cn.indexer.parser.IDocumentSubprocessor;
61
import org.dataone.cn.indexer.parser.SolrField;
62
import org.dataone.cn.indexer.solrhttp.SolrDoc;
63
import org.dataone.cn.indexer.solrhttp.SolrElementField;
64
import org.dataone.service.exceptions.NotFound;
65
import org.dataone.service.exceptions.NotImplemented;
66
import org.dataone.service.exceptions.UnsupportedType;
67
import org.dataone.service.types.v1.Identifier;
68
import org.dataone.service.types.v1.SystemMetadata;
69
import org.dataone.service.util.TypeMarshaller;
70
import org.jibx.runtime.JiBXException;
71
import org.w3c.dom.Document;
72
import org.w3c.dom.NameList;
73
import org.xml.sax.SAXException;
74

    
75
import edu.ucsb.nceas.metacat.index.resourcemap.ResourceMapSubprocessor;
76

    
77
/**
78
 * A class that inserts, updates and removes the index entries of documents on a SOLR server.
79
 * @author tao
80
 *
81
 */
82
public class SolrIndex {
83
            
84
    public static final String ID = "id";
85
    private static final String IDQUERY = ID+":*";
86
    private List<IDocumentSubprocessor> subprocessors = null;
87
    private SolrServer solrServer = null;
88
    private XMLNamespaceConfig xmlNamespaceConfig = null;
89
    private List<SolrField> sysmetaSolrFields = null;
90

    
91
    private static DocumentBuilderFactory documentBuilderFactory = null;
92
    private static DocumentBuilder builder = null;
93

    
94
    private static XPathFactory xpathFactory = null;
95
    private static XPath xpath = null;
96
    Log log = LogFactory.getLog(SolrIndex.class);
97
    
98
    static {
99
        documentBuilderFactory = DocumentBuilderFactory.newInstance();
100
        documentBuilderFactory.setNamespaceAware(true);
101
        try {
102
            builder = documentBuilderFactory.newDocumentBuilder();
103
        } catch (ParserConfigurationException e) {
104
            e.printStackTrace();
105
        }
106
        xpathFactory = XPathFactory.newInstance();
107
        xpath = xpathFactory.newXPath();
108
    }
109
    
110
    /**
     * Creates an index object from the given field definitions and namespace configuration,
     * then runs the initialization that installs the namespace context on the shared XPath
     * and initializes the expression of every system metadata field.
     * @param sysmetaSolrFields  the field definitions applied to the system metadata
     * @param xmlNamespaceConfig  the namespace context used for the XPath expressions
     * @throws XPathExpressionException  if the initialization of a field expression fails
     * @throws ParserConfigurationException  declared by the initialization routine
     * @throws IOException  part of the declared signature
     * @throws SAXException  part of the declared signature
     */
    public SolrIndex(List<SolrField> sysmetaSolrFields, XMLNamespaceConfig xmlNamespaceConfig)
                    throws XPathExpressionException, ParserConfigurationException, IOException, SAXException {
        this.sysmetaSolrFields = sysmetaSolrFields;
        this.xmlNamespaceConfig = xmlNamespaceConfig;
        // both fields must be in place before the initialization reads them
        init();
    }
121
    
122
    private void init() throws ParserConfigurationException, XPathExpressionException {
123
        xpath.setNamespaceContext(xmlNamespaceConfig);
124
        initExpressions();
125
    }
126

    
127
    private void initExpressions() throws XPathExpressionException {
128
        for (SolrField field : sysmetaSolrFields) {
129
            field.initExpression(xpath);
130
        }
131

    
132
    }
133
    
134
    
135
    /**
136
     * Get the list of the Subprocessors in this index.
137
     * @return the list of the Subprocessors.
138
     */
139
    public List<IDocumentSubprocessor> getSubprocessors() {
140
        return subprocessors;
141
    }
142

    
143
    /**
144
     * Set the list of Subprocessors.
145
     * @param subprocessorList  the list will be set.
146
     */
147
    public void setSubprocessors(List<IDocumentSubprocessor> subprocessorList) {
148
        for (IDocumentSubprocessor subprocessor : subprocessorList) {
149
            subprocessor.initExpression(xpath);
150
        }
151
        this.subprocessors = subprocessorList;
152
    }
153
    
154
    /**
155
     * Generate the index for the given information
156
     * @param id
157
     * @param systemMetadata
158
     * @param dataStream
159
     * @return
160
     * @throws IOException
161
     * @throws SAXException
162
     * @throws ParserConfigurationException
163
     * @throws XPathExpressionException
164
     * @throws JiBXException 
165
     * @throws SolrServerException 
166
     * @throws EncoderException
167
     * @throws UnsupportedType 
168
     * @throws NotFound 
169
     * @throws NotImplemented 
170
     */
171
    private Map<String, SolrDoc> process(String id, SystemMetadata systemMetadata, InputStream dataStream)
172
                    throws IOException, SAXException, ParserConfigurationException,
173
                    XPathExpressionException, JiBXException, EncoderException, SolrServerException, NotImplemented, NotFound, UnsupportedType{
174

    
175
        // Load the System Metadata document
176
        ByteArrayOutputStream systemMetadataOutputStream = new ByteArrayOutputStream();
177
        TypeMarshaller.marshalTypeToOutputStream(systemMetadata, systemMetadataOutputStream);
178
        ByteArrayInputStream systemMetadataStream = new ByteArrayInputStream(systemMetadataOutputStream.toByteArray());
179
        Document sysMetaDoc = generateXmlDocument(systemMetadataStream);
180
        if (sysMetaDoc == null) {
181
            log.error("Could not load System metadata for ID: " + id);
182
            return null;
183
        }
184

    
185
        // Extract the field values from the System Metadata
186
        List<SolrElementField> sysSolrFields = processSysmetaFields(sysMetaDoc, id);
187
        SolrDoc indexDocument = new SolrDoc(sysSolrFields);
188
        Map<String, SolrDoc> docs = new HashMap<String, SolrDoc>();
189
        docs.put(id, indexDocument);
190

    
191
        // Determine if subprocessors are available for this ID
192
        if (subprocessors != null) {
193
                    // for each subprocessor loaded from the spring config
194
                    for (IDocumentSubprocessor subprocessor : subprocessors) {
195
                        // Does this subprocessor apply?
196
                        if (subprocessor.canProcess(sysMetaDoc)) {
197
                            // if so, then extract the additional information from the
198
                            // document.
199
                            try {
200
                                // docObject = the resource map document or science
201
                                // metadata document.
202
                                // note that resource map processing touches all objects
203
                                // referenced by the resource map.
204
                                Document docObject = generateXmlDocument(dataStream);
205
                                if (docObject == null) {
206
                                    log.error("Could not load OBJECT for ID " + id );
207
                                } else {
208
                                    docs = subprocessor.processDocument(id, docs, docObject);
209
                                }
210
                            } catch (Exception e) {
211
                                log.error(e.getStackTrace().toString());
212
                            }
213
                        }
214
                    }
215
       }
216

    
217
       // TODO: in the XPathDocumentParser class in d1_cn_index_process module,
218
       // merge is only for resource map. We need more work here.
219
       for (SolrDoc mergeDoc : docs.values()) {
220
           if (!mergeDoc.isMerged()) {
221
                 mergeWithIndexedDocument(mergeDoc);
222
           }
223
       }
224

    
225
       //SolrElementAdd addCommand = getAddCommand(new ArrayList<SolrDoc>(docs.values()));
226
               
227
       return docs;
228
    }
229
    
230
    /**
231
     * Merge updates with existing solr documents
232
     * 
233
     * This method appears to re-set the data package field data into the
234
     * document about to be updated in the solr index. Since packaging
235
     * information is derived from the package document (resource map), this
236
     * information is not present when processing a document contained in a data
237
     * package. This method replaces those values from the existing solr index
238
     * record for the document being processed. -- sroseboo, 1-18-12
239
     * 
240
     * @param indexDocument
241
     * @return
242
     * @throws IOException
243
     * @throws EncoderException
244
     * @throws XPathExpressionException
245
     * @throws SAXException 
246
     * @throws ParserConfigurationException 
247
     * @throws SolrServerException 
248
     * @throws UnsupportedType 
249
     * @throws NotFound 
250
     * @throws NotImplemented 
251
     */
252
    // TODO:combine merge function with resourcemap merge function
253

    
254
    private SolrDoc mergeWithIndexedDocument(SolrDoc indexDocument) throws IOException,
255
            EncoderException, XPathExpressionException, SolrServerException, ParserConfigurationException, SAXException, NotImplemented, NotFound, UnsupportedType {
256
        List<String> ids = new ArrayList<String>();
257
        ids.add(indexDocument.getIdentifier());
258
        List<SolrDoc> indexedDocuments = ResourceMapSubprocessor.getSolrDocs(ids);
259
        SolrDoc indexedDocument = indexedDocuments == null || indexedDocuments.size() <= 0 ? null
260
                : indexedDocuments.get(0);
261
        if (indexedDocument == null || indexedDocument.getFieldList().size() <= 0) {
262
            return indexDocument;
263
        } else {
264
            for (SolrElementField field : indexedDocument.getFieldList()) {
265
                if ((field.getName().equals(SolrElementField.FIELD_ISDOCUMENTEDBY)
266
                        || field.getName().equals(SolrElementField.FIELD_DOCUMENTS) || field
267
                        .getName().equals(SolrElementField.FIELD_RESOURCEMAP))
268
                        && !indexDocument.hasFieldWithValue(field.getName(), field.getValue())) {
269
                    indexDocument.addField(field);
270
                }
271
            }
272

    
273
            indexDocument.setMerged(true);
274
            return indexDocument;
275
        }
276
    }
277
    
278
    /*
279
     * Generate a Document from the InputStream
280
     */
281
    private Document generateXmlDocument(InputStream smdStream) throws SAXException {
282
        Document doc = null;
283

    
284
        try {
285
            doc = builder.parse(smdStream);
286
        } catch (IOException e) {
287
            log.error(e.getMessage(), e);
288
        }
289

    
290
        return doc;
291
    }
292
    
293
    /*
294
     * Index the fields of the system metadata
295
     */
296
    private List<SolrElementField> processSysmetaFields(Document doc, String identifier) {
297

    
298
        List<SolrElementField> fieldList = new ArrayList<SolrElementField>();
299
        // solrFields is the list of fields defined in the application context
300
       
301
        for (SolrField field : sysmetaSolrFields) {
302
            try {
303
                // the field.getFields method can return a single value or
304
                // multiple values for multi-valued fields
305
                // or can return multiple SOLR document fields.
306
                fieldList.addAll(field.getFields(doc, identifier));
307
            } catch (Exception e) {
308
                e.printStackTrace();
309
            }
310
        }
311
        return fieldList;
312

    
313
    }
314
    
315
    /**
316
     * Check the parameters of the insert or update methods.
317
     * @param pid
318
     * @param systemMetadata
319
     * @param data
320
     * @throws SolrServerException
321
     */
322
    private void checkParams(String pid, SystemMetadata systemMetadata, InputStream data) throws SolrServerException {
323
        if(pid == null || pid.trim().equals("")) {
324
            throw new SolrServerException("The identifier of the indexed document should not be null or blank.");
325
        }
326
        if(systemMetadata == null) {
327
            throw new SolrServerException("The system metadata of the indexed document should not be null.");
328
        }
329
        if(data == null) {
330
            throw new SolrServerException("The indexed document itself should not be null.");
331
        }
332
    }
333
    
334
    /**
335
     * Insert the indexes for a document.
336
     * @param pid  the id of this document
337
     * @param systemMetadata  the system metadata associated with the data object
338
     * @param data  the data object itself
339
     * @throws SolrServerException 
340
     * @throws JiBXException 
341
     * @throws EncoderException 
342
     * @throws UnsupportedType 
343
     * @throws NotFound 
344
     * @throws NotImplemented 
345
     */
346
    private synchronized void insert(String pid, SystemMetadata systemMetadata, InputStream data) 
347
                    throws IOException, SAXException, ParserConfigurationException,
348
                    XPathExpressionException, SolrServerException, JiBXException, EncoderException, NotImplemented, NotFound, UnsupportedType {
349
        checkParams(pid, systemMetadata, data);
350
        Map<String, SolrDoc> docs = process(pid, systemMetadata, data);
351
        
352
        //transform the Map to the SolrInputDocument which can be used by the solr server
353
        if(docs != null) {
354
            Set<String> ids = docs.keySet();
355
            for(String id : ids) {
356
                SolrInputDocument solrDoc = new SolrInputDocument();
357
                if(id != null) {
358
                    SolrDoc doc = docs.get(id);
359
                    if(doc != null) {
360
                        List<SolrElementField> list = doc.getFieldList();
361
                        if(list != null) {
362
                            //solrDoc.addField(METACATPIDFIELD, pid);
363
                            Iterator<SolrElementField> iterator = list.iterator();
364
                            while (iterator.hasNext()) {
365
                                SolrElementField field = iterator.next();
366
                                if(field != null) {
367
                                    String value = field.getValue();
368
                                    String name = field.getName();
369
                                    //System.out.println("add name/value pair - "+name+"/"+value);
370
                                    solrDoc.addField(name, value);
371
                                }
372
                            }
373
                        }
374
                    }
375
                }
376
                if(!solrDoc.isEmpty()) {
377
                    UpdateResponse response = solrServer.add(solrDoc);
378
                    solrServer.commit();
379
                    //System.out.println("=================the response is:\n"+response.toString());
380
                }
381
            }
382
        }
383
    }
384
    
385
    /**
386
     * Update the solr index. This method handles the three scenarios:
387
     * 1. Archive (or delete) - if the the system metadata shows the value of the archive is true,
388
     *    remove the index for the document and its previous versions if it has.
389
     * 2. Update an existing doc - if the the system metadata shows the value of the archive is false and it has an obsoletes,
390
     *    remove the index for the previous version(s) and generate new index for the doc.
391
     * 3. Add a new doc - if the system metadata shows the value of the archive is false and it hasn't an obsoletes, generate the
392
     *    index for the doc.
393
     * @param pid  the id of the document
394
     * @param obsoleteIds  the chain of the obsoletes by this id
395
     * @param systemMetadata  the system metadata associated with the data object
396
     * @param data  the data object itself
397
     * @throws SolrServerException 
398
     * @throws JiBXException 
399
     * @throws EncoderException 
400
     * @throws UnsupportedType 
401
     * @throws NotFound 
402
     * @throws NotImplemented 
403
     */
404
    public void update(String pid, List<String> obsoleteIds, SystemMetadata systemMetadata, InputStream data) 
405
                    throws IOException, SAXException, ParserConfigurationException,
406
                    XPathExpressionException, SolrServerException, JiBXException, EncoderException, NotImplemented, NotFound, UnsupportedType {
407
        checkParams(pid, systemMetadata, data);
408
        boolean isArchive = systemMetadata.getArchived();
409
        if(isArchive) {
410
            //archive(delete)
411
            Identifier obsolete = systemMetadata.getObsoletes();
412
            if(obsolete != null) {
413
                removeObsoletesChain(obsolete.getValue(), obsoleteIds);
414
            }
415
            remove(pid);
416
            log.info("============================= archive the idex for the identifier "+pid);
417
        } else {
418
            Identifier obsolete = systemMetadata.getObsoletes();
419
            if(obsolete != null) {
420
                removeObsoletesChain(obsolete.getValue(), obsoleteIds);
421
            }
422
            //generate index for either add or update.
423
            insert(pid, systemMetadata, data);
424
            log.info("============================= insert index for the identifier "+pid);
425
        }
426
    }
427
    
428
    
429
    private void removeObsoletesChain(String obsoleteId, List<String> obsoleteIdChain) throws SolrServerException, IOException {
430
        if(obsoleteId != null && !obsoleteId.trim().equals("")) {
431
            if(obsoleteIdChain == null || obsoleteIdChain.isEmpty()) {
432
                throw new SolrServerException("SolrIndex.removeObsoletesChain - The obsoletes chain can't be null or empty since the system metadata already has the obsoletes element."); 
433
            }
434
            if(!obsoleteIdChain.contains(obsoleteId)) {
435
                throw new SolrServerException("SolrIndex.removeObsoletesChain - The obsoletes elment in the system metadata is not in the obsoleteId chain"); 
436
            }
437
            remove(obsoleteIdChain);
438
        } else {
439
            throw new SolrServerException("SolrIndex.removeObsoletesChain - The obsolete id should be null."); 
440
        }  
441
    }
442
    
443
    /**
444
     * Remove all the indexes associated with the pids in the list.
445
     * @param pidList
446
     * @throws IOException
447
     * @throws SolrServerException
448
     */
449
    private void remove(List<String> pidList) throws IOException, SolrServerException {
450
        if(pidList != null) {
451
            for(String id : pidList) {
452
                remove(id);
453
            }
454
        }
455
    }
456
 
457
    /**
458
     * Remove the indexed associated with specified pid.
459
     * @param pid  the pid which the indexes are associated with
460
     * @throws IOException
461
     * @throws SolrServerException
462
     */
463
    public void remove(String pid) throws IOException, SolrServerException {
464
        if(pid != null && !pid.trim().equals("")) {
465
            solrServer.deleteById(pid);
466
            solrServer.commit();
467
        }
468
       
469
       
470
    }
471

    
472
    /**
473
     * Get the solrServer
474
     * @return
475
     */
476
    public SolrServer getSolrServer() {
477
        return solrServer;
478
    }
479

    
480
    /**
481
     * Set the solrServer. 
482
     * @param solrServer
483
     */
484
    public void setSolrServer(SolrServer solrServer) {
485
        this.solrServer = solrServer;
486
    }
487
    
488
    /**
489
     * Get all indexed ids in the solr server. 
490
     * @return an empty list if there is no index.
491
     * @throws SolrServerException
492
     */
493
    public List<String> getSolrIds() throws SolrServerException {
494
        List<String> list = new ArrayList<String>();
495
        SolrQuery query = new SolrQuery(IDQUERY); 
496
        query.setRows(Integer.MAX_VALUE); 
497
        query.setFields(ID); 
498
        QueryResponse response = solrServer.query(query); 
499
        SolrDocumentList docs = response.getResults();
500
        if(docs != null) {
501
            for(SolrDocument doc :docs) {
502
                String identifier = (String)doc.getFieldValue(ID);
503
                //System.out.println("======================== "+identifier);
504
                list.add(identifier);
505
            }
506
        }
507
        return list;
508
    }
509
}
(5-5/6)