Project

General

Profile

1
/**
2
 *  '$RCSfile$'
3
 *    Purpose: A class that generates the solr indexes in bulk for the
 *             documents stored in Metacat
5
 *  Copyright: 2000 Regents of the University of California and the
6
 *             National Center for Ecological Analysis and Synthesis
7
 *    Authors: Jivka Bojilova, Matt Jones
8
 *
9
 *   '$Author: leinfelder $'
10
 *     '$Date: 2011-11-02 20:40:12 -0700 (Wed, 02 Nov 2011) $'
11
 * '$Revision: 6595 $'
12
 *
13
 * This program is free software; you can redistribute it and/or modify
14
 * it under the terms of the GNU General Public License as published by
15
 * the Free Software Foundation; either version 2 of the License, or
16
 * (at your option) any later version.
17
 *
18
 * This program is distributed in the hope that it will be useful,
19
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
 * GNU General Public License for more details.
22
 *
23
 * You should have received a copy of the GNU General Public License
24
 * along with this program; if not, write to the Free Software
25
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
26
 */
27
package edu.ucsb.nceas.metacat.index;
28

    
29
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TimerTask;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.client.solrj.SolrServerException;
import org.dataone.client.MNode;
import org.dataone.configuration.Settings;
import org.dataone.service.exceptions.InvalidRequest;
import org.dataone.service.exceptions.InvalidToken;
import org.dataone.service.exceptions.NotAuthorized;
import org.dataone.service.exceptions.NotImplemented;
import org.dataone.service.exceptions.ServiceFailure;
import org.dataone.service.types.v1.Identifier;
import org.dataone.service.types.v1.ObjectFormatIdentifier;
import org.dataone.service.types.v1.ObjectInfo;
import org.dataone.service.types.v1.ObjectList;
import org.dataone.service.types.v1.SystemMetadata;

import com.hazelcast.core.IMap;
import com.hazelcast.core.ISet;

import edu.ucsb.nceas.metacat.index.event.EventlogFactory;
import edu.ucsb.nceas.metacat.index.event.IndexEvent;
60

    
61

    
62
/**
 * A class that generates solr indexes in bulk.
 * This can happen during an upgrade of Metacat (generating the index for all existing documents)
 * or when regenerating the index for those documents
 * that failed to be indexed during an insert or update.
 * 
 * @author tao
 *
 */
71
public class IndexGenerator extends TimerTask {
72
    
73
    // Positions inside the array returned by getMetacatIds(): FIRST holds the ids
    // which are not resource maps, SECOND holds the ids of the resource maps.
    private static final int FIRST =0;
    private static final int SECOND =1;
    // Default values for waitingTime and maxAttempts; the constructor falls back to
    // them when the properties can't be read.
    // NOTE(review): WAITTIME is presumably in milliseconds - confirm.
    public static final int WAITTIME = 10000;
    public static final int MAXWAITNUMBER = 180;
    // NOTE(review): HTTP and MNAPPENDIX are not referenced by any active code in this class.
    private static final String HTTP = "http://";
    private static final String MNAPPENDIX = "/d1/mn";
    // Name of the property which lists the resource map namespaces (read in the constructor).
    private static final String RESOURCEMAPPROPERYNAME = "index.resourcemap.namespace";
    // Names of the properties which override WAITTIME and MAXWAITNUMBER.
    public static final String WAITIMEPOPERTYNAME = "index.regenerate.start.waitingtime";
    public static final String MAXATTEMPTSPROPERTYNAME = "index.regenerate.start.maxattempts";
    
    // Static, but (re)assigned every time a new IndexGenerator is constructed.
    private static int waitingTime = WAITTIME;
    private static int maxAttempts = MAXWAITNUMBER;
    
    // The solr index which receives the generated documents.
    private SolrIndex solrIndex = null;
    //private SystemMetadataEventListener systemMetadataListener = null;
    // Distributed maps; they stay null until initSystemMetadataMap() and
    // initObjectPathMap() have been called.
    private IMap<Identifier, SystemMetadata> systemMetadataMap;
    private IMap<Identifier, String> objectPathMap;
    private Log log = LogFactory.getLog(IndexGenerator.class);
    //private MNode mNode = null;
    // Object format ids treated as resource maps; static, loaded by the constructor
    // and read by the static method isResourceMap().
    private static List<String> resourceMapNamespaces = null;
93
    
94
    /**
95
     * Constructor
96
     * @param solrIndex
97
     * @param systemMetadataListener
98
     */
99
    public IndexGenerator(SolrIndex solrIndex) {
100
        this.solrIndex = solrIndex;
101
        resourceMapNamespaces = Settings.getConfiguration().getList(RESOURCEMAPPROPERYNAME);
102
        //this.systemMetadataListener = systemMetadataListener;
103
        //this.mNode = new MNode(buildMNBaseURL());
104
        try {
105
            waitingTime = Settings.getConfiguration().getInt(WAITIMEPOPERTYNAME);
106
            maxAttempts = Settings.getConfiguration().getInt(MAXATTEMPTSPROPERTYNAME);
107
        } catch (Exception e) {
108
            log.warn("IndexGenerator.constructor - couldn't read the waiting time or maxattempts from the metacat.properties file since : "+e.getMessage()+". Default values will be used");
109
            waitingTime = WAITTIME;
110
            maxAttempts = MAXWAITNUMBER;
111
        }
112
    }
113
    
114
    /**
115
     * Build the index for all documents in Metacat without overwriting.
116
     * @throws SolrServerException 
117
     * @throws ServiceFailure 
118
     * @throws NotImplemented 
119
     * @throws NotAuthorized 
120
     * @throws InvalidToken 
121
     * @throws InvalidRequest 
122
     */
123
    public void indexAll() throws InvalidRequest, InvalidToken, NotAuthorized, 
124
                            NotImplemented, ServiceFailure, SolrServerException, FileNotFoundException {
125
        boolean force = false;
126
        indexAll(force);
127
    }
128
    
129
    /**
130
     * Build the index for all documents. If force is true, the existed index for documents
131
     * will be overwritten. 
132
     * @param force
133
     * @throws SolrServerException 
134
     * @throws ServiceFailure 
135
     * @throws NotImplemented 
136
     * @throws NotAuthorized 
137
     * @throws InvalidToken 
138
     * @throws InvalidRequest 
139
     */
140
    public void indexAll(boolean force) throws InvalidRequest, InvalidToken,
141
                NotAuthorized, NotImplemented, ServiceFailure, SolrServerException, FileNotFoundException {
142
        Date since = null;
143
        Date until = null;
144
        index(since, until, force);
145
    }
146
    
147
    /**
148
     * Build the index for the docs which have been modified since the specified date.
149
     * @param since
150
     * @param force 
151
     * @throws SolrServerException 
152
     * @throws ServiceFailure 
153
     * @throws NotImplemented 
154
     * @throws NotAuthorized 
155
     * @throws InvalidToken 
156
     * @throws InvalidRequest 
157
     */
158
    public void index(Date since, boolean force) throws InvalidRequest, InvalidToken, 
159
                    NotAuthorized, NotImplemented, ServiceFailure, SolrServerException, FileNotFoundException {
160
        Date until = null;
161
        index(since, until, force);
162
    }
163
    
164
    /**
165
     *  Build the index for the docs which have been modified between the specified date.s
166
     * @param since
167
     * @param until
168
     * @param force
169
     * @throws SolrServerException 
170
     * @throws ServiceFailure 
171
     * @throws NotImplemented 
172
     * @throws NotAuthorized 
173
     * @throws InvalidToken 
174
     * @throws InvalidRequest 
175
     * @throws FileNotFoundException 
176
     */
177
    public void index(Date since, Date until, boolean force) throws SolrServerException, InvalidRequest, 
178
                                                InvalidToken, NotAuthorized, NotImplemented, ServiceFailure, FileNotFoundException {
179
        List<String> solrIds = null;
180
        initSystemMetadataMap();
181
        initObjectPathMap();
182
        List[] allMetacatIds = getMetacatIds(since, until);
183
        List<String> otherMetacatIds = allMetacatIds[FIRST];
184
        List<String> resourceMapIds = allMetacatIds[SECOND];
185
        log.info("the metacat ids (exception resource map -----------------------------"+otherMetacatIds);
186
        log.info("the metacat resroucemap ids -----------------------------"+resourceMapIds);
187
        if(!force) {
188
            solrIds = getSolrDocIds();
189
        }
190
        log.info("the solr ids -----------------------------"+solrIds);
191
        index(otherMetacatIds, solrIds, force);
192
        index(resourceMapIds, solrIds, force);
193
       
194
        
195
    }
196
    
197
    /*
198
     * Doing index when it is necessary when compare metacatids and solrids.
199
     */
200
    private void index(List<String> metacatIds, List<String>solrIds, boolean force) {
201
        if(metacatIds != null) {
202
            for(String metacatId : metacatIds) {
203
                if(metacatId != null) {
204
                    boolean buildIndex = true;
205
                    if(!force && solrIds != null && solrIds.contains(metacatId)) {
206
                        //solr already indexs the id and we don't force it to rebuild it, so set the buildIndex to be false
207
                        buildIndex = false;
208
                    }
209
                    if(buildIndex) {
210
                        try {
211
                            generateIndex(metacatId);
212
                        } catch (Exception e) {
213
                            log.error("IndexGenerator.index - Metacat Index couldn't generate the index for the id - "+metacatId+" because "+e.getMessage());
214
                        }
215
                        
216
                    }
217
                }
218
            }
219
        }
220
    }
221
    
222
    public void run() {
223
        IndexEvent event = new IndexEvent();
224
        event.setDate(Calendar.getInstance().getTime());
225
        event.setType(IndexEvent.STARTTIMEDINDEX);
226
        event.setDescription("Start the timed index job");
227
        try {
228
            EventlogFactory.createIndexEventLog().write(event);
229
        } catch (Exception e) {
230
            log.error("IndexGenerator.run - IndexEventLog can't log the timed indexing start event :"+e.getMessage());
231
        }
232
        try {
233
            
234
            indexAll();
235
        } catch (InvalidRequest e) {
236
            // TODO Auto-generated catch block
237
            //e.printStackTrace();
238
            log.error("IndexGenerator.run - Metadata-Index couldn't generate indexes for those documents which haven't been indexed : "+e.getMessage());
239
        } catch (InvalidToken e) {
240
            // TODO Auto-generated catch block
241
            //e.printStackTrace();
242
            log.error("IndexGenerator.run - Metadata-Index couldn't generate indexes for those documents which haven't been indexed : "+e.getMessage());
243
        } catch (NotAuthorized e) {
244
            // TODO Auto-generated catch block
245
            //e.printStackTrace();
246
        } catch (NotImplemented e) {
247
            // TODO Auto-generated catch block
248
            //e.printStackTrace();
249
            log.error("IndexGenerator.run - Metadata-Index couldn't generate indexes for those documents which haven't been indexed : "+e.getMessage());
250
        } catch (ServiceFailure e) {
251
            // TODO Auto-generated catch block
252
            //e.printStackTrace();
253
            log.error("IndexGenerator.run - Metadata-Index couldn't generate indexes for those documents which haven't been indexed : "+e.getMessage());
254
        } catch (SolrServerException e) {
255
            // TODO Auto-generated catch block
256
            //e.printStackTrace();
257
            log.error("IndexGenerator.run - Metadata-Index couldn't generate indexes for those documents which haven't been indexed : "+e.getMessage());
258
        } catch (FileNotFoundException e) {
259
            log.error("IndexGenerator.run - Metadata-Index couldn't generate indexes for those documents which haven't been indexed : "+e.getMessage());
260
        }
261
        event.setDate(Calendar.getInstance().getTime());
262
        event.setType(IndexEvent.FINISHTIMEDINDEX);
263
        event.setDescription("Finish the timed index job");
264
        try {
265
            EventlogFactory.createIndexEventLog().write(event);
266
        } catch (Exception e) {
267
            log.error("IndexGenerator.run - IndexEventLog can't log the timed indexing finish event :"+e.getMessage());
268
        }
269
    }
270
    
271
    /*
272
     * Get the indexed ids list from the solr server.
273
     * An empty list will be returned if there is no ids.
274
     */
275
    private List<String> getSolrDocIds() throws SolrServerException {
276
        List<String> ids = solrIndex.getSolrIds();
277
        return ids;
278
    }
279
    
280
    /*
281
     * Get an array of the list of ids of the metacat. If since and util are null, it will return all of them.
282
     * The first element of the list is the ids except the resource map. The second elements of the list is the ids of the resource map.
283
     * The reason to split them is when we index the resource map, we need the index of the document in the resource map ready.
284
     */
285
    private List[] getMetacatIds(Date since, Date until) throws InvalidRequest, 
286
                        InvalidToken, NotAuthorized, NotImplemented, ServiceFailure, FileNotFoundException {
287
        
288
        List<String> resourceMapIds = new ArrayList();
289
        List<String> otherIds = new ArrayList();
290
        List[] ids = new List[2];
291
        ids[FIRST]= otherIds;
292
        ids[SECOND] = resourceMapIds;
293
        ISet<Identifier> metacatIds = DistributedMapsFactory.getIdentifiersSet();
294
        if(metacatIds != null) {
295
            for(Identifier identifier : metacatIds) {
296
                if(identifier != null && identifier.getValue() != null && !identifier.getValue().equals("")) {
297
                    SystemMetadata sysmeta = getSystemMetadata(identifier.getValue());
298
                    if(sysmeta != null && !sysmeta.getArchived()) {
299
                        ObjectFormatIdentifier formatId =sysmeta.getFormatId();
300
                        //System.out.println("the object format id is "+formatId.getValue());
301
                        //System.out.println("the ============ resourcMapNamespaces"+resourceMapNamespaces);
302
                        boolean correctTimeRange = false;
303
                        Date sysDate = sysmeta.getDateSysMetadataModified();
304
                        if(since == null && until == null) {
305
                            correctTimeRange = true;
306
                        } else if (since != null && until == null) {
307
                            if(sysDate.getTime() >= since.getTime()) {
308
                                correctTimeRange = true;
309
                            }
310
                        } else if (since == null && until != null) {
311
                            if(sysDate.getTime() <= until.getTime()) {
312
                                correctTimeRange = true;
313
                            }
314
                        } else if (since != null && until != null) {
315
                            if(sysDate.getTime() >= since.getTime() && sysDate.getTime() <= until.getTime()) {
316
                                correctTimeRange = true;
317
                            }
318
                        }
319
                        if(correctTimeRange && formatId != null && formatId.getValue() != null && resourceMapNamespaces != null && isResourceMap(formatId)) {
320
                            resourceMapIds.add(identifier.getValue());
321
                        } else {
322
                            otherIds.add(identifier.getValue());
323
                        }
324
                    }
325
                }
326
            }
327
        }
328
        /*if(objects != null) {
329
            List<ObjectInfo> objectInfoList = objects.getObjectInfoList();
330
            if(objectInfoList != null) {
331
                for(ObjectInfo info : objectInfoList) {
332
                    if(info != null) {
333
                        Identifier identifier = info.getIdentifier();
334
                        if(identifier != null && identifier.getValue() != null && !identifier.getValue().equals("")) {
335
                            SystemMetadata sysmeta = getSystemMetadata(identifier.getValue());
336
                            if(sysmeta != null && !sysmeta.getArchived()) {
337
                                ObjectFormatIdentifier formatId =sysmeta.getFormatId();
338
                                //System.out.println("the object format id is "+formatId.getValue());
339
                                //System.out.println("the ============ resourcMapNamespaces"+resourceMapNamespaces);
340
                                if(formatId != null && formatId.getValue() != null && resourceMapNamespaces != null && isResourceMap(formatId)) {
341
                                    resourceMapIds.add(identifier.getValue());
342
                                } else {
343
                                    otherIds.add(identifier.getValue());
344
                                }
345
                            }
346
                            //ids.add(identifier.getValue());
347
                        }
348
                    }
349
                }
350
            }
351
        }*/
352
        return ids;
353
    }
354
    
355
    /*
356
     * If the specified ObjectFormatIdentifier is a resrouce map namespace.
357
     */
358
    public static boolean isResourceMap(ObjectFormatIdentifier formatId) {
359
        boolean isResourceMap = false;
360
        if(formatId != null && resourceMapNamespaces != null) {
361
            for(String namespace : resourceMapNamespaces) {
362
                if(namespace != null && formatId.getValue() != null && !formatId.getValue().trim().equals("") && formatId.getValue().equals(namespace)) {
363
                    isResourceMap = true;
364
                    break;
365
                }
366
            }
367
        }
368
        return isResourceMap;
369
    }
370
    
371
   
372
    
373
    /*
374
     * Generate index for the id.
375
     */
376
    private void generateIndex(String id) throws Exception {
377
        if(id != null)  {
378
                SystemMetadata sysmeta = getSystemMetadata(id);
379
                //only update none-archived id.
380
                if(sysmeta != null && !sysmeta.getArchived()) {
381
                        InputStream data = getDataObject(id);
382
                        Identifier obsolete = sysmeta.getObsoletes();
383
                        List<String> obsoleteChain = null;
384
                        if(obsolete != null) {
385
                            obsoleteChain = getObsoletes(id);
386
                        } 
387
                        solrIndex.update(id, obsoleteChain, sysmeta, data);
388
                } else {
389
                    throw new Exception("IndexGenerator.generate - there is no found SystemMetadata associated with the id "+id);
390
                }
391
           
392
        }
393
    }
394
    
395
    /*
396
     * Initialize the system metadata map
397
     */
398
    private void initSystemMetadataMap() throws FileNotFoundException, ServiceFailure{
399
        int times = 0;
400
        if(systemMetadataMap == null) {
401
            systemMetadataMap = DistributedMapsFactory.getSystemMetadataMap();
402
            /*while(true) {
403
                try {
404
                    systemMetadataMap = DistributedMapsFactory.getSystemMetadataMap();
405
                    break;
406
                } catch (FileNotFoundException e) {
407
                    throw e;
408
                } catch (ServiceFailure e) {
409
                    if(times <= maxAttempts) {
410
                        log.warn("IndexGenerator.initSystemMetadataMap - the hazelcast service is not ready : "
411
                                         +e.getMessage()+"\nWe will try to access it "+waitingTime/1000+" seconds later ");
412
                        try {
413
                            Thread.sleep(waitingTime);
414
                        } catch (Exception ee) {
415
                            log.warn("IndexGenerator.initSystemMetadataMap - the thread can't sleep for "+waitingTime/1000+" seconds to wait the hazelcast service");
416
                        }
417
                       
418
                    } else {
419
                        throw new ServiceFailure("0000", "IndexGenerator.initSystemMetadataMap - the hazelcast service is not ready even though Metacat-index wailted for "+maxAttempts*waitingTime/1000+" seconds. We can't get the system metadata from it and the building index can't happen this time");
420
                    }
421
                }
422
                times++;
423
            }*/
424
        }
425
    }
426
    
427
    /*
428
     * We should call this method after calling initSystemMetadataMap since this method doesn't have the mechanism to wait the readiness of the hazelcast service
429
     */
430
    private void initObjectPathMap() throws FileNotFoundException, ServiceFailure {
431
        if(objectPathMap == null) {
432
            objectPathMap = DistributedMapsFactory.getObjectPathMap();
433
        }
434
    }
435
    /**
436
     * Get an InputStream as the data object for the specific pid.
437
     * @param pid
438
     * @return
439
     * @throws FileNotFoundException
440
     */
441
    private InputStream getDataObject(String pid) throws FileNotFoundException {
442
        Identifier identifier = new Identifier();
443
        identifier.setValue(pid);
444
        String objectPath = objectPathMap.get(identifier);
445
        InputStream data = null;
446
        data = new FileInputStream(objectPath);
447
        return data;
448

    
449
    }
450
    
451
    /**
452
     * Get the SystemMetadata for the specified id from the distributed Map.
453
     * The null maybe is returned if there is no system metadata found.
454
     * @param id  the specified id.
455
     * @return the SystemMetadata associated with the id.
456
     */
457
    private SystemMetadata getSystemMetadata(String id) {
458
        SystemMetadata metadata = null;
459
        if(systemMetadataMap != null && id != null) {
460
            Identifier identifier = new Identifier();
461
            identifier.setValue(id);
462
            metadata = systemMetadataMap.get(identifier);
463
        }
464
        return metadata;
465
    }
466
    
467
    /**
468
     * Get the obsoletes chain of the specified id. The returned list doesn't include
469
     * the specified id itself. The newer version has the lower index number in the list.
470
     * Empty list will be returned if there is no document to be obsoleted by this id.
471
     * @param id
472
     * @return
473
     */
474
    private List<String> getObsoletes(String id) {
475
        List<String> obsoletes = new ArrayList<String>();
476
        while (id != null) {
477
            SystemMetadata metadata = getSystemMetadata(id);
478
            id = null;//set it to be null in order to stop the while loop if the id can't be assinged to a new value in the following code.
479
            if(metadata != null) {
480
                Identifier identifier = metadata.getObsoletes();
481
                if(identifier != null && identifier.getValue() != null && !identifier.getValue().trim().equals("")) {
482
                    obsoletes.add(identifier.getValue());
483
                    id = identifier.getValue();
484
                } 
485
            } 
486
        }
487
        return obsoletes;
488
    }
489
    
490
    /**
491
     * Overwrite and do nothing
492
     */
493
    public boolean cancel() {
494
        return true;
495
    }
496

    
497
}
(3-3/6)