Project

General

Profile

1
/**
2
 *  '$RCSfile$'
3
 *    Purpose: A class that populates a Metacat instance with documents returned from a query
4
 *  Copyright: 2010 Regents of the University of California and the
5
 *             National Center for Ecological Analysis and Synthesis
6
 *    Authors: Michael Daigle
7
 * 
8
 *   '$Author: berkley $'
9
 *     '$Date: 2010-06-08 12:34:30 -0700 (Tue, 08 Jun 2010) $'
10
 * '$Revision: 5374 $'
11
 *
12
 * This program is free software; you can redistribute it and/or modify
13
 * it under the terms of the GNU General Public License as published by
14
 * the Free Software Foundation; either version 2 of the License, or
15
 * (at your option) any later version.
16
 *
17
 * This program is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
 * GNU General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU General Public License
23
 * along with this program; if not, write to the Free Software
24
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25
 */
26
package edu.ucsb.nceas.metacat.util;
27

    
28
import java.io.ByteArrayInputStream;
29
import java.io.InputStream;
30
import java.io.OutputStream;
31
import java.math.BigInteger;
32
import java.net.HttpURLConnection;
33
import java.net.URL;
34
import java.util.ArrayList;
35
import java.util.Calendar;
36
import java.util.Date;
37
import java.util.HashMap;
38
import java.util.List;
39
import java.util.Map;
40
import java.util.Vector;
41

    
42
import javax.activation.DataHandler;
43
import javax.activation.DataSource;
44
import javax.mail.internet.MimeBodyPart;
45
import javax.mail.internet.MimeMultipart;
46

    
47
import org.apache.commons.io.IOUtils;
48
import org.dataone.client.D1Client;
49
import org.dataone.client.MNode;
50
import org.dataone.client.ObjectFormatCache;
51
import org.dataone.client.auth.CertificateManager;
52
import org.dataone.ore.ResourceMapFactory;
53
import org.dataone.service.exceptions.NotFound;
54
import org.dataone.service.types.v1.AccessPolicy;
55
import org.dataone.service.types.v1.AccessRule;
56
import org.dataone.service.types.v1.Checksum;
57
import org.dataone.service.types.v1.Identifier;
58
import org.dataone.service.types.v1.NodeReference;
59
import org.dataone.service.types.v1.ObjectFormat;
60
import org.dataone.service.types.v1.Permission;
61
import org.dataone.service.types.v1.Session;
62
import org.dataone.service.types.v1.Subject;
63
import org.dataone.service.types.v1.SystemMetadata;
64
import org.dataone.service.types.v1.util.ChecksumUtil;
65
import org.dataone.service.util.Constants;
66
import org.dspace.foresite.ResourceMap;
67
import org.ecoinformatics.datamanager.DataManager;
68
import org.ecoinformatics.datamanager.database.DatabaseConnectionPoolInterface;
69
import org.ecoinformatics.datamanager.parser.DataPackage;
70

    
71
import edu.ucsb.nceas.metacat.MetaCatServlet;
72
import edu.ucsb.nceas.metacat.dataquery.MetacatDatabaseConnectionPoolFactory;
73
import edu.ucsb.nceas.metacat.properties.PropertyService;
74
import edu.ucsb.nceas.metacat.restservice.InputStreamDataSource;
75

    
76
/**
77
 * @author berkley
78
 * A class to populate a metacat instance based on documents returned from a query
79
 */
80
public class MetacatPopulator
81
{
82
    private String sourceUrl = null;
83
    private String destinationUrl = null;
84
    private String query = null;
85
    private String username = null;
86
    private String password = null;
87
    private Session session = null;
88
    private String subjectDN = null;
89
    
90
    /**
91
     * create a new MetacatPopulator with given source and destination urls.  
92
     * These should be
93
     * of the form "http://<url>/<metacat_instance>"
94
     * If username and/or password is null, the query will be run as public
95
     * @param sourceUrl
96
     * @param destUrl
97
     * @param query
98
     * @param username
99
     * @param password
100
     */
101
    public MetacatPopulator(String sourceUrl, String destUrl, String query, String username, String password)
102
    {
103
        this.sourceUrl = sourceUrl;
104
        this.query = query;
105
        this.username = username;
106
        this.password = password;
107
        this.destinationUrl = destUrl;
108
        // TODO: use specific certificate?
109
        this.session = null; //new Session();
110
        this.subjectDN = CertificateManager.getInstance().getSubjectDN(CertificateManager.getInstance().loadCertificate());
111
    }
112
    
113
    /**
114
     * populate from the source
115
     */
116
    public void populate()
117
      throws Exception
118
    {
119
        //String sourceSessionid = loginSource();
120
        
121
        //do a query
122
        String params = "returndoctype=eml://ecoinformatics.org/eml-2.1.0&" +
123
                        "returndoctype=eml://ecoinformatics.org/eml-2.0.1&" +
124
                        "returndoctype=eml://ecoinformatics.org/eml-2.0.0&";
125
        params += "action=query&";
126
        params += "qformat=xml&";
127
        params += "anyfield=" + query;
128
        
129
        printHeader("Searching source");
130
        System.out.println("searching '" + sourceUrl + "' for '" + query + "'");
131
        InputStream is = getResponse(sourceUrl, "/metacat", params, "POST");
132
        String response = streamToString(is);
133
        //System.out.println("response: " + response);
134
        Vector<Document> docs = parseResponse(response);
135
        
136
        printHeader("Parsing source results");
137
        System.out.println("creating MN with url: " + destinationUrl + "/");
138
        MNode mn = D1Client.getMN(destinationUrl + "/");
139
        
140
        printHeader("Processing " + docs.size() + " results.");
141
        for (int i=0; i<docs.size(); i++) {
142
        	
143
        	// for generating the ORE map
144
            Map<Identifier, List<Identifier>> idMap = new HashMap<Identifier, List<Identifier>>();
145
            List<Identifier> dataIds = new ArrayList<Identifier>();
146
            
147
            //for each document in the query
148
            Document doc = docs.get(i);
149
            String docid = doc.docid;
150
            //get the doc from source
151
            printHeader("Getting document " + doc.docid + " from source " + sourceUrl);
152
            params = "action=read&qformat=xml&docid=" + docid;
153
            is = getResponse(sourceUrl, "/metacat", params, "POST");
154
            String doctext = streamToString(is);
155
            System.out.println("doctext: " + doctext);
156
            is = stringToStream(doctext);
157
            //parse the document
158
            DatabaseConnectionPoolInterface connectionPool = MetacatDatabaseConnectionPoolFactory.getDatabaseConnectionPoolInterface();
159
        	DataManager dataManager = DataManager.getInstance(connectionPool, connectionPool.getDBAdapterName());
160
        	DataPackage dataPackage = dataManager.parseMetadata(is);
161
        	
162
            if (dataPackage == null) {
163
                continue;
164
            }
165
            
166
            //go through the DistributionMetadata and download any described data
167
            is = stringToStream(doctext);
168
            doc.doctext = doctext;
169

    
170
            printHeader("creating document on destination " + destinationUrl);            
171
            SystemMetadata sysmeta = generateSystemMetadata(doc);
172
            
173
            // iterate through the data objects
174
            if (dataPackage.getEntityList() != null) {
175
	            for (int j=0; j < dataPackage.getEntityList().length; j++) {
176
	                String dataDocUrl = dataPackage.getEntityList()[j].getURL();
177
	                String dataDocMimeType = dataPackage.getEntityList()[j].getDataFormat();
178
	                if (dataDocMimeType == null) {
179
		                dataDocMimeType = 
180
		                	ObjectFormatCache.getInstance().getFormat("application/octet-stream").getFormatId().getValue();
181
	                }
182
	                String dataDocLocalId = "";
183
	                if (dataDocUrl.trim().startsWith("ecogrid://knb/")) { //we only handle ecogrid urls right now
184
	                    dataDocLocalId = dataDocUrl.substring(dataDocUrl.indexOf("ecogrid://knb/") + 
185
	                            "ecogrid://knb/".length(), dataDocUrl.length());
186
	                    //get the file
187
	                    params = "action=read&qformat=xml&docid=" + dataDocLocalId;
188
	                    InputStream dataDocIs = getResponse(sourceUrl, "/metacat", params, "POST");
189
	                    String dataDocText = streamToString(dataDocIs);
190
	                    
191
	                    //set the id
192
	                    Identifier did = new Identifier();
193
	                    did.setValue(dataDocLocalId);
194
	                    
195
	                    // add the data identifier for ORE map 
196
	                    dataIds.add(did);
197
	                    
198
	                    //create sysmeta for the data doc                    
199
	                    SystemMetadata dataDocSysMeta = generateSystemMetadata(doc);
200
	                    //overwrite the bogus values from the last call 
201
	                    dataDocSysMeta.setIdentifier(did);
202
	                    ObjectFormat format = null;
203
	                    try {
204
	                    	format = ObjectFormatCache.getInstance().getFormat(dataDocMimeType);
205
							dataDocSysMeta.setFormatId(format.getFormatId());
206
	                    } catch (NotFound e) {
207
							System.out.println(e.getMessage());
208
						}
209
	                    dataDocIs = stringToStream(dataDocText);
210
	                    Checksum checksum = ChecksumUtil.checksum(dataDocIs, "MD5");
211
	                    dataDocSysMeta.setChecksum(checksum);
212
	                    String sizeStr = 
213
	                    	Long.toString(dataDocText.getBytes(MetaCatServlet.DEFAULT_ENCODING).length);
214
	                    dataDocSysMeta.setSize(new BigInteger(sizeStr));
215

    
216
	                    boolean error = false;
217
	                    
218
	                    //create the data doc on d1
219
	                    try {
220
	                        mn.create(session, dataDocSysMeta.getIdentifier(), IOUtils.toInputStream(dataDocText), dataDocSysMeta);
221
	                    }
222
	                    catch(Exception e) {
223
	                        error = true;
224
	                        System.out.println("ERROR: Could not create data document with id " + 
225
	                                dataDocSysMeta.getIdentifier().getValue() + " : " + e.getMessage());
226
	                    }
227
	                    finally {
228
	                        if (error) {
229
	                            printHeader("Insertion of document " + dataDocSysMeta.getIdentifier().getValue() + 
230
	                                    "FAILED.");
231
	                        }
232
	                        else {
233
	                            printHeader("Done inserting document " + dataDocSysMeta.getIdentifier().getValue() +
234
	                                " which is described by " + sysmeta.getIdentifier().getValue());
235
	                        }
236
	                    }
237
	                }
238
	                else {
239
	                    System.out.println("WARNING: Could not process describes url " +
240
	                            dataDocUrl + " for document " + doc.docid + 
241
	                    ".  Only ecogrid://knb/ urls are currently supported.");
242
	                }
243
	            }
244
            }
245
            
246
            try {
247
              Identifier id = 
248
            	  mn.create(session, sysmeta.getIdentifier(), IOUtils.toInputStream(doc.doctext), sysmeta);
249
              System.out.println("Success inserting document " + id.getValue());
250
              
251
              // no need for an ORE map if there's no data
252
              if (!dataIds.isEmpty()) {
253
	              // generate the ORE map for this datapackage
254
	              Identifier resourceMapId = new Identifier();
255
	              resourceMapId.setValue("resourceMap_" + sysmeta.getIdentifier().getValue());
256
	              idMap.put(sysmeta.getIdentifier(), dataIds);
257
	              ResourceMap rm = ResourceMapFactory.getInstance().createResourceMap(resourceMapId, idMap);
258
	              String resourceMapXML = ResourceMapFactory.getInstance().serializeResourceMap(rm);
259
	              Document rmDoc = new Document(resourceMapId.getValue(), "http://www.openarchives.org/ore/terms", "", "");
260
	              rmDoc.doctext = resourceMapXML;
261
	              SystemMetadata resourceMapSysMeta = generateSystemMetadata(rmDoc);
262
	              mn.create(session, resourceMapId, IOUtils.toInputStream(resourceMapXML), resourceMapSysMeta);
263
	              
264
	              // clean up the permissions (FORCE public read)
265
	              for (Identifier dataId: dataIds) {
266
	            	  mn.setAccessPolicy(session, dataId, sysmeta.getAccessPolicy());
267
	            	  System.out.println("Set public access policy for: " + dataId.getValue());
268
	              }
269
            }
270
              
271
            }
272
            catch(Exception e) {
273
                e.printStackTrace();
274
                System.out.println("Could not create document with id " + 
275
                        sysmeta.getIdentifier().getValue() + " : " + e.getMessage());
276
            }
277
            finally {
278
                printHeader("Done processing document " + sysmeta.getIdentifier().getValue());
279
            }
280
        }
281
        
282
        //logout();
283
    }
284
    
285

    
286
    
287
    /**
288
     * @param doc
289
     * @return
290
     */
291
    private SystemMetadata generateSystemMetadata(Document doc)
292
      throws Exception {
293
        SystemMetadata sm = new SystemMetadata();
294
        sm.setSerialVersion(BigInteger.valueOf(1));
295
        //set the id
296
        Identifier id = new Identifier();
297
        id.setValue(doc.docid.trim());
298
        sm.setIdentifier(id);
299
        
300
        //set the object format
301
        ObjectFormat format = ObjectFormatCache.getInstance().getFormat(doc.doctype);
302
        if (format == null) {
303
            if (doc.doctype.trim().equals("BIN")) {
304
                format = ObjectFormatCache.getInstance().getFormat("application/octet-stream");
305
            }
306
            else {
307
                format = ObjectFormatCache.getInstance().getFormat("text/plain");
308
            }
309
        }
310
        sm.setFormatId(format.getFormatId());
311
        
312
        //create the checksum
313
        ByteArrayInputStream bais = new ByteArrayInputStream(doc.doctext.getBytes(MetaCatServlet.DEFAULT_ENCODING));
314
        Checksum checksum = ChecksumUtil.checksum(bais, "MD5");
315
        sm.setChecksum(checksum);
316
        
317
        //set the size
318
        String sizeStr = Long.toString(doc.doctext.getBytes(MetaCatServlet.DEFAULT_ENCODING).length);
319
        sm.setSize(new BigInteger(sizeStr));
320
        
321
        //submitter, rights holder
322
        Subject p = new Subject();
323
        p.setValue(subjectDN);
324
        sm.setSubmitter(p);
325
        sm.setRightsHolder(p);
326
        try {
327
            Date dateCreated = parseMetacatDate(doc.createDate);
328
            sm.setDateUploaded(dateCreated);
329
            Date dateUpdated = parseMetacatDate(doc.updateDate);
330
            sm.setDateSysMetadataModified(dateUpdated);
331
        }
332
        catch(Exception e) {
333
            System.out.println("couldn't parse a date: " + e.getMessage());
334
            Date dateCreated = new Date();
335
            sm.setDateUploaded(dateCreated);
336
            Date dateUpdated = new Date();
337
            sm.setDateSysMetadataModified(dateUpdated);
338
        }
339
        NodeReference nr = new NodeReference();
340
        nr.setValue(PropertyService.getProperty("dataone.memberNodeId"));
341
        sm.setOriginMemberNode(nr);
342
        sm.setAuthoritativeMemberNode(nr);
343
        
344
        // create access policy
345
        AccessPolicy accessPolicy = new AccessPolicy();
346
        AccessRule accessRule = new AccessRule();
347
		accessRule.addPermission(Permission.READ);
348
        Subject subject = new Subject();
349
        subject.setValue(Constants.SUBJECT_PUBLIC);
350
		accessRule.addSubject(subject);
351
		accessPolicy.addAllow(accessRule);
352
		
353
		sm.setAccessPolicy(accessPolicy);
354
        
355
        return sm;
356
    }
357
    
358
    private void printHeader(String s) {
359
        System.out.println("****** " + s + " *******");
360
    }
361
    
362
    /**
363
     * parse the metacat date which looks like 2010-06-08 (YYYY-MM-DD) into
364
     * a proper date object
365
     * @param date
366
     * @return
367
     */
368
    private Date parseMetacatDate(String date)
369
    {
370
        String year = date.substring(0, 4);
371
        String month = date.substring(5, 7);
372
        String day = date.substring(8, 10);
373
        Calendar c = Calendar.getInstance();
374
        c.set(new Integer(year).intValue(), 
375
              new Integer(month).intValue(), 
376
              new Integer(day).intValue());
377
        return c.getTime();
378
    }
379

    
380
    /**
381
     * send a request to the resource
382
     */
383
    private InputStream sendRequest(String contextRootUrl, String resource, 
384
            String sessionid, String method, String urlParamaters, 
385
            String contentType, InputStream dataStream) 
386
        throws Exception 
387
    {
388
        
389
        HttpURLConnection connection = null ;
390
        String restURL = contextRootUrl + resource;
391

    
392
        if (urlParamaters != null) {
393
            if (restURL.indexOf("?") == -1)             
394
                restURL += "?";
395
            restURL += urlParamaters; 
396
            if(restURL.indexOf(" ") != -1)
397
            {
398
                restURL = restURL.replaceAll("\\s", "%20");
399
            }
400
        }
401
        
402
        if(sessionid != null)
403
        {
404
            if(restURL.indexOf("?") == -1)
405
            {
406
                restURL += "?sessionid=" + sessionid;
407
            }
408
            else
409
            {
410
                restURL += "&sessionid=" + sessionid;
411
            }
412
        }
413

    
414
        URL u = null;
415
        InputStream content = null;
416
        System.out.println("url: " + restURL);
417
        System.out.println("method: " + method);
418
        u = new URL(restURL);
419
        connection = (HttpURLConnection) u.openConnection();
420
        if (contentType!=null) {
421
            connection.setRequestProperty("Content-Type",contentType);
422
        }
423

    
424
        connection.setDoOutput(true);
425
        connection.setDoInput(true);
426
        connection.setRequestMethod(method);
427

    
428
        if (!method.equals("GET")) {
429
            if (dataStream != null) {
430
                OutputStream out = connection.getOutputStream();
431
                IOUtils.copy(dataStream, out);
432
            }
433
        }
434

    
435
        return connection.getInputStream();   
436
    }
437
    
438
    /**
439
     * create a mime multipart message from object and sysmeta
440
     */
441
    private MimeMultipart createMimeMultipart(InputStream object)
442
      throws Exception
443
    {
444
        final MimeMultipart mmp = new MimeMultipart();
445
        MimeBodyPart objectPart = new MimeBodyPart();
446
        objectPart.addHeaderLine("Content-Transfer-Encoding: base64");
447
        objectPart.setFileName("doctext");
448
        DataSource ds = new InputStreamDataSource("doctext", object);
449
        DataHandler dh = new DataHandler(ds);
450
        objectPart.setDataHandler(dh);
451
        mmp.addBodyPart(objectPart);
452
        return mmp;
453
    }
454
    
455
    /**
456
     * parse a metacat query response and return a vector of docids
457
     * @param response
458
     * @return
459
     */
460
    private Vector<Document> parseResponse(String response)
461
    {
462
        Vector<Document> v = new Vector<Document>();
463
        int dstart = response.indexOf("<document>");
464
        int dend = response.indexOf("</document>", dstart);
465
        while(dstart != -1)
466
        {
467
            String doc = response.substring(dstart + "<document>".length(), dend);
468
            //System.out.println("adding " + docid);
469
            Document d = new Document(getFieldFromDoc(doc, "docid"),
470
                    getFieldFromDoc(doc, "doctype"),
471
                    getFieldFromDoc(doc, "createdate"),
472
                    getFieldFromDoc(doc, "updatedate"));
473
            v.add(d);
474
            dstart = response.indexOf("<document>", dend);
475
            dend = response.indexOf("</document>", dstart);
476
        }
477
        
478
        return v;
479
    }
480
    
481
    private String getFieldFromDoc(String doc, String fieldname)
482
    {
483
        String field = "<" + fieldname + ">";
484
        String fieldend = "</" + fieldname + ">";
485
        int start = doc.indexOf(field);
486
        int end = doc.indexOf(fieldend);
487
        String s = doc.substring(start + field.length(), end);
488
        //System.out.println("field: " + fieldname + " : " + s);
489
        return s;
490
    }
491
    
492
    /**
493
     * login the source
494
     * @return
495
     * @throws Exception
496
     */
497
    private String loginSource()
498
      throws Exception
499
    {
500
        return login(sourceUrl);
501
    }
502
    
503
    
504
    /**
505
     * returns a sessionid
506
     * @return
507
     */
508
    private String login(String sourceUrl)
509
      throws Exception
510
    {
511
        InputStream is = getResponse(sourceUrl, "/metacat", 
512
                "action=login&username=" + username + "&password=" + password + "&qformat=xml", "POST");
513
        String response = streamToString(is);
514
        //System.out.println("response: " + response);
515
        if(response.indexOf("sessionId") == -1)
516
        {
517
            throw new Exception("Error logging into " + sourceUrl);
518
        }
519
        
520
        String sessionid = response.substring(
521
                response.indexOf("<sessionId>") + "<sessionId>".length(), 
522
                response.indexOf("</sessionId>"));
523
        System.out.println("sessionid: " + sessionid);
524
        return sessionid;
525
    }
526
    
527
    /**
528
     * logout both the source and destination
529
     * @throws Exception
530
     */
531
    private void logout()
532
        throws Exception
533
    {
534
        getResponse(sourceUrl, "/metacat", "action=logout&username=" + username, "POST");
535
    }
536
    
537
    /**
538
     * get an http response
539
     * @param contextRootUrl
540
     * @param resource
541
     * @param urlParameters
542
     * @param method
543
     * @return
544
     * @throws Exception
545
     */
546
    private InputStream getResponse(String contextRootUrl, String resource, 
547
            String urlParameters, String method)
548
      throws Exception
549
    {
550
        HttpURLConnection connection = null ;
551

    
552
        String restURL = contextRootUrl+resource;
553

    
554
        if (urlParameters != null) {
555
            if (restURL.indexOf("?") == -1)             
556
                restURL += "?";
557
            restURL += urlParameters; 
558
            if(restURL.indexOf(" ") != -1)
559
            {
560
                restURL = restURL.replaceAll("\\s", "%20");
561
            }
562
        }
563

    
564
        URL u = null;
565
        InputStream content = null;            
566
        System.out.println("url: " + restURL);
567
        System.out.println("method: " + method);
568
        u = new URL(restURL);
569
        connection = (HttpURLConnection) u.openConnection();
570
        connection.setDoOutput(true);
571
        connection.setDoInput(true);
572
        connection.setRequestMethod(method);
573
        content = connection.getInputStream();
574
        return content;
575
    }
576
    
577
    private String streamToString(InputStream is)
578
        throws Exception
579
    {
580
        byte b[] = new byte[1024];
581
        int numread = is.read(b, 0, 1024);
582
        String response = new String();
583
        while(numread != -1)
584
        {
585
            response += new String(b, 0, numread);
586
            numread = is.read(b, 0, 1024);
587
        }
588
        return response;
589
    }
590
    
591
    private InputStream stringToStream(String s)
592
      throws Exception
593
    {
594
        ByteArrayInputStream bais = new ByteArrayInputStream(s.getBytes(MetaCatServlet.DEFAULT_ENCODING));
595
        return bais;
596
    }
597
    
598
    private class Document
599
    {
600
        public String docid;
601
        public String doctype;
602
        public String createDate;
603
        public String updateDate;
604
        public String doctext;
605
        
606
        public Document(String docid, String doctype, String createDate, String updateDate)
607
        {
608
            this.docid = docid.trim();
609
            this.doctype = doctype.trim();
610
            this.createDate = createDate.trim();
611
            this.updateDate = updateDate.trim();
612
        }
613
    }
614
}
(8-8/16)