Project

General

Profile

1 5394 berkley
/**
2
 *  '$RCSfile$'
3
 *    Purpose: A Class that implements administrative methods
4
 *  Copyright: 2010 Regents of the University of California and the
5
 *             National Center for Ecological Analysis and Synthesis
6
 *    Authors: Michael Daigle
7
 *
8
 *   '$Author: berkley $'
9
 *     '$Date: 2010-06-08 12:34:30 -0700 (Tue, 08 Jun 2010) $'
10
 * '$Revision: 5374 $'
11
 *
12
 * This program is free software; you can redistribute it and/or modify
13
 * it under the terms of the GNU General Public License as published by
14
 * the Free Software Foundation; either version 2 of the License, or
15
 * (at your option) any later version.
16
 *
17
 * This program is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
 * GNU General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU General Public License
23
 * along with this program; if not, write to the Free Software
24
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25
 */
26
package edu.ucsb.nceas.metacat.util;
27
28 6092 jones
import java.io.ByteArrayInputStream;
29
import java.io.InputStream;
30 6386 cjones
import java.math.BigInteger;
31 6092 jones
import java.net.HttpURLConnection;
32
import java.net.URL;
33 6538 leinfelder
import java.util.ArrayList;
34 6092 jones
import java.util.Calendar;
35
import java.util.Date;
36 6538 leinfelder
import java.util.HashMap;
37
import java.util.List;
38
import java.util.Map;
39 6092 jones
import java.util.Vector;
40 5394 berkley
41
import org.apache.commons.io.IOUtils;
42 8810 leinfelder
import org.dataone.client.v2.itk.D1Client;
43
import org.dataone.client.v2.MNode;
44
import org.dataone.client.v2.formats.ObjectFormatCache;
45 6152 leinfelder
import org.dataone.client.auth.CertificateManager;
46 6538 leinfelder
import org.dataone.ore.ResourceMapFactory;
47 6359 leinfelder
import org.dataone.service.exceptions.NotFound;
48 6366 leinfelder
import org.dataone.service.types.v1.AccessPolicy;
49
import org.dataone.service.types.v1.AccessRule;
50
import org.dataone.service.types.v1.Checksum;
51
import org.dataone.service.types.v1.Identifier;
52
import org.dataone.service.types.v1.NodeReference;
53
import org.dataone.service.types.v1.ObjectFormat;
54
import org.dataone.service.types.v1.Permission;
55
import org.dataone.service.types.v1.Session;
56
import org.dataone.service.types.v1.Subject;
57 8810 leinfelder
import org.dataone.service.types.v2.SystemMetadata;
58 6534 leinfelder
import org.dataone.service.types.v1.util.ChecksumUtil;
59
import org.dataone.service.util.Constants;
60 6538 leinfelder
import org.dspace.foresite.ResourceMap;
61 6134 leinfelder
import org.ecoinformatics.datamanager.DataManager;
62
import org.ecoinformatics.datamanager.database.DatabaseConnectionPoolInterface;
63
import org.ecoinformatics.datamanager.parser.DataPackage;
64 5394 berkley
65 6092 jones
import edu.ucsb.nceas.metacat.MetaCatServlet;
66 6134 leinfelder
import edu.ucsb.nceas.metacat.dataquery.MetacatDatabaseConnectionPoolFactory;
67 6092 jones
import edu.ucsb.nceas.metacat.properties.PropertyService;
68 5394 berkley
69
/**
 * A class to populate a metacat instance based on documents returned from a query
 * @author berkley
 */
73
public class MetacatPopulator
74
{
75
    private String sourceUrl = null;
76
    private String destinationUrl = null;
77
    private String query = null;
78
    private String username = null;
79
    private String password = null;
80 6152 leinfelder
    private Session session = null;
81 6538 leinfelder
    private String subjectDN = null;
82 5394 berkley
83
    /**
84
     * create a new MetacatPopulator with given source and destination urls.
85
     * These should be
86
     * of the form "http://<url>/<metacat_instance>"
87
     * If username and/or password is null, the query will be run as public
88
     * @param sourceUrl
89
     * @param destUrl
90
     * @param query
91
     * @param username
92
     * @param password
93
     */
94
    public MetacatPopulator(String sourceUrl, String destUrl, String query, String username, String password)
95
    {
96
        this.sourceUrl = sourceUrl;
97
        this.query = query;
98
        this.username = username;
99
        this.password = password;
100
        this.destinationUrl = destUrl;
101 6534 leinfelder
        // TODO: use specific certificate?
102
        this.session = null; //new Session();
103 6538 leinfelder
        this.subjectDN = CertificateManager.getInstance().getSubjectDN(CertificateManager.getInstance().loadCertificate());
104 5394 berkley
    }
105
106
    /**
107
     * populate from the source
108
     */
109
    public void populate()
110
      throws Exception
111
    {
112 6700 leinfelder
        //String sourceSessionid = login();
113 5397 berkley
114 5394 berkley
        //do a query
115 5568 berkley
        String params = "returndoctype=eml://ecoinformatics.org/eml-2.1.0&" +
116
                        "returndoctype=eml://ecoinformatics.org/eml-2.0.1&" +
117
                        "returndoctype=eml://ecoinformatics.org/eml-2.0.0&";
118 5394 berkley
        params += "action=query&";
119
        params += "qformat=xml&";
120
        params += "anyfield=" + query;
121
122 5397 berkley
        printHeader("Searching source");
123 6534 leinfelder
        System.out.println("searching '" + sourceUrl + "' for '" + query + "'");
124
        InputStream is = getResponse(sourceUrl, "/metacat", params, "POST");
125 6700 leinfelder
        String response = IOUtils.toString(is, MetaCatServlet.DEFAULT_ENCODING);
126 5394 berkley
        //System.out.println("response: " + response);
127
        Vector<Document> docs = parseResponse(response);
128 5441 berkley
129 5397 berkley
        printHeader("Parsing source results");
130 5874 berkley
        System.out.println("creating MN with url: " + destinationUrl + "/");
131 5860 jones
        MNode mn = D1Client.getMN(destinationUrl + "/");
132 5568 berkley
133 5441 berkley
        printHeader("Processing " + docs.size() + " results.");
134 6538 leinfelder
        for (int i=0; i<docs.size(); i++) {
135
136
        	// for generating the ORE map
137
            Map<Identifier, List<Identifier>> idMap = new HashMap<Identifier, List<Identifier>>();
138
            List<Identifier> dataIds = new ArrayList<Identifier>();
139
140 5394 berkley
            //for each document in the query
141
            Document doc = docs.get(i);
142
            String docid = doc.docid;
143
            //get the doc from source
144 5397 berkley
            printHeader("Getting document " + doc.docid + " from source " + sourceUrl);
145 5394 berkley
            params = "action=read&qformat=xml&docid=" + docid;
146
            is = getResponse(sourceUrl, "/metacat", params, "POST");
147 6700 leinfelder
            String doctext = IOUtils.toString(is, MetaCatServlet.DEFAULT_ENCODING);
148 5568 berkley
            System.out.println("doctext: " + doctext);
149 6700 leinfelder
            is = IOUtils.toInputStream(doctext, MetaCatServlet.DEFAULT_ENCODING);
150 5568 berkley
            //parse the document
151 6134 leinfelder
            DatabaseConnectionPoolInterface connectionPool = MetacatDatabaseConnectionPoolFactory.getDatabaseConnectionPoolInterface();
152
        	DataManager dataManager = DataManager.getInstance(connectionPool, connectionPool.getDBAdapterName());
153
        	DataPackage dataPackage = dataManager.parseMetadata(is);
154
155 6538 leinfelder
            if (dataPackage == null) {
156 5577 berkley
                continue;
157
            }
158 6538 leinfelder
159 5568 berkley
            //go through the DistributionMetadata and download any described data
160 6700 leinfelder
            is = IOUtils.toInputStream(doctext, MetaCatServlet.DEFAULT_ENCODING);
161 5394 berkley
            doc.doctext = doctext;
162 5397 berkley
163
            printHeader("creating document on destination " + destinationUrl);
164 5394 berkley
            SystemMetadata sysmeta = generateSystemMetadata(doc);
165 6538 leinfelder
166
            // iterate through the data objects
167 6359 leinfelder
            if (dataPackage.getEntityList() != null) {
168 6538 leinfelder
	            for (int j=0; j < dataPackage.getEntityList().length; j++) {
169 6359 leinfelder
	                String dataDocUrl = dataPackage.getEntityList()[j].getURL();
170 6538 leinfelder
	                String dataDocMimeType = dataPackage.getEntityList()[j].getDataFormat();
171 6359 leinfelder
	                if (dataDocMimeType == null) {
172
		                dataDocMimeType =
173 6561 leinfelder
		                	ObjectFormatCache.getInstance().getFormat("application/octet-stream").getFormatId().getValue();
174 6359 leinfelder
	                }
175
	                String dataDocLocalId = "";
176 6538 leinfelder
	                if (dataDocUrl.trim().startsWith("ecogrid://knb/")) { //we only handle ecogrid urls right now
177 6359 leinfelder
	                    dataDocLocalId = dataDocUrl.substring(dataDocUrl.indexOf("ecogrid://knb/") +
178
	                            "ecogrid://knb/".length(), dataDocUrl.length());
179
	                    //get the file
180
	                    params = "action=read&qformat=xml&docid=" + dataDocLocalId;
181
	                    InputStream dataDocIs = getResponse(sourceUrl, "/metacat", params, "POST");
182 6700 leinfelder
	                    String dataDocText = IOUtils.toString(dataDocIs, MetaCatServlet.DEFAULT_ENCODING);
183 6359 leinfelder
184
	                    //set the id
185
	                    Identifier did = new Identifier();
186
	                    did.setValue(dataDocLocalId);
187
188 6538 leinfelder
	                    // add the data identifier for ORE map
189
	                    dataIds.add(did);
190 6359 leinfelder
191
	                    //create sysmeta for the data doc
192
	                    SystemMetadata dataDocSysMeta = generateSystemMetadata(doc);
193
	                    //overwrite the bogus values from the last call
194
	                    dataDocSysMeta.setIdentifier(did);
195
	                    ObjectFormat format = null;
196
	                    try {
197
	                    	format = ObjectFormatCache.getInstance().getFormat(dataDocMimeType);
198 6561 leinfelder
							dataDocSysMeta.setFormatId(format.getFormatId());
199 6359 leinfelder
	                    } catch (NotFound e) {
200
							System.out.println(e.getMessage());
201
						}
202 6700 leinfelder
	                    dataDocIs = IOUtils.toInputStream(dataDocText, MetaCatServlet.DEFAULT_ENCODING);
203 7222 leinfelder
	                    String algorithm = PropertyService.getProperty("dataone.checksumAlgorithm.default");
204
	                    Checksum checksum = ChecksumUtil.checksum(dataDocIs, algorithm);
205 6359 leinfelder
	                    dataDocSysMeta.setChecksum(checksum);
206 6386 cjones
	                    String sizeStr =
207
	                    	Long.toString(dataDocText.getBytes(MetaCatServlet.DEFAULT_ENCODING).length);
208
	                    dataDocSysMeta.setSize(new BigInteger(sizeStr));
209 6538 leinfelder
210 6359 leinfelder
	                    boolean error = false;
211
212
	                    //create the data doc on d1
213 6538 leinfelder
	                    try {
214 6700 leinfelder
	                        mn.create(session, dataDocSysMeta.getIdentifier(), IOUtils.toInputStream(dataDocText, MetaCatServlet.DEFAULT_ENCODING), dataDocSysMeta);
215 6359 leinfelder
	                    }
216 6538 leinfelder
	                    catch(Exception e) {
217 6359 leinfelder
	                        error = true;
218
	                        System.out.println("ERROR: Could not create data document with id " +
219
	                                dataDocSysMeta.getIdentifier().getValue() + " : " + e.getMessage());
220
	                    }
221 6538 leinfelder
	                    finally {
222
	                        if (error) {
223 6359 leinfelder
	                            printHeader("Insertion of document " + dataDocSysMeta.getIdentifier().getValue() +
224
	                                    "FAILED.");
225
	                        }
226 6538 leinfelder
	                        else {
227 6359 leinfelder
	                            printHeader("Done inserting document " + dataDocSysMeta.getIdentifier().getValue() +
228
	                                " which is described by " + sysmeta.getIdentifier().getValue());
229
	                        }
230
	                    }
231
	                }
232 6538 leinfelder
	                else {
233 6359 leinfelder
	                    System.out.println("WARNING: Could not process describes url " +
234
	                            dataDocUrl + " for document " + doc.docid +
235
	                    ".  Only ecogrid://knb/ urls are currently supported.");
236
	                }
237
	            }
238 5568 berkley
            }
239
240 6538 leinfelder
            try {
241
              Identifier id =
242 6700 leinfelder
            	  mn.create(session, sysmeta.getIdentifier(), IOUtils.toInputStream(doc.doctext, MetaCatServlet.DEFAULT_ENCODING), sysmeta);
243 5397 berkley
              System.out.println("Success inserting document " + id.getValue());
244 5568 berkley
245 6538 leinfelder
              // no need for an ORE map if there's no data
246
              if (!dataIds.isEmpty()) {
247
	              // generate the ORE map for this datapackage
248
	              Identifier resourceMapId = new Identifier();
249
	              resourceMapId.setValue("resourceMap_" + sysmeta.getIdentifier().getValue());
250
	              idMap.put(sysmeta.getIdentifier(), dataIds);
251
	              ResourceMap rm = ResourceMapFactory.getInstance().createResourceMap(resourceMapId, idMap);
252
	              String resourceMapXML = ResourceMapFactory.getInstance().serializeResourceMap(rm);
253
	              Document rmDoc = new Document(resourceMapId.getValue(), "http://www.openarchives.org/ore/terms", "", "");
254
	              rmDoc.doctext = resourceMapXML;
255
	              SystemMetadata resourceMapSysMeta = generateSystemMetadata(rmDoc);
256 6700 leinfelder
	              mn.create(session, resourceMapId, IOUtils.toInputStream(resourceMapXML, MetaCatServlet.DEFAULT_ENCODING), resourceMapSysMeta);
257 6538 leinfelder
258 5397 berkley
            }
259 6538 leinfelder
260
            }
261
            catch(Exception e) {
262 5568 berkley
                e.printStackTrace();
263 5397 berkley
                System.out.println("Could not create document with id " +
264
                        sysmeta.getIdentifier().getValue() + " : " + e.getMessage());
265
            }
266 6538 leinfelder
            finally {
267
                printHeader("Done processing document " + sysmeta.getIdentifier().getValue());
268 5397 berkley
            }
269 5394 berkley
        }
270
271 6534 leinfelder
        //logout();
272 5394 berkley
    }
273
274 6134 leinfelder
275 5394 berkley
276
    /**
277
     * @param doc
278
     * @return
279
     */
280
    private SystemMetadata generateSystemMetadata(Document doc)
281 6538 leinfelder
      throws Exception {
282 5394 berkley
        SystemMetadata sm = new SystemMetadata();
283 6561 leinfelder
        sm.setSerialVersion(BigInteger.valueOf(1));
284 5394 berkley
        //set the id
285
        Identifier id = new Identifier();
286 5452 berkley
        id.setValue(doc.docid.trim());
287 5394 berkley
        sm.setIdentifier(id);
288
289
        //set the object format
290 6144 cjones
        ObjectFormat format = ObjectFormatCache.getInstance().getFormat(doc.doctype);
291 6538 leinfelder
        if (format == null) {
292
            if (doc.doctype.trim().equals("BIN")) {
293 6144 cjones
                format = ObjectFormatCache.getInstance().getFormat("application/octet-stream");
294 5394 berkley
            }
295 6538 leinfelder
            else {
296 6144 cjones
                format = ObjectFormatCache.getInstance().getFormat("text/plain");
297 5394 berkley
            }
298
        }
299 6561 leinfelder
        sm.setFormatId(format.getFormatId());
300 5394 berkley
301
        //create the checksum
302 5760 leinfelder
        ByteArrayInputStream bais = new ByteArrayInputStream(doc.doctext.getBytes(MetaCatServlet.DEFAULT_ENCODING));
303 7222 leinfelder
        String algorithm = PropertyService.getProperty("dataone.checksumAlgorithm.default");
304
        Checksum checksum = ChecksumUtil.checksum(bais, algorithm);
305 5394 berkley
        sm.setChecksum(checksum);
306
307
        //set the size
308 6386 cjones
        String sizeStr = Long.toString(doc.doctext.getBytes(MetaCatServlet.DEFAULT_ENCODING).length);
309
        sm.setSize(new BigInteger(sizeStr));
310 5394 berkley
311 6538 leinfelder
        //submitter, rights holder
312 6092 jones
        Subject p = new Subject();
313 6538 leinfelder
        p.setValue(subjectDN);
314 5394 berkley
        sm.setSubmitter(p);
315
        sm.setRightsHolder(p);
316 6538 leinfelder
        try {
317 5394 berkley
            Date dateCreated = parseMetacatDate(doc.createDate);
318
            sm.setDateUploaded(dateCreated);
319
            Date dateUpdated = parseMetacatDate(doc.updateDate);
320
            sm.setDateSysMetadataModified(dateUpdated);
321
        }
322 6538 leinfelder
        catch(Exception e) {
323 5394 berkley
            System.out.println("couldn't parse a date: " + e.getMessage());
324
            Date dateCreated = new Date();
325
            sm.setDateUploaded(dateCreated);
326
            Date dateUpdated = new Date();
327
            sm.setDateSysMetadataModified(dateUpdated);
328
        }
329
        NodeReference nr = new NodeReference();
330 7030 cjones
        nr.setValue(PropertyService.getProperty("dataone.nodeId"));
331 5394 berkley
        sm.setOriginMemberNode(nr);
332
        sm.setAuthoritativeMemberNode(nr);
333 5568 berkley
334 6534 leinfelder
        // create access policy
335
        AccessPolicy accessPolicy = new AccessPolicy();
336
        AccessRule accessRule = new AccessRule();
337
		accessRule.addPermission(Permission.READ);
338
        Subject subject = new Subject();
339
        subject.setValue(Constants.SUBJECT_PUBLIC);
340
		accessRule.addSubject(subject);
341
		accessPolicy.addAllow(accessRule);
342 6538 leinfelder
343 6534 leinfelder
		sm.setAccessPolicy(accessPolicy);
344
345 5394 berkley
        return sm;
346
    }
347
348 6538 leinfelder
    private void printHeader(String s) {
349 5568 berkley
        System.out.println("****** " + s + " *******");
350
    }
351
352
    /**
353 5394 berkley
     * parse the metacat date which looks like 2010-06-08 (YYYY-MM-DD) into
354
     * a proper date object
355
     * @param date
356
     * @return
357
     */
358
    private Date parseMetacatDate(String date)
359
    {
360
        String year = date.substring(0, 4);
361
        String month = date.substring(5, 7);
362
        String day = date.substring(8, 10);
363
        Calendar c = Calendar.getInstance();
364
        c.set(new Integer(year).intValue(),
365
              new Integer(month).intValue(),
366
              new Integer(day).intValue());
367
        return c.getTime();
368
    }
369
370
    /**
371
     * parse a metacat query response and return a vector of docids
372
     * @param response
373
     * @return
374
     */
375
    private Vector<Document> parseResponse(String response)
376
    {
377
        Vector<Document> v = new Vector<Document>();
378
        int dstart = response.indexOf("<document>");
379
        int dend = response.indexOf("</document>", dstart);
380
        while(dstart != -1)
381
        {
382
            String doc = response.substring(dstart + "<document>".length(), dend);
383
            //System.out.println("adding " + docid);
384
            Document d = new Document(getFieldFromDoc(doc, "docid"),
385
                    getFieldFromDoc(doc, "doctype"),
386
                    getFieldFromDoc(doc, "createdate"),
387
                    getFieldFromDoc(doc, "updatedate"));
388
            v.add(d);
389
            dstart = response.indexOf("<document>", dend);
390
            dend = response.indexOf("</document>", dstart);
391
        }
392
393
        return v;
394
    }
395
396
    private String getFieldFromDoc(String doc, String fieldname)
397
    {
398
        String field = "<" + fieldname + ">";
399
        String fieldend = "</" + fieldname + ">";
400
        int start = doc.indexOf(field);
401
        int end = doc.indexOf(fieldend);
402
        String s = doc.substring(start + field.length(), end);
403 5441 berkley
        //System.out.println("field: " + fieldname + " : " + s);
404 5394 berkley
        return s;
405
    }
406
407
408
    /**
409
     * returns a sessionid
410
     * @return
411
     */
412 6700 leinfelder
    private String login()
413 5394 berkley
      throws Exception
414
    {
415
        InputStream is = getResponse(sourceUrl, "/metacat",
416 6152 leinfelder
                "action=login&username=" + username + "&password=" + password + "&qformat=xml", "POST");
417 6700 leinfelder
        String response = IOUtils.toString(is, MetaCatServlet.DEFAULT_ENCODING);
418 5394 berkley
        //System.out.println("response: " + response);
419 5397 berkley
        if(response.indexOf("sessionId") == -1)
420
        {
421
            throw new Exception("Error logging into " + sourceUrl);
422
        }
423
424 5394 berkley
        String sessionid = response.substring(
425
                response.indexOf("<sessionId>") + "<sessionId>".length(),
426
                response.indexOf("</sessionId>"));
427
        System.out.println("sessionid: " + sessionid);
428
        return sessionid;
429
    }
430
431
    /**
432
     * logout both the source and destination
433
     * @throws Exception
434
     */
435
    private void logout()
436
        throws Exception
437
    {
438
        getResponse(sourceUrl, "/metacat", "action=logout&username=" + username, "POST");
439
    }
440
441
    /**
442
     * get an http response
443
     * @param contextRootUrl
444
     * @param resource
445
     * @param urlParameters
446
     * @param method
447
     * @return
448
     * @throws Exception
449
     */
450
    private InputStream getResponse(String contextRootUrl, String resource,
451
            String urlParameters, String method)
452
      throws Exception
453
    {
454
        HttpURLConnection connection = null ;
455
456
        String restURL = contextRootUrl+resource;
457
458
        if (urlParameters != null) {
459
            if (restURL.indexOf("?") == -1)
460
                restURL += "?";
461
            restURL += urlParameters;
462
            if(restURL.indexOf(" ") != -1)
463
            {
464
                restURL = restURL.replaceAll("\\s", "%20");
465
            }
466
        }
467
468
        URL u = null;
469
        InputStream content = null;
470
        System.out.println("url: " + restURL);
471
        System.out.println("method: " + method);
472
        u = new URL(restURL);
473
        connection = (HttpURLConnection) u.openConnection();
474
        connection.setDoOutput(true);
475
        connection.setDoInput(true);
476
        connection.setRequestMethod(method);
477
        content = connection.getInputStream();
478
        return content;
479
    }
480
481
    private class Document
482
    {
483
        public String docid;
484
        public String doctype;
485
        public String createDate;
486
        public String updateDate;
487
        public String doctext;
488
489
        public Document(String docid, String doctype, String createDate, String updateDate)
490
        {
491 5452 berkley
            this.docid = docid.trim();
492
            this.doctype = doctype.trim();
493
            this.createDate = createDate.trim();
494
            this.updateDate = updateDate.trim();
495 5394 berkley
        }
496
    }
497
}