Project

General

Profile

1
/**
 *  '$RCSfile$'
 *  Copyright: 2007 Regents of the University of California and the
 *             National Center for Ecological Analysis and Synthesis
 *
 *   '$Author: jones $'
 *     '$Date: 2013-05-23 13:26:48 -0700 (Thu, 23 May 2013) $'
 * '$Revision: 7766 $'
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
24

    
25
package edu.ucsb.nceas.metacat;
26

    
27
import java.io.File;
28
import java.io.FileOutputStream;
29
import java.io.IOException;
30
import java.io.OutputStreamWriter;
31
import java.io.Writer;
32
import java.nio.charset.Charset;
33
import java.sql.PreparedStatement;
34
import java.sql.ResultSet;
35
import java.sql.SQLException;
36
import java.text.SimpleDateFormat;
37
import java.util.Date;
38
import java.util.TimerTask;
39

    
40
import org.apache.log4j.Logger;
41

    
42
import edu.ucsb.nceas.metacat.database.DBConnection;
43
import edu.ucsb.nceas.metacat.database.DBConnectionPool;
44
import edu.ucsb.nceas.metacat.properties.PropertyService;
45
import edu.ucsb.nceas.utilities.PropertyNotFoundException;
46

    
47
/**
 * A Sitemap represents a document that lists all of the content of the Metacat
 * server for use by harvesting spiders that wish to index the contents of the
 * Metacat site. It is used to generate an XML representation of all of the URLs
 * of the site in order to facilitate indexing of the Metacat site by search
 * engines.
 *
 * @author Matt Jones
 */
56
public class Sitemap extends TimerTask {
57
	
58
	/** Logger used by this class to report sitemap progress and failures. */
	private static final Logger logMetacat = Logger.getLogger(Sitemap.class);
59

    
60
    /**
     * Create a Sitemap task bound to an output location and URL settings.
     * The three values are simply stored; no files are touched until the
     * task is run.
     *
     * @param directory
     *            the location to store sitemap files
     * @param urlRoot
     *            the base URL for constructing sitemap URLs
     * @param skin
     *            the format skin to be used in URLs
     */
    public Sitemap(File directory, String urlRoot, String skin) {
        // The implicit TimerTask constructor runs first; nothing else to set up.
        this.skin = skin;
        this.urlRoot = urlRoot;
        this.directory = directory;
    }
76

    
77
    /**
78
     * Execute the timed task when called, in this case by generating the
79
     * sitemap files needed for this Metacat instance.
80
     */
81
    public void run() {
82
        generateSitemaps();
83
    }
84

    
85
    /**
86
     * Generate all of the sitemap files needed to list the URLs from this
87
     * instance of Metacat, using the open sitemap format described here:
88
     *   http://www.sitemaps.org/protocol.html
89
     * URLs are written to a single file, unless the maximum number of URLs
90
     * allowed in the sitemap file is exceeded, in which subsequent numbered
91
     * files are created. An index of the sitemaps is also created.
92
     * 
93
     * The sitemap index can be registered with search index providers such as
94
     * Google, but beware that it needs to be accessible in a location above the
95
     * mount point for the service URLs.  By default the files are placed in 
96
     * {context}/sitemaps, but you will need to expose them at {context}/ for
97
     * them to be trusted by Google.  See the Sitemaps.org documentation for
98
     * details.
99
     * 
100
     * @param directory
101
     *            an existing File directory in which to write the sitemaps
102
     * @param urlRoot
103
     *            the base URL to use in constructing document URLs
104
     * @param skin
105
     *            the name of the skin to be used in formatting metacat
106
     *            documents
107
     */
108
    public void generateSitemaps() {
109

    
110
        logMetacat.info("Running the Sitemap task.");
111

    
112
        // Test if the passed in File is a directory
113
        if (directory.isDirectory()) {
114
            // Query xml_documents to get list of documents
115
            StringBuffer query = new StringBuffer();
116
            // TODO: make the doctype configurable in the query
117
            String sql =
118
            	"SELECT xml_documents.docid, xml_documents.rev " +
119
            	"FROM xml_documents, xml_access, identifier " +
120
                "WHERE xml_documents.doctype LIKE 'eml:%' " + 
121
                "AND xml_documents.docid = identifier.docid " +
122
                "AND xml_documents.rev = identifier.rev " +
123
                "AND identifier.guid = xml_access.guid " +
124
                "AND xml_access.principal_name = 'public' " +
125
                "AND xml_access.perm_type = 'allow' " +
126
                "order by docid, rev";
127
            query.append(sql);
128

    
129
            DBConnection dbConn = null;
130
            int serialNumber = -1;
131
            try {
132
                // Get a database connection from the pool
133
                dbConn = DBConnectionPool
134
                        .getDBConnection("Sitemap.generateSitemap()");
135
                serialNumber = dbConn.getCheckOutSerialNumber();
136

    
137
                // Execute the query statement
138
                PreparedStatement stmt = dbConn.prepareStatement(query.toString());
139
                stmt.execute();
140
                ResultSet rs = stmt.getResultSet();
141

    
142
                // Loop through all of the documents, and write them to a
143
                // sitemap
144
                File sitemapFile = null;
145
                OutputStreamWriter sitemap = null;
146
                int counter = 0;
147
                int fileNumber = 0;
148
                while (rs.next()) {
149
                    // Check if a new sitemap file needs to be created
150
                    if (counter % MAX_URLS_IN_FILE == 0) {
151

    
152
                        // if a sitemap file is already open
153
                        if (sitemapFile != null && sitemapFile.canWrite()) {
154
                            // write the footer and close the file
155
                            writeSitemapFooter(sitemap);
156
                        }
157

    
158
                        // Open a new sitemap file for writing
159
                        fileNumber++;
160
                        sitemapFile = new File(directory, fileRoot + fileNumber
161
                                + ".xml");
162
                        sitemap = new OutputStreamWriter(new FileOutputStream(sitemapFile), Charset.forName("UTF-8"));
163

    
164
                        // Write the sitemap document header for the new file
165
                        writeSitemapHeader(sitemap);
166
                    }
167

    
168
                    String separator = PropertyService.getProperty("document.accNumSeparator");
169
                    String docid = rs.getString(1) + separator
170
                            + rs.getString(2);
171
                    writeSitemapEntry(sitemap, docid);
172
                    counter++;
173
                }
174
                stmt.close();
175
                writeSitemapFooter(sitemap);
176
                writeSitemapIndex(fileNumber);
177
            } catch (SQLException e) {
178
                logMetacat.warn("Error while writing to the sitemap file: "
179
                        + e.getMessage());
180
            } catch (IOException ioe) {
181
                logMetacat.warn("Could not open or write to the sitemap file."
182
                        + ioe.getMessage());
183
            } catch (PropertyNotFoundException pnfe) {
184
                logMetacat.warn("Could not retrieve the account number separator."
185
                        + pnfe.getMessage());
186
            } finally {
187
                // Return database connection to the pool
188
                DBConnectionPool.returnDBConnection(dbConn, serialNumber);
189
            }
190
        } else {
191
            logMetacat.warn("Sitemap not created because directory not valid.");
192
        }
193
    }
194

    
195
    /**
196
     * Write the header information in a single sitemap file. This includes the
197
     * XML prolog, the root element and namespace declaration, and the elements
198
     * leading up to the first URL entry.
199
     * 
200
     * @param sitemap
201
     *            the Writer to use for writing the header
202
     * @throws IOException
203
     *             if there is a problem writing to the Writer
204
     */
205
    private void writeSitemapHeader(Writer sitemap) throws IOException {
206
        sitemap.write(PROLOG);
207
        String header = "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\"\n" +
208
                "xmlns:sm=\"http://www.sitemaps.org/schemas/sitemap/0.9\"\n" +
209
                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n" +
210
                "xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">\n";
211
        
212
        sitemap.write(header);
213
        sitemap.flush();
214
    }
215

    
216
    /**
217
     * Write a URL entry to a single sitemap file. This includes the XML markup
218
     * surrounding a particular site URL.
219
     * 
220
     * @param sitemap
221
     *            the Writer to use for writing the URL
222
     * @param docid
223
     *            the identifier to be written in the URL
224
     * @param urlRoot
225
     *            the base URL to be used in constructing a URL
226
     * @param skin
227
     *            the name of the skin to be used in constructing a URL
228
     * @throws IOException
229
     *             if there is a problem writing to the Writer
230
     */
231
    private void writeSitemapEntry(Writer sitemap, String docid)
232
            throws IOException {
233
        if (sitemap != null && docid != null && urlRoot != null) {
234
            StringBuffer url = new StringBuffer();
235
            url.append(urlRoot);
236
            if (!urlRoot.endsWith("/")) {
237
                url.append("/");
238
            }
239
            url.append(docid);
240
            if (skin != null) {
241
                url.append("/");
242
                url.append(skin);
243
            }
244
            sitemap.write("<url><loc>");
245
            sitemap.write(url.toString());
246
            sitemap.write("</loc>");
247
            // <lastmod>2005-01-01</lastmod>
248
            // <changefreq>monthly</changefreq>
249
            // <priority>0.8</priority>
250
            sitemap.write("</url>");
251
            sitemap.write("\n");
252
            sitemap.flush();
253
        }
254
    }
255

    
256
    /**
257
     * Write the footer information in a single sitemap file and close the file.
258
     * This includes the closing tag for the root element.
259
     * 
260
     * @param sitemap
261
     *            the Writer to use for writing the footer
262
     * @throws IOException
263
     *             if there is a problem writing to the Writer
264
     */
265
    private void writeSitemapFooter(Writer sitemap) throws IOException {
266
        if (sitemap != null)
267
        {
268
	    	String footer = "</urlset>\n";
269
	        sitemap.write(footer);
270
	        sitemap.close();
271
        }
272
    }
273

    
274
    /**
275
     * Create an index file listing all of the sitemap files that were created.
276
     * @param fileNumber the number of sitemap files that were created.
277
     */
278
    private void writeSitemapIndex(int fileNumber) {
279
        
280
        // Open a new sitemapIndex file for writing
281
        File sitemapIndexFile = null;
282
        OutputStreamWriter sitemapIndex = null;
283
        sitemapIndexFile = new File(directory, indexFilename);
284
        try {
285
            sitemapIndex = new OutputStreamWriter(new FileOutputStream(sitemapIndexFile), Charset.forName("UTF-8"));
286

    
287
            // Write the sitemap index header for the new file
288
            sitemapIndex.write(PROLOG);
289
            String header = "<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\"\n"
290
                    + "xmlns:sm=\"http://www.sitemaps.org/schemas/sitemap/0.9\"\n" + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n"
291
                    + "xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd\">\n";
292
            sitemapIndex.write(header);
293
            sitemapIndex.flush();
294

    
295
            // Write out one index entry for each sitemap file
296
            for (int fn = 1; fn <= fileNumber; fn++) {
297
                String filename = fileRoot + fileNumber + ".xml";
298
                writeSitemapIndexEntry(sitemapIndex, filename);
299
            }
300

    
301
            // Write the sitemap index footer content
302
            if (sitemapIndex != null) {
303
                String footer = "</sitemapindex>\n";
304
                sitemapIndex.write(footer);
305
                sitemapIndex.close();
306
            }
307

    
308
            // Close the index file
309
            if (sitemapIndex != null) {
310
                sitemapIndex.close();
311
            }
312

    
313
        } catch (IOException e) {
314
            logMetacat.warn("Could not open or write to the sitemap index file." + e.getMessage());
315
        }
316
    }
317
    
318
    /**
319
     * Write a single line of the sitemap index file containing the URL to a specific sitemap file.
320
     * @param sitemapIndex the writer to which the index information is written
321
     * @param filename the name of the index file to be used
322
     * @throws IOException on error writing to the index file 
323
     */
324
    private void writeSitemapIndexEntry(Writer sitemapIndex, String filename)
325
            throws IOException {
326
        if (sitemapIndex != null && filename != null && urlRoot != null) {
327
            StringBuffer url = new StringBuffer();
328
            url.append(urlRoot);
329
            if (!urlRoot.endsWith("/")) {
330
                url.append("/");
331
            }
332
            url.append(filename);
333
            sitemapIndex.write("<sitemap><loc>");
334
            sitemapIndex.write(url.toString());
335
            sitemapIndex.write("</loc>");
336
            Date now = new Date();
337
            SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd");
338
            sitemapIndex.write("<lastmod>"+ fmt.format(now) +"</lastmod>");
339
            sitemapIndex.write("</sitemap>");
340
            sitemapIndex.write("\n");
341
            sitemapIndex.flush();
342
        }
343
    }
344
    
345
    // Member variables
346

    
347
    /** The directory in which sitemaps are written. */
348
    private File directory;
349

    
350
    /** The root url for constructing sitemap URLs. */
351
    private String urlRoot;
352

    
353
    /** The name of the format skin to be used in sitemap URLs. */
354
    private String skin;
355

    
356
    /** Maximum number of URLs to write to a single sitemap file */
357
    static final int MAX_URLS_IN_FILE = 25000; // 50,000 according to Google
358

    
359
    /** The root name to be used in naming sitemap files. */
360
    static final String fileRoot = "metacat";
361
    
362
    /** The name to give to the sitemap index file */
363
    static final String indexFilename = "metacatSitemapIndex.xml";
364

    
365
    /** A String constant containing the XML prolog to be written in files. */
366
    static final String PROLOG = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n";
367
}
(60-60/64)