/**
 *  '$RCSfile$'
 *  Copyright: 2007 Regents of the University of California and the
 *             National Center for Ecological Analysis and Synthesis
 *
 *   '$Author$'
 *     '$Date$'
 * '$Revision$'
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package edu.ucsb.nceas.metacat;
26
27
import java.io.File;
28 7764 jones
import java.io.FileOutputStream;
29 3244 jones
import java.io.IOException;
30 7764 jones
import java.io.OutputStreamWriter;
31 3244 jones
import java.io.Writer;
32 7764 jones
import java.nio.charset.Charset;
33 3244 jones
import java.sql.PreparedStatement;
34
import java.sql.ResultSet;
35
import java.sql.SQLException;
36
import java.util.TimerTask;
37
38
import org.apache.log4j.Logger;
39
40 5015 daigle
import edu.ucsb.nceas.metacat.database.DBConnection;
41
import edu.ucsb.nceas.metacat.database.DBConnectionPool;
42 5030 daigle
import edu.ucsb.nceas.metacat.properties.PropertyService;
43 4080 daigle
import edu.ucsb.nceas.utilities.PropertyNotFoundException;
44
45 3244 jones
/**
46
 * A Sitemap represents a document that lists all of the content of the Metacat
47
 * server for use by harvesting spiders that wish to index the contents of the
48
 * Metacat site. It is used to generate an XML representation of all of the URLs
49
 * of the site in order to facilitate indexing of the metacat site by search
50
 * engines.
51
 *
52
 * @author Matt Jones
53
 */
54
public class Sitemap extends TimerTask {
55 4734 daigle
56
	private static Logger logMetacat = Logger.getLogger(Sitemap.class);
57 3244 jones
58
    /**
59
     * Construct a new instance of the Sitemap.
60
     *
61
     * @param directory
62
     *            the location to store sitemap files
63
     * @param urlRoot
64
     *            the base URL for constructing sitemap URLs
65
     * @param skin
66
     *            the format skin to be used in URLs
67
     */
68
    public Sitemap(File directory, String urlRoot, String skin) {
69
        super();
70
        this.directory = directory;
71
        this.urlRoot = urlRoot;
72
        this.skin = skin;
73
    }
74
75
    /**
76
     * Execute the timed task when called, in this case by generating the
77
     * sitemap files needed for this Metacat instance.
78
     */
79
    public void run() {
80
        generateSitemaps();
81
    }
82
83
    /**
84
     * Generate all of the sitemap files needed to list the URLs from this
85
     * instance of Metacat, using the open sitemap format described here:
86
     *
87
     * URLs are written to a single file, unless the maximum number of URLs
88
     * allowed inthe sitemap file is exceeded, in which subsequent numbered
89
     * files are created.
90
     *
91
     * @param directory
92
     *            an existing File directory in which to write the sitemaps
93
     * @param urlRoot
94
     *            the base URL to use in constructing document URLs
95
     * @param skin
96
     *            the name of the skin to be used in formatting metacat
97
     *            documents
98
     */
99
    public void generateSitemaps() {
100
101
        logMetacat.info("Running the Sitemap task.");
102
103
        // Test if the passed in File is a directory
104
        if (directory.isDirectory()) {
105
            // Query xml_documents to get list of documents
106
            StringBuffer query = new StringBuffer();
107
            // TODO: make the doctype configurable in the query
108 4511 daigle
            String sql =
109
            	"SELECT xml_documents.docid, xml_documents.rev " +
110 6744 leinfelder
            	"FROM xml_documents, xml_access, identifier " +
111 4511 daigle
                "WHERE xml_documents.doctype LIKE 'eml:%' " +
112 6744 leinfelder
                "AND xml_documents.docid = identifier.docid " +
113
                "AND xml_documents.rev = identifier.rev " +
114
                "AND identifier.guid = xml_access.guid " +
115 4511 daigle
                "AND xml_access.principal_name = 'public' " +
116
                "AND xml_access.perm_type = 'allow' " +
117
                "order by docid, rev";
118
            query.append(sql);
119 3244 jones
120
            DBConnection dbConn = null;
121
            int serialNumber = -1;
122
            try {
123
                // Get a database connection from the pool
124
                dbConn = DBConnectionPool
125
                        .getDBConnection("Sitemap.generateSitemap()");
126
                serialNumber = dbConn.getCheckOutSerialNumber();
127
128
                // Execute the query statement
129 6595 leinfelder
                PreparedStatement stmt = dbConn.prepareStatement(query.toString());
130 3244 jones
                stmt.execute();
131
                ResultSet rs = stmt.getResultSet();
132
133
                // Loop through all of the documents, and write them to a
134
                // sitemap
135
                File sitemapFile = null;
136 7764 jones
                OutputStreamWriter sitemap = null;
137 3244 jones
                int counter = 0;
138
                int fileNumber = 0;
139
                while (rs.next()) {
140
                    // Check if a new sitemap file needs to be created
141
                    if (counter % MAX_URLS_IN_FILE == 0) {
142
143
                        // if a sitemap file is already open
144
                        if (sitemapFile != null && sitemapFile.canWrite()) {
145
                            // write the footer and close the file
146
                            writeSitemapFooter(sitemap);
147
                        }
148
149
                        // Open a new sitemap file for writing
150
                        fileNumber++;
151
                        sitemapFile = new File(directory, fileRoot + fileNumber
152
                                + ".xml");
153 7764 jones
                        sitemap = new OutputStreamWriter(new FileOutputStream(sitemapFile), Charset.forName("UTF-8"));
154 3244 jones
155
                        // Write the sitemap document header for the new file
156
                        writeSitemapHeader(sitemap);
157
                    }
158
159 4212 daigle
                    String separator = PropertyService.getProperty("document.accNumSeparator");
160 3244 jones
                    String docid = rs.getString(1) + separator
161
                            + rs.getString(2);
162
                    writeSitemapEntry(sitemap, docid);
163
                    counter++;
164
                }
165
                stmt.close();
166
                writeSitemapFooter(sitemap);
167 7764 jones
                writeSitemapIndex(fileNumber);
168 3244 jones
            } catch (SQLException e) {
169
                logMetacat.warn("Error while writing to the sitemap file: "
170
                        + e.getMessage());
171
            } catch (IOException ioe) {
172
                logMetacat.warn("Could not open or write to the sitemap file."
173
                        + ioe.getMessage());
174 4080 daigle
            } catch (PropertyNotFoundException pnfe) {
175
                logMetacat.warn("Could not retrieve the account number separator."
176
                        + pnfe.getMessage());
177 3244 jones
            } finally {
178
                // Return database connection to the pool
179
                DBConnectionPool.returnDBConnection(dbConn, serialNumber);
180
            }
181
        } else {
182
            logMetacat.warn("Sitemap not created because directory not valid.");
183
        }
184
    }
185
186
    /**
187
     * Write the header information in a single sitemap file. This includes the
188
     * XML prolog, the root element and namespace declaration, and the elements
189
     * leading up to the first URL entry.
190
     *
191
     * @param sitemap
192
     *            the Writer to use for writing the header
193
     * @throws IOException
194
     *             if there is a problem writing to the Writer
195
     */
196
    private void writeSitemapHeader(Writer sitemap) throws IOException {
197
        sitemap.write(PROLOG);
198 7735 jones
        String header = "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\"\n" +
199
                "xmlns:sm=\"http://www.sitemaps.org/schemas/sitemap/0.9\"\n" +
200
                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n" +
201 7764 jones
                "xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">\n";
202 7735 jones
203 3244 jones
        sitemap.write(header);
204
        sitemap.flush();
205
    }
206
207
    /**
208
     * Write a URL entry to a single sitemap file. This includes the XML markup
209
     * surrounding a particular site URL.
210
     *
211
     * @param sitemap
212
     *            the Writer to use for writing the URL
213
     * @param docid
214
     *            the identifier to be written in the URL
215
     * @param urlRoot
216
     *            the base URL to be used in constructing a URL
217
     * @param skin
218
     *            the name of the skin to be used in constructing a URL
219
     * @throws IOException
220
     *             if there is a problem writing to the Writer
221
     */
222
    private void writeSitemapEntry(Writer sitemap, String docid)
223
            throws IOException {
224
        if (sitemap != null && docid != null && urlRoot != null) {
225
            StringBuffer url = new StringBuffer();
226
            url.append(urlRoot);
227
            if (!urlRoot.endsWith("/")) {
228
                url.append("/");
229
            }
230
            url.append(docid);
231
            if (skin != null) {
232
                url.append("/");
233
                url.append(skin);
234
            }
235
            sitemap.write("<url><loc>");
236
            sitemap.write(url.toString());
237
            sitemap.write("</loc>");
238
            // <lastmod>2005-01-01</lastmod>
239
            // <changefreq>monthly</changefreq>
240
            // <priority>0.8</priority>
241
            sitemap.write("</url>");
242
            sitemap.write("\n");
243
            sitemap.flush();
244
        }
245
    }
246
247
    /**
248
     * Write the footer information in a single sitemap file and close the file.
249
     * This includes the closing tag for the root element.
250
     *
251
     * @param sitemap
252
     *            the Writer to use for writing the footer
253
     * @throws IOException
254
     *             if there is a problem writing to the Writer
255
     */
256
    private void writeSitemapFooter(Writer sitemap) throws IOException {
257 3250 tao
        if (sitemap != null)
258
        {
259
	    	String footer = "</urlset>\n";
260
	        sitemap.write(footer);
261
	        sitemap.close();
262
        }
263 3244 jones
    }
264
265 7764 jones
    /**
266
     * Create an index file listing all of the sitemap files that were created.
267
     * @param fileNumber the number of sitemap files that were created.
268
     */
269
    private void writeSitemapIndex(int fileNumber) {
270
271
        // Open a new sitemapIndex file for writing
272
        File sitemapIndexFile = null;
273
        OutputStreamWriter sitemapIndex = null;
274
        sitemapIndexFile = new File(directory, indexFilename);
275
        try {
276
            sitemapIndex = new OutputStreamWriter(new FileOutputStream(sitemapIndexFile), Charset.forName("UTF-8"));
277
278
            // Write the sitemap index header for the new file
279
            sitemapIndex.write(PROLOG);
280
            String header = "<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\"\n"
281
                    + "xmlns:sm=\"http://www.sitemaps.org/schemas/sitemap/0.9\"\n" + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n"
282
                    + "xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd\">\n";
283
            sitemapIndex.write(header);
284
            sitemapIndex.flush();
285
286
            // Write out one index entry for each sitemap file
287
            for (int fn = 1; fn <= fileNumber; fn++) {
288
                String filename = fileRoot + fileNumber + ".xml";
289
                writeSitemapIndexEntry(sitemapIndex, filename);
290
            }
291
292
            // Write the sitemap index footer content
293
            if (sitemapIndex != null) {
294
                String footer = "</sitemapindex>\n";
295
                sitemapIndex.write(footer);
296
                sitemapIndex.close();
297
            }
298
299
            // Close the index file
300
            if (sitemapIndex != null) {
301
                sitemapIndex.close();
302
            }
303
304
        } catch (IOException e) {
305
            logMetacat.warn("Could not open or write to the sitemap index file." + e.getMessage());
306
        }
307
    }
308
309
    /**
310
     * Write a single line of the sitemap index file containing the URL to a specific sitemap file.
311
     * @param sitemapIndex the writer to which the index information is written
312
     * @param filename the name of the index file to be used
313
     * @throws IOException on error writing to the index file
314
     */
315
    private void writeSitemapIndexEntry(Writer sitemapIndex, String filename)
316
            throws IOException {
317
        if (sitemapIndex != null && filename != null && urlRoot != null) {
318
            StringBuffer url = new StringBuffer();
319
            url.append(urlRoot);
320
            if (!urlRoot.endsWith("/")) {
321
                url.append("/");
322
            }
323
            url.append(filename);
324
            sitemapIndex.write("<sitemap><loc>");
325
            sitemapIndex.write(url.toString());
326
            sitemapIndex.write("</loc>");
327
            sitemapIndex.write("</sitemap>");
328
            sitemapIndex.write("\n");
329
            sitemapIndex.flush();
330
        }
331
    }
332
333 3244 jones
    // Member variables
334
335
    /** The directory in which sitemaps are written. */
336
    private File directory;
337
338
    /** The root url for constructing sitemap URLs. */
339
    private String urlRoot;
340
341
    /** The name of the format skin to be used in sitemap URLs. */
342
    private String skin;
343
344
    /** Maximum number of URLs to write to a single sitemap file */
345
    static final int MAX_URLS_IN_FILE = 25000; // 50,000 according to Google
346
347
    /** The root name to be used in naming sitemap files. */
348
    static final String fileRoot = "metacat";
349 7764 jones
350
    /** The name to give to the sitemap index file */
351
    static final String indexFilename = "metacatSitemapIndex.xml";
352 3244 jones
353
    /** A String constant containing the XML prolog to be written in files. */
354
    static final String PROLOG = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n";
355
}