Project

General

Profile

1
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2007 Regents of the University of California and the
4
 *             National Center for Ecological Analysis and Synthesis
5
 *
6
 *   '$Author: daigle $'
7
 *     '$Date: 2008-08-05 17:33:45 -0700 (Tue, 05 Aug 2008) $'
8
 * '$Revision: 4212 $'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23
 */
24

    
25
package edu.ucsb.nceas.metacat;
26

    
27
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.TimerTask;

import org.apache.log4j.Logger;

import edu.ucsb.nceas.metacat.service.PropertyService;
import edu.ucsb.nceas.utilities.PropertyNotFoundException;
40

    
41
/**
42
 * A Sitemap represents a document that lists all of the content of the Metacat
43
 * server for use by harvesting spiders that wish to index the contents of the
44
 * Metacat site. It is used to generate an XML representation of all of the URLs
45
 * of the site in order to facilitate indexing of the metacat site by search
46
 * engines.
47
 * 
48
 * @author Matt Jones
49
 */
50
public class Sitemap extends TimerTask {
51

    
52
    /**
53
     * Construct a new instance of the Sitemap.
54
     * 
55
     * @param directory
56
     *            the location to store sitemap files
57
     * @param urlRoot
58
     *            the base URL for constructing sitemap URLs
59
     * @param skin
60
     *            the format skin to be used in URLs
61
     */
62
    public Sitemap(File directory, String urlRoot, String skin) {
63
        super();
64
        this.directory = directory;
65
        this.urlRoot = urlRoot;
66
        this.skin = skin;
67
    }
68

    
69
    /**
70
     * Execute the timed task when called, in this case by generating the
71
     * sitemap files needed for this Metacat instance.
72
     */
73
    public void run() {
74
        generateSitemaps();
75
    }
76

    
77
    /**
78
     * Generate all of the sitemap files needed to list the URLs from this
79
     * instance of Metacat, using the open sitemap format described here:
80
     * 
81
     * URLs are written to a single file, unless the maximum number of URLs
82
     * allowed inthe sitemap file is exceeded, in which subsequent numbered
83
     * files are created.
84
     * 
85
     * @param directory
86
     *            an existing File directory in which to write the sitemaps
87
     * @param urlRoot
88
     *            the base URL to use in constructing document URLs
89
     * @param skin
90
     *            the name of the skin to be used in formatting metacat
91
     *            documents
92
     */
93
    public void generateSitemaps() {
94
        Logger logMetacat = Logger.getLogger(Sitemap.class);
95

    
96
        logMetacat.info("Running the Sitemap task.");
97

    
98
        // Test if the passed in File is a directory
99
        if (directory.isDirectory()) {
100
            // Query xml_documents to get list of documents
101
            StringBuffer query = new StringBuffer();
102
            // TODO: make the doctype configurable in the query
103
            query.append("select docid, rev from xml_documents "
104
                    + "where doctype LIKE 'eml:%' " + "order by docid, rev");
105

    
106
            DBConnection dbConn = null;
107
            int serialNumber = -1;
108
            try {
109
                // Get a database connection from the pool
110
                dbConn = DBConnectionPool
111
                        .getDBConnection("Sitemap.generateSitemap()");
112
                serialNumber = dbConn.getCheckOutSerialNumber();
113

    
114
                // Execute the query statement
115
                PreparedStatement stmt = dbConn.prepareStatement(query
116
                        .toString());
117
                stmt.execute();
118
                ResultSet rs = stmt.getResultSet();
119

    
120
                // Loop through all of the documents, and write them to a
121
                // sitemap
122
                File sitemapFile = null;
123
                FileWriter sitemap = null;
124
                int counter = 0;
125
                int fileNumber = 0;
126
                while (rs.next()) {
127
                    // Check if a new sitemap file needs to be created
128
                    if (counter % MAX_URLS_IN_FILE == 0) {
129

    
130
                        // if a sitemap file is already open
131
                        if (sitemapFile != null && sitemapFile.canWrite()) {
132
                            // write the footer and close the file
133
                            writeSitemapFooter(sitemap);
134
                        }
135

    
136
                        // Open a new sitemap file for writing
137
                        fileNumber++;
138
                        sitemapFile = new File(directory, fileRoot + fileNumber
139
                                + ".xml");
140
                        sitemap = new FileWriter(sitemapFile);
141

    
142
                        // Write the sitemap document header for the new file
143
                        writeSitemapHeader(sitemap);
144
                    }
145

    
146
                    String separator = PropertyService.getProperty("document.accNumSeparator");
147
                    String docid = rs.getString(1) + separator
148
                            + rs.getString(2);
149
                    writeSitemapEntry(sitemap, docid);
150
                    counter++;
151
                }
152
                stmt.close();
153
                writeSitemapFooter(sitemap);
154
            } catch (SQLException e) {
155
                logMetacat.warn("Error while writing to the sitemap file: "
156
                        + e.getMessage());
157
            } catch (IOException ioe) {
158
                logMetacat.warn("Could not open or write to the sitemap file."
159
                        + ioe.getMessage());
160
            } catch (PropertyNotFoundException pnfe) {
161
                logMetacat.warn("Could not retrieve the account number separator."
162
                        + pnfe.getMessage());
163
            } finally {
164
                // Return database connection to the pool
165
                DBConnectionPool.returnDBConnection(dbConn, serialNumber);
166
            }
167
        } else {
168
            logMetacat.warn("Sitemap not created because directory not valid.");
169
        }
170
    }
171

    
172
    /**
173
     * Write the header information in a single sitemap file. This includes the
174
     * XML prolog, the root element and namespace declaration, and the elements
175
     * leading up to the first URL entry.
176
     * 
177
     * @param sitemap
178
     *            the Writer to use for writing the header
179
     * @throws IOException
180
     *             if there is a problem writing to the Writer
181
     */
182
    private void writeSitemapHeader(Writer sitemap) throws IOException {
183
        sitemap.write(PROLOG);
184
        String header = "<urlset xmlns=\"http://www.google.com/schemas/sitemap/0.84\">\n";
185
        sitemap.write(header);
186
        sitemap.flush();
187
    }
188

    
189
    /**
190
     * Write a URL entry to a single sitemap file. This includes the XML markup
191
     * surrounding a particular site URL.
192
     * 
193
     * @param sitemap
194
     *            the Writer to use for writing the URL
195
     * @param docid
196
     *            the identifier to be written in the URL
197
     * @param urlRoot
198
     *            the base URL to be used in constructing a URL
199
     * @param skin
200
     *            the name of the skin to be used in constructing a URL
201
     * @throws IOException
202
     *             if there is a problem writing to the Writer
203
     */
204
    private void writeSitemapEntry(Writer sitemap, String docid)
205
            throws IOException {
206
        if (sitemap != null && docid != null && urlRoot != null) {
207
            StringBuffer url = new StringBuffer();
208
            url.append(urlRoot);
209
            if (!urlRoot.endsWith("/")) {
210
                url.append("/");
211
            }
212
            url.append(docid);
213
            if (skin != null) {
214
                url.append("/");
215
                url.append(skin);
216
            }
217
            sitemap.write("<url><loc>");
218
            sitemap.write(url.toString());
219
            sitemap.write("</loc>");
220
            // <lastmod>2005-01-01</lastmod>
221
            // <changefreq>monthly</changefreq>
222
            // <priority>0.8</priority>
223
            sitemap.write("</url>");
224
            sitemap.write("\n");
225
            sitemap.flush();
226
        }
227
    }
228

    
229
    /**
230
     * Write the footer information in a single sitemap file and close the file.
231
     * This includes the closing tag for the root element.
232
     * 
233
     * @param sitemap
234
     *            the Writer to use for writing the footer
235
     * @throws IOException
236
     *             if there is a problem writing to the Writer
237
     */
238
    private void writeSitemapFooter(Writer sitemap) throws IOException {
239
        if (sitemap != null)
240
        {
241
	    	String footer = "</urlset>\n";
242
	        sitemap.write(footer);
243
	        sitemap.close();
244
        }
245
    }
246

    
247
    // Member variables
248

    
249
    /** The directory in which sitemaps are written. */
250
    private File directory;
251

    
252
    /** The root url for constructing sitemap URLs. */
253
    private String urlRoot;
254

    
255
    /** The name of the format skin to be used in sitemap URLs. */
256
    private String skin;
257

    
258
    /** Maximum number of URLs to write to a single sitemap file */
259
    static final int MAX_URLS_IN_FILE = 25000; // 50,000 according to Google
260

    
261
    /** The root name to be used in naming sitemap files. */
262
    static final String fileRoot = "metacat";
263

    
264
    /** A String constant containing the XML prolog to be written in files. */
265
    static final String PROLOG = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n";
266
}
(64-64/67)