Project

General

Profile

1
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2007 Regents of the University of California and the
4
 *             National Center for Ecological Analysis and Synthesis
5
 *
6
 *   '$Author: jones $'
7
 *     '$Date: 2007-04-18 02:26:24 -0700 (Wed, 18 Apr 2007) $'
8
 * '$Revision: 3244 $'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23
 */
24

    
25
package edu.ucsb.nceas.metacat;
26

    
27
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.TimerTask;

import org.apache.log4j.Logger;
37

    
38
/**
39
 * A Sitemap represents a document that lists all of the content of the Metacat
40
 * server for use by harvesting spiders that wish to index the contents of the
41
 * Metacat site. It is used to generate an XML representation of all of the URLs
42
 * of the site in order to facilitate indexing of the metacat site by search
43
 * engines.
44
 * 
45
 * @author Matt Jones
46
 */
47
public class Sitemap extends TimerTask {
48

    
49
    /**
50
     * Construct a new instance of the Sitemap.
51
     * 
52
     * @param directory
53
     *            the location to store sitemap files
54
     * @param urlRoot
55
     *            the base URL for constructing sitemap URLs
56
     * @param skin
57
     *            the format skin to be used in URLs
58
     */
59
    public Sitemap(File directory, String urlRoot, String skin) {
60
        super();
61
        this.directory = directory;
62
        this.urlRoot = urlRoot;
63
        this.skin = skin;
64
    }
65

    
66
    /**
67
     * Execute the timed task when called, in this case by generating the
68
     * sitemap files needed for this Metacat instance.
69
     */
70
    public void run() {
71
        generateSitemaps();
72
    }
73

    
74
    /**
75
     * Generate all of the sitemap files needed to list the URLs from this
76
     * instance of Metacat, using the open sitemap format described here:
77
     * 
78
     * URLs are written to a single file, unless the maximum number of URLs
79
     * allowed inthe sitemap file is exceeded, in which subsequent numbered
80
     * files are created.
81
     * 
82
     * @param directory
83
     *            an existing File directory in which to write the sitemaps
84
     * @param urlRoot
85
     *            the base URL to use in constructing document URLs
86
     * @param skin
87
     *            the name of the skin to be used in formatting metacat
88
     *            documents
89
     */
90
    public void generateSitemaps() {
91
        Logger logMetacat = Logger.getLogger(Sitemap.class);
92

    
93
        logMetacat.info("Running the Sitemap task.");
94

    
95
        // Test if the passed in File is a directory
96
        if (directory.isDirectory()) {
97
            // Query xml_documents to get list of documents
98
            StringBuffer query = new StringBuffer();
99
            // TODO: make the doctype configurable in the query
100
            query.append("select docid, rev from xml_documents "
101
                    + "where doctype LIKE 'eml:%' " + "order by docid, rev");
102

    
103
            DBConnection dbConn = null;
104
            int serialNumber = -1;
105
            try {
106
                // Get a database connection from the pool
107
                dbConn = DBConnectionPool
108
                        .getDBConnection("Sitemap.generateSitemap()");
109
                serialNumber = dbConn.getCheckOutSerialNumber();
110

    
111
                // Execute the query statement
112
                PreparedStatement stmt = dbConn.prepareStatement(query
113
                        .toString());
114
                stmt.execute();
115
                ResultSet rs = stmt.getResultSet();
116

    
117
                // Loop through all of the documents, and write them to a
118
                // sitemap
119
                File sitemapFile = null;
120
                FileWriter sitemap = null;
121
                int counter = 0;
122
                int fileNumber = 0;
123
                while (rs.next()) {
124
                    // Check if a new sitemap file needs to be created
125
                    if (counter % MAX_URLS_IN_FILE == 0) {
126

    
127
                        // if a sitemap file is already open
128
                        if (sitemapFile != null && sitemapFile.canWrite()) {
129
                            // write the footer and close the file
130
                            writeSitemapFooter(sitemap);
131
                        }
132

    
133
                        // Open a new sitemap file for writing
134
                        fileNumber++;
135
                        sitemapFile = new File(directory, fileRoot + fileNumber
136
                                + ".xml");
137
                        sitemap = new FileWriter(sitemapFile);
138

    
139
                        // Write the sitemap document header for the new file
140
                        writeSitemapHeader(sitemap);
141
                    }
142

    
143
                    String separator = MetaCatUtil.getOption("accNumSeparator");
144
                    String docid = rs.getString(1) + separator
145
                            + rs.getString(2);
146
                    writeSitemapEntry(sitemap, docid);
147
                    counter++;
148
                }
149
                stmt.close();
150
                writeSitemapFooter(sitemap);
151
            } catch (SQLException e) {
152
                logMetacat.warn("Error while writing to the sitemap file: "
153
                        + e.getMessage());
154
            } catch (IOException ioe) {
155
                logMetacat.warn("Could not open or write to the sitemap file."
156
                        + ioe.getMessage());
157
            } finally {
158
                // Return database connection to the pool
159
                DBConnectionPool.returnDBConnection(dbConn, serialNumber);
160
            }
161
        } else {
162
            logMetacat.warn("Sitemap not created because directory not valid.");
163
        }
164
    }
165

    
166
    /**
167
     * Write the header information in a single sitemap file. This includes the
168
     * XML prolog, the root element and namespace declaration, and the elements
169
     * leading up to the first URL entry.
170
     * 
171
     * @param sitemap
172
     *            the Writer to use for writing the header
173
     * @throws IOException
174
     *             if there is a problem writing to the Writer
175
     */
176
    private void writeSitemapHeader(Writer sitemap) throws IOException {
177
        sitemap.write(PROLOG);
178
        String header = "<urlset xmlns=\"http://www.google.com/schemas/sitemap/0.84\">\n";
179
        sitemap.write(header);
180
        sitemap.flush();
181
    }
182

    
183
    /**
184
     * Write a URL entry to a single sitemap file. This includes the XML markup
185
     * surrounding a particular site URL.
186
     * 
187
     * @param sitemap
188
     *            the Writer to use for writing the URL
189
     * @param docid
190
     *            the identifier to be written in the URL
191
     * @param urlRoot
192
     *            the base URL to be used in constructing a URL
193
     * @param skin
194
     *            the name of the skin to be used in constructing a URL
195
     * @throws IOException
196
     *             if there is a problem writing to the Writer
197
     */
198
    private void writeSitemapEntry(Writer sitemap, String docid)
199
            throws IOException {
200
        if (sitemap != null && docid != null && urlRoot != null) {
201
            StringBuffer url = new StringBuffer();
202
            url.append(urlRoot);
203
            if (!urlRoot.endsWith("/")) {
204
                url.append("/");
205
            }
206
            url.append(docid);
207
            if (skin != null) {
208
                url.append("/");
209
                url.append(skin);
210
            }
211
            sitemap.write("<url><loc>");
212
            sitemap.write(url.toString());
213
            sitemap.write("</loc>");
214
            // <lastmod>2005-01-01</lastmod>
215
            // <changefreq>monthly</changefreq>
216
            // <priority>0.8</priority>
217
            sitemap.write("</url>");
218
            sitemap.write("\n");
219
            sitemap.flush();
220
        }
221
    }
222

    
223
    /**
224
     * Write the footer information in a single sitemap file and close the file.
225
     * This includes the closing tag for the root element.
226
     * 
227
     * @param sitemap
228
     *            the Writer to use for writing the footer
229
     * @throws IOException
230
     *             if there is a problem writing to the Writer
231
     */
232
    private void writeSitemapFooter(Writer sitemap) throws IOException {
233
        String footer = "</urlset>\n";
234
        sitemap.write(footer);
235
        sitemap.close();
236
    }
237

    
238
    // Member variables
239

    
240
    /** The directory in which sitemaps are written. */
241
    private File directory;
242

    
243
    /** The root url for constructing sitemap URLs. */
244
    private String urlRoot;
245

    
246
    /** The name of the format skin to be used in sitemap URLs. */
247
    private String skin;
248

    
249
    /** Maximum number of URLs to write to a single sitemap file */
250
    static final int MAX_URLS_IN_FILE = 25000; // 50,000 according to Google
251

    
252
    /** The root name to be used in naming sitemap files. */
253
    static final String fileRoot = "metacat";
254

    
255
    /** A String constant containing the XML prolog to be written in files. */
256
    static final String PROLOG = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n";
257
}
(64-64/67)