Project

General

Profile

1
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2007 Regents of the University of California and the
4
 *             National Center for Ecological Analysis and Synthesis
5
 *
6
 *   '$Author: daigle $'
7
 *     '$Date: 2009-01-08 09:45:40 -0800 (Thu, 08 Jan 2009) $'
8
 * '$Revision: 4734 $'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23
 */
24

    
25
package edu.ucsb.nceas.metacat;
26

    
27
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.TimerTask;

import org.apache.log4j.Logger;

import edu.ucsb.nceas.metacat.service.PropertyService;
import edu.ucsb.nceas.utilities.PropertyNotFoundException;
40

    
41
/**
42
 * A Sitemap represents a document that lists all of the content of the Metacat
43
 * server for use by harvesting spiders that wish to index the contents of the
44
 * Metacat site. It is used to generate an XML representation of all of the URLs
45
 * of the site in order to facilitate indexing of the metacat site by search
46
 * engines.
47
 * 
48
 * @author Matt Jones
49
 */
50
public class Sitemap extends TimerTask {
51
	
52
	private static Logger logMetacat = Logger.getLogger(Sitemap.class);
53

    
54
    /**
55
     * Construct a new instance of the Sitemap.
56
     * 
57
     * @param directory
58
     *            the location to store sitemap files
59
     * @param urlRoot
60
     *            the base URL for constructing sitemap URLs
61
     * @param skin
62
     *            the format skin to be used in URLs
63
     */
64
    public Sitemap(File directory, String urlRoot, String skin) {
65
        super();
66
        this.directory = directory;
67
        this.urlRoot = urlRoot;
68
        this.skin = skin;
69
    }
70

    
71
    /**
72
     * Execute the timed task when called, in this case by generating the
73
     * sitemap files needed for this Metacat instance.
74
     */
75
    public void run() {
76
        generateSitemaps();
77
    }
78

    
79
    /**
80
     * Generate all of the sitemap files needed to list the URLs from this
81
     * instance of Metacat, using the open sitemap format described here:
82
     * 
83
     * URLs are written to a single file, unless the maximum number of URLs
84
     * allowed inthe sitemap file is exceeded, in which subsequent numbered
85
     * files are created.
86
     * 
87
     * @param directory
88
     *            an existing File directory in which to write the sitemaps
89
     * @param urlRoot
90
     *            the base URL to use in constructing document URLs
91
     * @param skin
92
     *            the name of the skin to be used in formatting metacat
93
     *            documents
94
     */
95
    public void generateSitemaps() {
96

    
97
        logMetacat.info("Running the Sitemap task.");
98

    
99
        // Test if the passed in File is a directory
100
        if (directory.isDirectory()) {
101
            // Query xml_documents to get list of documents
102
            StringBuffer query = new StringBuffer();
103
            // TODO: make the doctype configurable in the query
104
            String sql =
105
            	"SELECT xml_documents.docid, xml_documents.rev " +
106
            	"FROM xml_documents, xml_access " +
107
                "WHERE xml_documents.doctype LIKE 'eml:%' " + 
108
                "AND xml_documents.docid = xml_access.docid " +
109
                "AND xml_access.principal_name = 'public' " +
110
                "AND xml_access.perm_type = 'allow' " +
111
                "order by docid, rev";
112
            query.append(sql);
113

    
114
            DBConnection dbConn = null;
115
            int serialNumber = -1;
116
            try {
117
                // Get a database connection from the pool
118
                dbConn = DBConnectionPool
119
                        .getDBConnection("Sitemap.generateSitemap()");
120
                serialNumber = dbConn.getCheckOutSerialNumber();
121

    
122
                // Execute the query statement
123
                PreparedStatement stmt = dbConn.prepareStatement(query
124
                        .toString());
125
                stmt.execute();
126
                ResultSet rs = stmt.getResultSet();
127

    
128
                // Loop through all of the documents, and write them to a
129
                // sitemap
130
                File sitemapFile = null;
131
                FileWriter sitemap = null;
132
                int counter = 0;
133
                int fileNumber = 0;
134
                while (rs.next()) {
135
                    // Check if a new sitemap file needs to be created
136
                    if (counter % MAX_URLS_IN_FILE == 0) {
137

    
138
                        // if a sitemap file is already open
139
                        if (sitemapFile != null && sitemapFile.canWrite()) {
140
                            // write the footer and close the file
141
                            writeSitemapFooter(sitemap);
142
                        }
143

    
144
                        // Open a new sitemap file for writing
145
                        fileNumber++;
146
                        sitemapFile = new File(directory, fileRoot + fileNumber
147
                                + ".xml");
148
                        sitemap = new FileWriter(sitemapFile);
149

    
150
                        // Write the sitemap document header for the new file
151
                        writeSitemapHeader(sitemap);
152
                    }
153

    
154
                    String separator = PropertyService.getProperty("document.accNumSeparator");
155
                    String docid = rs.getString(1) + separator
156
                            + rs.getString(2);
157
                    writeSitemapEntry(sitemap, docid);
158
                    counter++;
159
                }
160
                stmt.close();
161
                writeSitemapFooter(sitemap);
162
            } catch (SQLException e) {
163
                logMetacat.warn("Error while writing to the sitemap file: "
164
                        + e.getMessage());
165
            } catch (IOException ioe) {
166
                logMetacat.warn("Could not open or write to the sitemap file."
167
                        + ioe.getMessage());
168
            } catch (PropertyNotFoundException pnfe) {
169
                logMetacat.warn("Could not retrieve the account number separator."
170
                        + pnfe.getMessage());
171
            } finally {
172
                // Return database connection to the pool
173
                DBConnectionPool.returnDBConnection(dbConn, serialNumber);
174
            }
175
        } else {
176
            logMetacat.warn("Sitemap not created because directory not valid.");
177
        }
178
    }
179

    
180
    /**
181
     * Write the header information in a single sitemap file. This includes the
182
     * XML prolog, the root element and namespace declaration, and the elements
183
     * leading up to the first URL entry.
184
     * 
185
     * @param sitemap
186
     *            the Writer to use for writing the header
187
     * @throws IOException
188
     *             if there is a problem writing to the Writer
189
     */
190
    private void writeSitemapHeader(Writer sitemap) throws IOException {
191
        sitemap.write(PROLOG);
192
        String header = "<urlset xmlns=\"http://www.google.com/schemas/sitemap/0.84\">\n";
193
        sitemap.write(header);
194
        sitemap.flush();
195
    }
196

    
197
    /**
198
     * Write a URL entry to a single sitemap file. This includes the XML markup
199
     * surrounding a particular site URL.
200
     * 
201
     * @param sitemap
202
     *            the Writer to use for writing the URL
203
     * @param docid
204
     *            the identifier to be written in the URL
205
     * @param urlRoot
206
     *            the base URL to be used in constructing a URL
207
     * @param skin
208
     *            the name of the skin to be used in constructing a URL
209
     * @throws IOException
210
     *             if there is a problem writing to the Writer
211
     */
212
    private void writeSitemapEntry(Writer sitemap, String docid)
213
            throws IOException {
214
        if (sitemap != null && docid != null && urlRoot != null) {
215
            StringBuffer url = new StringBuffer();
216
            url.append(urlRoot);
217
            if (!urlRoot.endsWith("/")) {
218
                url.append("/");
219
            }
220
            url.append(docid);
221
            if (skin != null) {
222
                url.append("/");
223
                url.append(skin);
224
            }
225
            sitemap.write("<url><loc>");
226
            sitemap.write(url.toString());
227
            sitemap.write("</loc>");
228
            // <lastmod>2005-01-01</lastmod>
229
            // <changefreq>monthly</changefreq>
230
            // <priority>0.8</priority>
231
            sitemap.write("</url>");
232
            sitemap.write("\n");
233
            sitemap.flush();
234
        }
235
    }
236

    
237
    /**
238
     * Write the footer information in a single sitemap file and close the file.
239
     * This includes the closing tag for the root element.
240
     * 
241
     * @param sitemap
242
     *            the Writer to use for writing the footer
243
     * @throws IOException
244
     *             if there is a problem writing to the Writer
245
     */
246
    private void writeSitemapFooter(Writer sitemap) throws IOException {
247
        if (sitemap != null)
248
        {
249
	    	String footer = "</urlset>\n";
250
	        sitemap.write(footer);
251
	        sitemap.close();
252
        }
253
    }
254

    
255
    // Member variables
256

    
257
    /** The directory in which sitemaps are written. */
258
    private File directory;
259

    
260
    /** The root url for constructing sitemap URLs. */
261
    private String urlRoot;
262

    
263
    /** The name of the format skin to be used in sitemap URLs. */
264
    private String skin;
265

    
266
    /** Maximum number of URLs to write to a single sitemap file */
267
    static final int MAX_URLS_IN_FILE = 25000; // 50,000 according to Google
268

    
269
    /** The root name to be used in naming sitemap files. */
270
    static final String fileRoot = "metacat";
271

    
272
    /** A String constant containing the XML prolog to be written in files. */
273
    static final String PROLOG = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n";
274
}
(66-66/69)