Project

General

Profile

1
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2007 Regents of the University of California and the
4
 *             National Center for Ecological Analysis and Synthesis
5
 *
6
 *   '$Author: daigle $'
7
 *     '$Date: 2008-11-04 16:04:54 -0800 (Tue, 04 Nov 2008) $'
8
 * '$Revision: 4511 $'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23
 */
24

    
25
package edu.ucsb.nceas.metacat;
26

    
27
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.TimerTask;

import org.apache.log4j.Logger;

import edu.ucsb.nceas.metacat.service.PropertyService;
import edu.ucsb.nceas.utilities.PropertyNotFoundException;
40

    
41
/**
42
 * A Sitemap represents a document that lists all of the content of the Metacat
43
 * server for use by harvesting spiders that wish to index the contents of the
44
 * Metacat site. It is used to generate an XML representation of all of the URLs
45
 * of the site in order to facilitate indexing of the metacat site by search
46
 * engines.
47
 * 
48
 * @author Matt Jones
49
 */
50
public class Sitemap extends TimerTask {
51

    
52
    /**
53
     * Construct a new instance of the Sitemap.
54
     * 
55
     * @param directory
56
     *            the location to store sitemap files
57
     * @param urlRoot
58
     *            the base URL for constructing sitemap URLs
59
     * @param skin
60
     *            the format skin to be used in URLs
61
     */
62
    public Sitemap(File directory, String urlRoot, String skin) {
63
        super();
64
        this.directory = directory;
65
        this.urlRoot = urlRoot;
66
        this.skin = skin;
67
    }
68

    
69
    /**
70
     * Execute the timed task when called, in this case by generating the
71
     * sitemap files needed for this Metacat instance.
72
     */
73
    public void run() {
74
        generateSitemaps();
75
    }
76

    
77
    /**
78
     * Generate all of the sitemap files needed to list the URLs from this
79
     * instance of Metacat, using the open sitemap format described here:
80
     * 
81
     * URLs are written to a single file, unless the maximum number of URLs
82
     * allowed inthe sitemap file is exceeded, in which subsequent numbered
83
     * files are created.
84
     * 
85
     * @param directory
86
     *            an existing File directory in which to write the sitemaps
87
     * @param urlRoot
88
     *            the base URL to use in constructing document URLs
89
     * @param skin
90
     *            the name of the skin to be used in formatting metacat
91
     *            documents
92
     */
93
    public void generateSitemaps() {
94
        Logger logMetacat = Logger.getLogger(Sitemap.class);
95

    
96
        logMetacat.info("Running the Sitemap task.");
97

    
98
        // Test if the passed in File is a directory
99
        if (directory.isDirectory()) {
100
            // Query xml_documents to get list of documents
101
            StringBuffer query = new StringBuffer();
102
            // TODO: make the doctype configurable in the query
103
            String sql =
104
            	"SELECT xml_documents.docid, xml_documents.rev " +
105
            	"FROM xml_documents, xml_access " +
106
                "WHERE xml_documents.doctype LIKE 'eml:%' " + 
107
                "AND xml_documents.docid = xml_access.docid " +
108
                "AND xml_access.principal_name = 'public' " +
109
                "AND xml_access.perm_type = 'allow' " +
110
                "order by docid, rev";
111
            query.append(sql);
112

    
113
            DBConnection dbConn = null;
114
            int serialNumber = -1;
115
            try {
116
                // Get a database connection from the pool
117
                dbConn = DBConnectionPool
118
                        .getDBConnection("Sitemap.generateSitemap()");
119
                serialNumber = dbConn.getCheckOutSerialNumber();
120

    
121
                // Execute the query statement
122
                PreparedStatement stmt = dbConn.prepareStatement(query
123
                        .toString());
124
                stmt.execute();
125
                ResultSet rs = stmt.getResultSet();
126

    
127
                // Loop through all of the documents, and write them to a
128
                // sitemap
129
                File sitemapFile = null;
130
                FileWriter sitemap = null;
131
                int counter = 0;
132
                int fileNumber = 0;
133
                while (rs.next()) {
134
                    // Check if a new sitemap file needs to be created
135
                    if (counter % MAX_URLS_IN_FILE == 0) {
136

    
137
                        // if a sitemap file is already open
138
                        if (sitemapFile != null && sitemapFile.canWrite()) {
139
                            // write the footer and close the file
140
                            writeSitemapFooter(sitemap);
141
                        }
142

    
143
                        // Open a new sitemap file for writing
144
                        fileNumber++;
145
                        sitemapFile = new File(directory, fileRoot + fileNumber
146
                                + ".xml");
147
                        sitemap = new FileWriter(sitemapFile);
148

    
149
                        // Write the sitemap document header for the new file
150
                        writeSitemapHeader(sitemap);
151
                    }
152

    
153
                    String separator = PropertyService.getProperty("document.accNumSeparator");
154
                    String docid = rs.getString(1) + separator
155
                            + rs.getString(2);
156
                    writeSitemapEntry(sitemap, docid);
157
                    counter++;
158
                }
159
                stmt.close();
160
                writeSitemapFooter(sitemap);
161
            } catch (SQLException e) {
162
                logMetacat.warn("Error while writing to the sitemap file: "
163
                        + e.getMessage());
164
            } catch (IOException ioe) {
165
                logMetacat.warn("Could not open or write to the sitemap file."
166
                        + ioe.getMessage());
167
            } catch (PropertyNotFoundException pnfe) {
168
                logMetacat.warn("Could not retrieve the account number separator."
169
                        + pnfe.getMessage());
170
            } finally {
171
                // Return database connection to the pool
172
                DBConnectionPool.returnDBConnection(dbConn, serialNumber);
173
            }
174
        } else {
175
            logMetacat.warn("Sitemap not created because directory not valid.");
176
        }
177
    }
178

    
179
    /**
180
     * Write the header information in a single sitemap file. This includes the
181
     * XML prolog, the root element and namespace declaration, and the elements
182
     * leading up to the first URL entry.
183
     * 
184
     * @param sitemap
185
     *            the Writer to use for writing the header
186
     * @throws IOException
187
     *             if there is a problem writing to the Writer
188
     */
189
    private void writeSitemapHeader(Writer sitemap) throws IOException {
190
        sitemap.write(PROLOG);
191
        String header = "<urlset xmlns=\"http://www.google.com/schemas/sitemap/0.84\">\n";
192
        sitemap.write(header);
193
        sitemap.flush();
194
    }
195

    
196
    /**
197
     * Write a URL entry to a single sitemap file. This includes the XML markup
198
     * surrounding a particular site URL.
199
     * 
200
     * @param sitemap
201
     *            the Writer to use for writing the URL
202
     * @param docid
203
     *            the identifier to be written in the URL
204
     * @param urlRoot
205
     *            the base URL to be used in constructing a URL
206
     * @param skin
207
     *            the name of the skin to be used in constructing a URL
208
     * @throws IOException
209
     *             if there is a problem writing to the Writer
210
     */
211
    private void writeSitemapEntry(Writer sitemap, String docid)
212
            throws IOException {
213
        if (sitemap != null && docid != null && urlRoot != null) {
214
            StringBuffer url = new StringBuffer();
215
            url.append(urlRoot);
216
            if (!urlRoot.endsWith("/")) {
217
                url.append("/");
218
            }
219
            url.append(docid);
220
            if (skin != null) {
221
                url.append("/");
222
                url.append(skin);
223
            }
224
            sitemap.write("<url><loc>");
225
            sitemap.write(url.toString());
226
            sitemap.write("</loc>");
227
            // <lastmod>2005-01-01</lastmod>
228
            // <changefreq>monthly</changefreq>
229
            // <priority>0.8</priority>
230
            sitemap.write("</url>");
231
            sitemap.write("\n");
232
            sitemap.flush();
233
        }
234
    }
235

    
236
    /**
237
     * Write the footer information in a single sitemap file and close the file.
238
     * This includes the closing tag for the root element.
239
     * 
240
     * @param sitemap
241
     *            the Writer to use for writing the footer
242
     * @throws IOException
243
     *             if there is a problem writing to the Writer
244
     */
245
    private void writeSitemapFooter(Writer sitemap) throws IOException {
246
        if (sitemap != null)
247
        {
248
	    	String footer = "</urlset>\n";
249
	        sitemap.write(footer);
250
	        sitemap.close();
251
        }
252
    }
253

    
254
    // Member variables
255

    
256
    /** The directory in which sitemaps are written. */
257
    private File directory;
258

    
259
    /** The root url for constructing sitemap URLs. */
260
    private String urlRoot;
261

    
262
    /** The name of the format skin to be used in sitemap URLs. */
263
    private String skin;
264

    
265
    /** Maximum number of URLs to write to a single sitemap file */
266
    static final int MAX_URLS_IN_FILE = 25000; // 50,000 according to Google
267

    
268
    /** The root name to be used in naming sitemap files. */
269
    static final String fileRoot = "metacat";
270

    
271
    /** A String constant containing the XML prolog to be written in files. */
272
    static final String PROLOG = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n";
273
}
(66-66/69)