Project

General

Profile

1
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2007 Regents of the University of California and the
4
 *             National Center for Ecological Analysis and Synthesis
5
 *
6
 *   '$Author: leinfelder $'
7
 *     '$Date: 2011-12-07 12:18:24 -0800 (Wed, 07 Dec 2011) $'
8
 * '$Revision: 6744 $'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23
 */
24

    
25
package edu.ucsb.nceas.metacat;
26

    
27
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.TimerTask;

import org.apache.log4j.Logger;

import edu.ucsb.nceas.metacat.database.DBConnection;
import edu.ucsb.nceas.metacat.database.DBConnectionPool;
import edu.ucsb.nceas.metacat.properties.PropertyService;
import edu.ucsb.nceas.utilities.PropertyNotFoundException;
42

    
43
/**
44
 * A Sitemap represents a document that lists all of the content of the Metacat
45
 * server for use by harvesting spiders that wish to index the contents of the
46
 * Metacat site. It is used to generate an XML representation of all of the URLs
47
 * of the site in order to facilitate indexing of the metacat site by search
48
 * engines.
49
 * 
50
 * @author Matt Jones
51
 */
52
public class Sitemap extends TimerTask {
53
	
54
	private static Logger logMetacat = Logger.getLogger(Sitemap.class);
55

    
56
    /**
57
     * Construct a new instance of the Sitemap.
58
     * 
59
     * @param directory
60
     *            the location to store sitemap files
61
     * @param urlRoot
62
     *            the base URL for constructing sitemap URLs
63
     * @param skin
64
     *            the format skin to be used in URLs
65
     */
66
    public Sitemap(File directory, String urlRoot, String skin) {
67
        super();
68
        this.directory = directory;
69
        this.urlRoot = urlRoot;
70
        this.skin = skin;
71
    }
72

    
73
    /**
74
     * Execute the timed task when called, in this case by generating the
75
     * sitemap files needed for this Metacat instance.
76
     */
77
    public void run() {
78
        generateSitemaps();
79
    }
80

    
81
    /**
82
     * Generate all of the sitemap files needed to list the URLs from this
83
     * instance of Metacat, using the open sitemap format described here:
84
     * 
85
     * URLs are written to a single file, unless the maximum number of URLs
86
     * allowed inthe sitemap file is exceeded, in which subsequent numbered
87
     * files are created.
88
     * 
89
     * @param directory
90
     *            an existing File directory in which to write the sitemaps
91
     * @param urlRoot
92
     *            the base URL to use in constructing document URLs
93
     * @param skin
94
     *            the name of the skin to be used in formatting metacat
95
     *            documents
96
     */
97
    public void generateSitemaps() {
98

    
99
        logMetacat.info("Running the Sitemap task.");
100

    
101
        // Test if the passed in File is a directory
102
        if (directory.isDirectory()) {
103
            // Query xml_documents to get list of documents
104
            StringBuffer query = new StringBuffer();
105
            // TODO: make the doctype configurable in the query
106
            String sql =
107
            	"SELECT xml_documents.docid, xml_documents.rev " +
108
            	"FROM xml_documents, xml_access, identifier " +
109
                "WHERE xml_documents.doctype LIKE 'eml:%' " + 
110
                "AND xml_documents.docid = identifier.docid " +
111
                "AND xml_documents.rev = identifier.rev " +
112
                "AND identifier.guid = xml_access.guid " +
113
                "AND xml_access.principal_name = 'public' " +
114
                "AND xml_access.perm_type = 'allow' " +
115
                "order by docid, rev";
116
            query.append(sql);
117

    
118
            DBConnection dbConn = null;
119
            int serialNumber = -1;
120
            try {
121
                // Get a database connection from the pool
122
                dbConn = DBConnectionPool
123
                        .getDBConnection("Sitemap.generateSitemap()");
124
                serialNumber = dbConn.getCheckOutSerialNumber();
125

    
126
                // Execute the query statement
127
                PreparedStatement stmt = dbConn.prepareStatement(query.toString());
128
                stmt.execute();
129
                ResultSet rs = stmt.getResultSet();
130

    
131
                // Loop through all of the documents, and write them to a
132
                // sitemap
133
                File sitemapFile = null;
134
                FileWriter sitemap = null;
135
                int counter = 0;
136
                int fileNumber = 0;
137
                while (rs.next()) {
138
                    // Check if a new sitemap file needs to be created
139
                    if (counter % MAX_URLS_IN_FILE == 0) {
140

    
141
                        // if a sitemap file is already open
142
                        if (sitemapFile != null && sitemapFile.canWrite()) {
143
                            // write the footer and close the file
144
                            writeSitemapFooter(sitemap);
145
                        }
146

    
147
                        // Open a new sitemap file for writing
148
                        fileNumber++;
149
                        sitemapFile = new File(directory, fileRoot + fileNumber
150
                                + ".xml");
151
                        sitemap = new FileWriter(sitemapFile);
152

    
153
                        // Write the sitemap document header for the new file
154
                        writeSitemapHeader(sitemap);
155
                    }
156

    
157
                    String separator = PropertyService.getProperty("document.accNumSeparator");
158
                    String docid = rs.getString(1) + separator
159
                            + rs.getString(2);
160
                    writeSitemapEntry(sitemap, docid);
161
                    counter++;
162
                }
163
                stmt.close();
164
                writeSitemapFooter(sitemap);
165
            } catch (SQLException e) {
166
                logMetacat.warn("Error while writing to the sitemap file: "
167
                        + e.getMessage());
168
            } catch (IOException ioe) {
169
                logMetacat.warn("Could not open or write to the sitemap file."
170
                        + ioe.getMessage());
171
            } catch (PropertyNotFoundException pnfe) {
172
                logMetacat.warn("Could not retrieve the account number separator."
173
                        + pnfe.getMessage());
174
            } finally {
175
                // Return database connection to the pool
176
                DBConnectionPool.returnDBConnection(dbConn, serialNumber);
177
            }
178
        } else {
179
            logMetacat.warn("Sitemap not created because directory not valid.");
180
        }
181
    }
182

    
183
    /**
184
     * Write the header information in a single sitemap file. This includes the
185
     * XML prolog, the root element and namespace declaration, and the elements
186
     * leading up to the first URL entry.
187
     * 
188
     * @param sitemap
189
     *            the Writer to use for writing the header
190
     * @throws IOException
191
     *             if there is a problem writing to the Writer
192
     */
193
    private void writeSitemapHeader(Writer sitemap) throws IOException {
194
        sitemap.write(PROLOG);
195
        String header = "<urlset xmlns=\"http://www.google.com/schemas/sitemap/0.84\">\n";
196
        sitemap.write(header);
197
        sitemap.flush();
198
    }
199

    
200
    /**
201
     * Write a URL entry to a single sitemap file. This includes the XML markup
202
     * surrounding a particular site URL.
203
     * 
204
     * @param sitemap
205
     *            the Writer to use for writing the URL
206
     * @param docid
207
     *            the identifier to be written in the URL
208
     * @param urlRoot
209
     *            the base URL to be used in constructing a URL
210
     * @param skin
211
     *            the name of the skin to be used in constructing a URL
212
     * @throws IOException
213
     *             if there is a problem writing to the Writer
214
     */
215
    private void writeSitemapEntry(Writer sitemap, String docid)
216
            throws IOException {
217
        if (sitemap != null && docid != null && urlRoot != null) {
218
            StringBuffer url = new StringBuffer();
219
            url.append(urlRoot);
220
            if (!urlRoot.endsWith("/")) {
221
                url.append("/");
222
            }
223
            url.append(docid);
224
            if (skin != null) {
225
                url.append("/");
226
                url.append(skin);
227
            }
228
            sitemap.write("<url><loc>");
229
            sitemap.write(url.toString());
230
            sitemap.write("</loc>");
231
            // <lastmod>2005-01-01</lastmod>
232
            // <changefreq>monthly</changefreq>
233
            // <priority>0.8</priority>
234
            sitemap.write("</url>");
235
            sitemap.write("\n");
236
            sitemap.flush();
237
        }
238
    }
239

    
240
    /**
241
     * Write the footer information in a single sitemap file and close the file.
242
     * This includes the closing tag for the root element.
243
     * 
244
     * @param sitemap
245
     *            the Writer to use for writing the footer
246
     * @throws IOException
247
     *             if there is a problem writing to the Writer
248
     */
249
    private void writeSitemapFooter(Writer sitemap) throws IOException {
250
        if (sitemap != null)
251
        {
252
	    	String footer = "</urlset>\n";
253
	        sitemap.write(footer);
254
	        sitemap.close();
255
        }
256
    }
257

    
258
    // Member variables
259

    
260
    /** The directory in which sitemaps are written. */
261
    private File directory;
262

    
263
    /** The root url for constructing sitemap URLs. */
264
    private String urlRoot;
265

    
266
    /** The name of the format skin to be used in sitemap URLs. */
267
    private String skin;
268

    
269
    /** Maximum number of URLs to write to a single sitemap file */
270
    static final int MAX_URLS_IN_FILE = 25000; // 50,000 according to Google
271

    
272
    /** The root name to be used in naming sitemap files. */
273
    static final String fileRoot = "metacat";
274

    
275
    /** A String constant containing the XML prolog to be written in files. */
276
    static final String PROLOG = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n";
277
}
(60-60/64)