Project

General

Profile

« Previous | Next » 

Revision 3244

Added by Matt Jones about 17 years ago

Added a new class called Sitemap that is used to generate a series of XML documents representing the URLs of metacat documents following the sitemap protocol. The Sitemap class extends TimerTask so that it can be scheduled to run once a day or so. New configuration options were added to metacat.properties to control where the sitemaps are written and how often they are updated. By default we do it once a day, as more often is overkill for search engines.

Included a JUnit unit test to test the Sitemap generation functionality.

Included changes to MetaCatServlet to schedule the Sitemap task the first time Metacat is called.

View differences:

test/edu/ucsb/nceas/metacattest/SitemapTest.java
1
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2007 Regents of the University of California and the
4
 *              National Center for Ecological Analysis and Synthesis
5
 *
6
 *   '$Author$'
7
 *     '$Date$'
8
 * '$Revision$'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23
 */
24

  
25
package edu.ucsb.nceas.metacattest;
26

  
27
import java.io.BufferedReader;
28
import java.io.File;
29
import java.io.FileNotFoundException;
30
import java.io.FileReader;
31
import java.io.IOException;
32

  
33
import edu.ucsb.nceas.metacat.DBConnectionPool;
34
import edu.ucsb.nceas.metacat.MetaCatUtil;
35
import edu.ucsb.nceas.metacat.Sitemap;
36
import edu.ucsb.nceas.utilities.Options;
37

  
38
import junit.framework.TestCase;
39

  
40
/**
41
 * Test the Sitemap class by generating the sitemaps in a separate directory.
42
 * 
43
 * @author Matt Jones
44
 */
45
public class SitemapTest extends TestCase {
46

  
47
    private String propertyFileName = 
48
        "@install-dir@/WEB-INF/metacat.properties";
49
        // "/usr/share/tomcat5.5/webapps/knb/WEB-INF/metacat.properties";
50

  
51
    private String directoryName = "/tmp/sitemaps";
52

  
53
    /**
54
     * Initialize the Metacat environment so the test can run.
55
     */
56
    protected void setUp() throws Exception {
57
        super.setUp();
58
        try {
59
            File propertyFile = new File(propertyFileName);
60
            Options options = Options.initialize(propertyFile);
61
            MetaCatUtil util = new MetaCatUtil();
62
            DBConnectionPool pool = DBConnectionPool.getInstance();
63
        } catch (FileNotFoundException e) {
64
            fail(e.getMessage());
65
        } catch (IOException e) {
66
            fail(e.getMessage());
67
        }
68
    }
69

  
70
    /**
71
     * Test the static generateSitemaps() method.
72
     */
73
    public void testGenerateSitemaps() {
74
        File directory = new File(directoryName);
75
        directory.mkdirs();
76
        String urlRoot = "http://foo.example.com/ctx/metacat";
77
        String skin = "testskin";
78
        Sitemap smap = new Sitemap(directory, urlRoot, skin);
79
        smap.generateSitemaps();
80
        File sitemap1 = new File(directory, "metacat1.xml");
81
        assertTrue(sitemap1.exists() && sitemap1.isFile());
82
        try {
83
            FileReader r = new FileReader(sitemap1);
84
            BufferedReader br = new BufferedReader(r);
85
            char[] buf = new char[1024];
86
            br.read(buf, 0, 1024);
87
            br.close();
88
            String doc = new String(buf);
89
            assertTrue(doc.indexOf("<?xml") >= 0);
90
            assertTrue(doc.indexOf("<urlset") >= 0);
91
            assertTrue(doc.indexOf("<url>") >= 0);
92
            assertTrue(doc.indexOf("http:") >= 0);
93
        } catch (FileNotFoundException e) {
94
            fail("Failed to read the sitemap file." + e.getMessage());
95
        } catch (IOException ioe) {
96
            fail("Failed while reading sitemap file." + ioe.getMessage());
97
        }
98
    }
99
}
0 100

  
lib/metacat.properties
264 264
metacatUrl=@systemidserver@@servlet-path@
265 265
baseUrl=@systemidserver@@html-path@
266 266
# END spatial options
267

  
268
# START sitemap properties
269

  
270
# relative directory path in which sitemap files should be written
271
sitemapDirectory=@install-dir@/sitemaps
272

  
273
# Interval (in milliseconds) between rebuilding the sitemap
274
sitemapInterval=86400000
275

  
276
# END sitemap properties
src/edu/ucsb/nceas/metacat/MetaCatServlet.java
126 126
{
127 127
    private static Hashtable sessionHash = new Hashtable();
128 128
    private Timer timer = null;
129
    private static boolean sitemapScheduled;
129 130
    
130 131
    // Constants -- these should be final in a servlet
131 132
    private static final String PROLOG = "<?xml version=\"1.0\"?>";
......
244 245
	    } else {
245 246
                logMetacat.info(" \n **** Spatial cache is not set to regenerate on restart");
246 247
            }
247
           
248
           
248
            
249
            sitemapScheduled = false;
250
            
249 251
            logMetacat.info("Metacat (" + Version.getVersion()
250 252
                               + ") initialized.");
251 253

  
......
829 831
            //util.closeConnections();
830 832
            // Close the stream to the client
831 833
            //out.close();
834
            
835
            // Schedule the sitemap generator to run periodically
836
            scheduleSitemapGeneration(request);
832 837
        }
833 838
    }
834 839

  
......
3456 3461
            }
3457 3462
        }
3458 3463
    }
3464
    
3465
    /**
3466
     * Schedule the sitemap generator to run periodically and update all
3467
     * of the sitemap files for search indexing engines.
3468
     * 
3469
     * @param request a servlet request, from which we determine the context
3470
     */
3471
    private void scheduleSitemapGeneration(HttpServletRequest request) {
3472
    	if (!sitemapScheduled) {
3473
	    	String directoryName = MetaCatUtil.getOption("sitemapDirectory");
3474
			File directory = new File(directoryName);
3475
			directory.mkdirs();
3476
			String urlRoot = request.getRequestURL().toString();
3477
			String skin = MetaCatUtil.getOption("default-style");
3478
			Sitemap smap = new Sitemap(directory, urlRoot, skin);
3479
			long sitemapInterval = Integer.parseInt(
3480
					MetaCatUtil.getOption("sitemapInterval"));
3481
			long firstDelay = 60*1000;   // 60 seconds delay
3482
			timer.schedule(smap, firstDelay, sitemapInterval);
3483
			sitemapScheduled = true;
3484
    	}
3485
    }
3459 3486
}
src/edu/ucsb/nceas/metacat/Sitemap.java
1
/**
2
 *  '$RCSfile$'
3
 *  Copyright: 2007 Regents of the University of California and the
4
 *             National Center for Ecological Analysis and Synthesis
5
 *
6
 *   '$Author$'
7
 *     '$Date$'
8
 * '$Revision$'
9
 *
10
 * This program is free software; you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation; either version 2 of the License, or
13
 * (at your option) any later version.
14
 *
15
 * This program is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23
 */
24

  
25
package edu.ucsb.nceas.metacat;
26

  
27
import java.io.File;
28
import java.io.FileWriter;
29
import java.io.IOException;
30
import java.io.Writer;
31
import java.sql.PreparedStatement;
32
import java.sql.ResultSet;
33
import java.sql.SQLException;
34
import java.util.TimerTask;
35

  
36
import org.apache.log4j.Logger;
37

  
38
/**
39
 * A Sitemap represents a document that lists all of the content of the Metacat
40
 * server for use by harvesting spiders that wish to index the contents of the
41
 * Metacat site. It is used to generate an XML representation of all of the URLs
42
 * of the site in order to facilitate indexing of the metacat site by search
43
 * engines.
44
 * 
45
 * @author Matt Jones
46
 */
47
public class Sitemap extends TimerTask {
48

  
49
    /**
50
     * Construct a new instance of the Sitemap.
51
     * 
52
     * @param directory
53
     *            the location to store sitemap files
54
     * @param urlRoot
55
     *            the base URL for constructing sitemap URLs
56
     * @param skin
57
     *            the format skin to be used in URLs
58
     */
59
    public Sitemap(File directory, String urlRoot, String skin) {
60
        super();
61
        this.directory = directory;
62
        this.urlRoot = urlRoot;
63
        this.skin = skin;
64
    }
65

  
66
    /**
67
     * Execute the timed task when called, in this case by generating the
68
     * sitemap files needed for this Metacat instance.
69
     */
70
    public void run() {
71
        generateSitemaps();
72
    }
73

  
74
    /**
75
     * Generate all of the sitemap files needed to list the URLs from this
76
     * instance of Metacat, using the open sitemap format described here:
77
     * 
78
     * URLs are written to a single file, unless the maximum number of URLs
79
     * allowed inthe sitemap file is exceeded, in which subsequent numbered
80
     * files are created.
81
     * 
82
     * @param directory
83
     *            an existing File directory in which to write the sitemaps
84
     * @param urlRoot
85
     *            the base URL to use in constructing document URLs
86
     * @param skin
87
     *            the name of the skin to be used in formatting metacat
88
     *            documents
89
     */
90
    public void generateSitemaps() {
91
        Logger logMetacat = Logger.getLogger(Sitemap.class);
92

  
93
        logMetacat.info("Running the Sitemap task.");
94

  
95
        // Test if the passed in File is a directory
96
        if (directory.isDirectory()) {
97
            // Query xml_documents to get list of documents
98
            StringBuffer query = new StringBuffer();
99
            // TODO: make the doctype configurable in the query
100
            query.append("select docid, rev from xml_documents "
101
                    + "where doctype LIKE 'eml:%' " + "order by docid, rev");
102

  
103
            DBConnection dbConn = null;
104
            int serialNumber = -1;
105
            try {
106
                // Get a database connection from the pool
107
                dbConn = DBConnectionPool
108
                        .getDBConnection("Sitemap.generateSitemap()");
109
                serialNumber = dbConn.getCheckOutSerialNumber();
110

  
111
                // Execute the query statement
112
                PreparedStatement stmt = dbConn.prepareStatement(query
113
                        .toString());
114
                stmt.execute();
115
                ResultSet rs = stmt.getResultSet();
116

  
117
                // Loop through all of the documents, and write them to a
118
                // sitemap
119
                File sitemapFile = null;
120
                FileWriter sitemap = null;
121
                int counter = 0;
122
                int fileNumber = 0;
123
                while (rs.next()) {
124
                    // Check if a new sitemap file needs to be created
125
                    if (counter % MAX_URLS_IN_FILE == 0) {
126

  
127
                        // if a sitemap file is already open
128
                        if (sitemapFile != null && sitemapFile.canWrite()) {
129
                            // write the footer and close the file
130
                            writeSitemapFooter(sitemap);
131
                        }
132

  
133
                        // Open a new sitemap file for writing
134
                        fileNumber++;
135
                        sitemapFile = new File(directory, fileRoot + fileNumber
136
                                + ".xml");
137
                        sitemap = new FileWriter(sitemapFile);
138

  
139
                        // Write the sitemap document header for the new file
140
                        writeSitemapHeader(sitemap);
141
                    }
142

  
143
                    String separator = MetaCatUtil.getOption("accNumSeparator");
144
                    String docid = rs.getString(1) + separator
145
                            + rs.getString(2);
146
                    writeSitemapEntry(sitemap, docid);
147
                    counter++;
148
                }
149
                stmt.close();
150
                writeSitemapFooter(sitemap);
151
            } catch (SQLException e) {
152
                logMetacat.warn("Error while writing to the sitemap file: "
153
                        + e.getMessage());
154
            } catch (IOException ioe) {
155
                logMetacat.warn("Could not open or write to the sitemap file."
156
                        + ioe.getMessage());
157
            } finally {
158
                // Return database connection to the pool
159
                DBConnectionPool.returnDBConnection(dbConn, serialNumber);
160
            }
161
        } else {
162
            logMetacat.warn("Sitemap not created because directory not valid.");
163
        }
164
    }
165

  
166
    /**
167
     * Write the header information in a single sitemap file. This includes the
168
     * XML prolog, the root element and namespace declaration, and the elements
169
     * leading up to the first URL entry.
170
     * 
171
     * @param sitemap
172
     *            the Writer to use for writing the header
173
     * @throws IOException
174
     *             if there is a problem writing to the Writer
175
     */
176
    private void writeSitemapHeader(Writer sitemap) throws IOException {
177
        sitemap.write(PROLOG);
178
        String header = "<urlset xmlns=\"http://www.google.com/schemas/sitemap/0.84\">\n";
179
        sitemap.write(header);
180
        sitemap.flush();
181
    }
182

  
183
    /**
184
     * Write a URL entry to a single sitemap file. This includes the XML markup
185
     * surrounding a particular site URL.
186
     * 
187
     * @param sitemap
188
     *            the Writer to use for writing the URL
189
     * @param docid
190
     *            the identifier to be written in the URL
191
     * @param urlRoot
192
     *            the base URL to be used in constructing a URL
193
     * @param skin
194
     *            the name of the skin to be used in constructing a URL
195
     * @throws IOException
196
     *             if there is a problem writing to the Writer
197
     */
198
    private void writeSitemapEntry(Writer sitemap, String docid)
199
            throws IOException {
200
        if (sitemap != null && docid != null && urlRoot != null) {
201
            StringBuffer url = new StringBuffer();
202
            url.append(urlRoot);
203
            if (!urlRoot.endsWith("/")) {
204
                url.append("/");
205
            }
206
            url.append(docid);
207
            if (skin != null) {
208
                url.append("/");
209
                url.append(skin);
210
            }
211
            sitemap.write("<url><loc>");
212
            sitemap.write(url.toString());
213
            sitemap.write("</loc>");
214
            // <lastmod>2005-01-01</lastmod>
215
            // <changefreq>monthly</changefreq>
216
            // <priority>0.8</priority>
217
            sitemap.write("</url>");
218
            sitemap.write("\n");
219
            sitemap.flush();
220
        }
221
    }
222

  
223
    /**
224
     * Write the footer information in a single sitemap file and close the file.
225
     * This includes the closing tag for the root element.
226
     * 
227
     * @param sitemap
228
     *            the Writer to use for writing the footer
229
     * @throws IOException
230
     *             if there is a problem writing to the Writer
231
     */
232
    private void writeSitemapFooter(Writer sitemap) throws IOException {
233
        String footer = "</urlset>\n";
234
        sitemap.write(footer);
235
        sitemap.close();
236
    }
237

  
238
    // Member variables
239

  
240
    /** The directory in which sitemaps are written. */
241
    private File directory;
242

  
243
    /** The root url for constructing sitemap URLs. */
244
    private String urlRoot;
245

  
246
    /** The name of the format skin to be used in sitemap URLs. */
247
    private String skin;
248

  
249
    /** Maximum number of URLs to write to a single sitemap file */
250
    static final int MAX_URLS_IN_FILE = 25000; // 50,000 according to Google
251

  
252
    /** The root name to be used in naming sitemap files. */
253
    static final String fileRoot = "metacat";
254

  
255
    /** A String constant containing the XML prolog to be written in files. */
256
    static final String PROLOG = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n";
257
}
0 258

  

Also available in: Unified diff