Revision 3244
Added by Matt Jones over 17 years ago
test/edu/ucsb/nceas/metacattest/SitemapTest.java | ||
---|---|---|
1 |
/** |
|
2 |
* '$RCSfile$' |
|
3 |
* Copyright: 2007 Regents of the University of California and the |
|
4 |
* National Center for Ecological Analysis and Synthesis |
|
5 |
* |
|
6 |
* '$Author$' |
|
7 |
* '$Date$' |
|
8 |
* '$Revision$' |
|
9 |
* |
|
10 |
* This program is free software; you can redistribute it and/or modify |
|
11 |
* it under the terms of the GNU General Public License as published by |
|
12 |
* the Free Software Foundation; either version 2 of the License, or |
|
13 |
* (at your option) any later version. |
|
14 |
* |
|
15 |
* This program is distributed in the hope that it will be useful, |
|
16 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
17 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
18 |
* GNU General Public License for more details. |
|
19 |
* |
|
20 |
* You should have received a copy of the GNU General Public License |
|
21 |
* along with this program; if not, write to the Free Software |
|
22 |
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
23 |
*/ |
|
24 |
|
|
25 |
package edu.ucsb.nceas.metacattest; |
|
26 |
|
|
27 |
import java.io.BufferedReader; |
|
28 |
import java.io.File; |
|
29 |
import java.io.FileNotFoundException; |
|
30 |
import java.io.FileReader; |
|
31 |
import java.io.IOException; |
|
32 |
|
|
33 |
import edu.ucsb.nceas.metacat.DBConnectionPool; |
|
34 |
import edu.ucsb.nceas.metacat.MetaCatUtil; |
|
35 |
import edu.ucsb.nceas.metacat.Sitemap; |
|
36 |
import edu.ucsb.nceas.utilities.Options; |
|
37 |
|
|
38 |
import junit.framework.TestCase; |
|
39 |
|
|
40 |
/** |
|
41 |
* Test the Sitemap class by generating the sitemaps in a separate directory. |
|
42 |
* |
|
43 |
* @author Matt Jones |
|
44 |
*/ |
|
45 |
public class SitemapTest extends TestCase { |
|
46 |
|
|
47 |
private String propertyFileName = |
|
48 |
"@install-dir@/WEB-INF/metacat.properties"; |
|
49 |
// "/usr/share/tomcat5.5/webapps/knb/WEB-INF/metacat.properties"; |
|
50 |
|
|
51 |
private String directoryName = "/tmp/sitemaps"; |
|
52 |
|
|
53 |
/** |
|
54 |
* Initialize the Metacat environment so the test can run. |
|
55 |
*/ |
|
56 |
protected void setUp() throws Exception { |
|
57 |
super.setUp(); |
|
58 |
try { |
|
59 |
File propertyFile = new File(propertyFileName); |
|
60 |
Options options = Options.initialize(propertyFile); |
|
61 |
MetaCatUtil util = new MetaCatUtil(); |
|
62 |
DBConnectionPool pool = DBConnectionPool.getInstance(); |
|
63 |
} catch (FileNotFoundException e) { |
|
64 |
fail(e.getMessage()); |
|
65 |
} catch (IOException e) { |
|
66 |
fail(e.getMessage()); |
|
67 |
} |
|
68 |
} |
|
69 |
|
|
70 |
/** |
|
71 |
* Test the static generateSitemaps() method. |
|
72 |
*/ |
|
73 |
public void testGenerateSitemaps() { |
|
74 |
File directory = new File(directoryName); |
|
75 |
directory.mkdirs(); |
|
76 |
String urlRoot = "http://foo.example.com/ctx/metacat"; |
|
77 |
String skin = "testskin"; |
|
78 |
Sitemap smap = new Sitemap(directory, urlRoot, skin); |
|
79 |
smap.generateSitemaps(); |
|
80 |
File sitemap1 = new File(directory, "metacat1.xml"); |
|
81 |
assertTrue(sitemap1.exists() && sitemap1.isFile()); |
|
82 |
try { |
|
83 |
FileReader r = new FileReader(sitemap1); |
|
84 |
BufferedReader br = new BufferedReader(r); |
|
85 |
char[] buf = new char[1024]; |
|
86 |
br.read(buf, 0, 1024); |
|
87 |
br.close(); |
|
88 |
String doc = new String(buf); |
|
89 |
assertTrue(doc.indexOf("<?xml") >= 0); |
|
90 |
assertTrue(doc.indexOf("<urlset") >= 0); |
|
91 |
assertTrue(doc.indexOf("<url>") >= 0); |
|
92 |
assertTrue(doc.indexOf("http:") >= 0); |
|
93 |
} catch (FileNotFoundException e) { |
|
94 |
fail("Failed to read the sitemap file." + e.getMessage()); |
|
95 |
} catch (IOException ioe) { |
|
96 |
fail("Failed while reading sitemap file." + ioe.getMessage()); |
|
97 |
} |
|
98 |
} |
|
99 |
} |
|
0 | 100 |
lib/metacat.properties | ||
---|---|---|
264 | 264 |
metacatUrl=@systemidserver@@servlet-path@ |
265 | 265 |
baseUrl=@systemidserver@@html-path@ |
266 | 266 |
# END spatial options |
267 |
|
|
268 |
# START sitemap properties |
|
269 |
|
|
270 |
# relative directory path in which sitemap files should be written |
|
271 |
sitemapDirectory=@install-dir@/sitemaps |
|
272 |
|
|
273 |
# Interval (in milliseconds) between rebuilding the sitemap |
|
274 |
sitemapInterval=86400000 |
|
275 |
|
|
276 |
# END sitemap properties |
src/edu/ucsb/nceas/metacat/MetaCatServlet.java | ||
---|---|---|
126 | 126 |
{ |
127 | 127 |
private static Hashtable sessionHash = new Hashtable(); |
128 | 128 |
private Timer timer = null; |
129 |
private static boolean sitemapScheduled; |
|
129 | 130 |
|
130 | 131 |
// Constants -- these should be final in a servlet |
131 | 132 |
private static final String PROLOG = "<?xml version=\"1.0\"?>"; |
... | ... | |
244 | 245 |
} else { |
245 | 246 |
logMetacat.info(" \n **** Spatial cache is not set to regenerate on restart"); |
246 | 247 |
} |
247 |
|
|
248 |
|
|
248 |
|
|
249 |
sitemapScheduled = false; |
|
250 |
|
|
249 | 251 |
logMetacat.info("Metacat (" + Version.getVersion() |
250 | 252 |
+ ") initialized."); |
251 | 253 |
|
... | ... | |
829 | 831 |
//util.closeConnections(); |
830 | 832 |
// Close the stream to the client |
831 | 833 |
//out.close(); |
834 |
|
|
835 |
// Schedule the sitemap generator to run periodically |
|
836 |
scheduleSitemapGeneration(request); |
|
832 | 837 |
} |
833 | 838 |
} |
834 | 839 |
|
... | ... | |
3456 | 3461 |
} |
3457 | 3462 |
} |
3458 | 3463 |
} |
3464 |
|
|
3465 |
/** |
|
3466 |
* Schedule the sitemap generator to run periodically and update all |
|
3467 |
* of the sitemap files for search indexing engines. |
|
3468 |
* |
|
3469 |
* @param request a servlet request, from which we determine the context |
|
3470 |
*/ |
|
3471 |
private void scheduleSitemapGeneration(HttpServletRequest request) { |
|
3472 |
if (!sitemapScheduled) { |
|
3473 |
String directoryName = MetaCatUtil.getOption("sitemapDirectory"); |
|
3474 |
File directory = new File(directoryName); |
|
3475 |
directory.mkdirs(); |
|
3476 |
String urlRoot = request.getRequestURL().toString(); |
|
3477 |
String skin = MetaCatUtil.getOption("default-style"); |
|
3478 |
Sitemap smap = new Sitemap(directory, urlRoot, skin); |
|
3479 |
long sitemapInterval = Integer.parseInt( |
|
3480 |
MetaCatUtil.getOption("sitemapInterval")); |
|
3481 |
long firstDelay = 60*1000; // 60 seconds delay |
|
3482 |
timer.schedule(smap, firstDelay, sitemapInterval); |
|
3483 |
sitemapScheduled = true; |
|
3484 |
} |
|
3485 |
} |
|
3459 | 3486 |
} |
src/edu/ucsb/nceas/metacat/Sitemap.java | ||
---|---|---|
1 |
/** |
|
2 |
* '$RCSfile$' |
|
3 |
* Copyright: 2007 Regents of the University of California and the |
|
4 |
* National Center for Ecological Analysis and Synthesis |
|
5 |
* |
|
6 |
* '$Author$' |
|
7 |
* '$Date$' |
|
8 |
* '$Revision$' |
|
9 |
* |
|
10 |
* This program is free software; you can redistribute it and/or modify |
|
11 |
* it under the terms of the GNU General Public License as published by |
|
12 |
* the Free Software Foundation; either version 2 of the License, or |
|
13 |
* (at your option) any later version. |
|
14 |
* |
|
15 |
* This program is distributed in the hope that it will be useful, |
|
16 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
17 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
18 |
* GNU General Public License for more details. |
|
19 |
* |
|
20 |
* You should have received a copy of the GNU General Public License |
|
21 |
* along with this program; if not, write to the Free Software |
|
22 |
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
23 |
*/ |
|
24 |
|
|
25 |
package edu.ucsb.nceas.metacat; |
|
26 |
|
|
27 |
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.TimerTask;

import org.apache.log4j.Logger;
|
37 |
|
|
38 |
/** |
|
39 |
* A Sitemap represents a document that lists all of the content of the Metacat |
|
40 |
* server for use by harvesting spiders that wish to index the contents of the |
|
41 |
* Metacat site. It is used to generate an XML representation of all of the URLs |
|
42 |
* of the site in order to facilitate indexing of the metacat site by search |
|
43 |
* engines. |
|
44 |
* |
|
45 |
* @author Matt Jones |
|
46 |
*/ |
|
47 |
public class Sitemap extends TimerTask { |
|
48 |
|
|
49 |
/** |
|
50 |
* Construct a new instance of the Sitemap. |
|
51 |
* |
|
52 |
* @param directory |
|
53 |
* the location to store sitemap files |
|
54 |
* @param urlRoot |
|
55 |
* the base URL for constructing sitemap URLs |
|
56 |
* @param skin |
|
57 |
* the format skin to be used in URLs |
|
58 |
*/ |
|
59 |
public Sitemap(File directory, String urlRoot, String skin) { |
|
60 |
super(); |
|
61 |
this.directory = directory; |
|
62 |
this.urlRoot = urlRoot; |
|
63 |
this.skin = skin; |
|
64 |
} |
|
65 |
|
|
66 |
/** |
|
67 |
* Execute the timed task when called, in this case by generating the |
|
68 |
* sitemap files needed for this Metacat instance. |
|
69 |
*/ |
|
70 |
public void run() { |
|
71 |
generateSitemaps(); |
|
72 |
} |
|
73 |
|
|
74 |
/** |
|
75 |
* Generate all of the sitemap files needed to list the URLs from this |
|
76 |
* instance of Metacat, using the open sitemap format described here: |
|
77 |
* |
|
78 |
* URLs are written to a single file, unless the maximum number of URLs |
|
79 |
* allowed inthe sitemap file is exceeded, in which subsequent numbered |
|
80 |
* files are created. |
|
81 |
* |
|
82 |
* @param directory |
|
83 |
* an existing File directory in which to write the sitemaps |
|
84 |
* @param urlRoot |
|
85 |
* the base URL to use in constructing document URLs |
|
86 |
* @param skin |
|
87 |
* the name of the skin to be used in formatting metacat |
|
88 |
* documents |
|
89 |
*/ |
|
90 |
public void generateSitemaps() { |
|
91 |
Logger logMetacat = Logger.getLogger(Sitemap.class); |
|
92 |
|
|
93 |
logMetacat.info("Running the Sitemap task."); |
|
94 |
|
|
95 |
// Test if the passed in File is a directory |
|
96 |
if (directory.isDirectory()) { |
|
97 |
// Query xml_documents to get list of documents |
|
98 |
StringBuffer query = new StringBuffer(); |
|
99 |
// TODO: make the doctype configurable in the query |
|
100 |
query.append("select docid, rev from xml_documents " |
|
101 |
+ "where doctype LIKE 'eml:%' " + "order by docid, rev"); |
|
102 |
|
|
103 |
DBConnection dbConn = null; |
|
104 |
int serialNumber = -1; |
|
105 |
try { |
|
106 |
// Get a database connection from the pool |
|
107 |
dbConn = DBConnectionPool |
|
108 |
.getDBConnection("Sitemap.generateSitemap()"); |
|
109 |
serialNumber = dbConn.getCheckOutSerialNumber(); |
|
110 |
|
|
111 |
// Execute the query statement |
|
112 |
PreparedStatement stmt = dbConn.prepareStatement(query |
|
113 |
.toString()); |
|
114 |
stmt.execute(); |
|
115 |
ResultSet rs = stmt.getResultSet(); |
|
116 |
|
|
117 |
// Loop through all of the documents, and write them to a |
|
118 |
// sitemap |
|
119 |
File sitemapFile = null; |
|
120 |
FileWriter sitemap = null; |
|
121 |
int counter = 0; |
|
122 |
int fileNumber = 0; |
|
123 |
while (rs.next()) { |
|
124 |
// Check if a new sitemap file needs to be created |
|
125 |
if (counter % MAX_URLS_IN_FILE == 0) { |
|
126 |
|
|
127 |
// if a sitemap file is already open |
|
128 |
if (sitemapFile != null && sitemapFile.canWrite()) { |
|
129 |
// write the footer and close the file |
|
130 |
writeSitemapFooter(sitemap); |
|
131 |
} |
|
132 |
|
|
133 |
// Open a new sitemap file for writing |
|
134 |
fileNumber++; |
|
135 |
sitemapFile = new File(directory, fileRoot + fileNumber |
|
136 |
+ ".xml"); |
|
137 |
sitemap = new FileWriter(sitemapFile); |
|
138 |
|
|
139 |
// Write the sitemap document header for the new file |
|
140 |
writeSitemapHeader(sitemap); |
|
141 |
} |
|
142 |
|
|
143 |
String separator = MetaCatUtil.getOption("accNumSeparator"); |
|
144 |
String docid = rs.getString(1) + separator |
|
145 |
+ rs.getString(2); |
|
146 |
writeSitemapEntry(sitemap, docid); |
|
147 |
counter++; |
|
148 |
} |
|
149 |
stmt.close(); |
|
150 |
writeSitemapFooter(sitemap); |
|
151 |
} catch (SQLException e) { |
|
152 |
logMetacat.warn("Error while writing to the sitemap file: " |
|
153 |
+ e.getMessage()); |
|
154 |
} catch (IOException ioe) { |
|
155 |
logMetacat.warn("Could not open or write to the sitemap file." |
|
156 |
+ ioe.getMessage()); |
|
157 |
} finally { |
|
158 |
// Return database connection to the pool |
|
159 |
DBConnectionPool.returnDBConnection(dbConn, serialNumber); |
|
160 |
} |
|
161 |
} else { |
|
162 |
logMetacat.warn("Sitemap not created because directory not valid."); |
|
163 |
} |
|
164 |
} |
|
165 |
|
|
166 |
/** |
|
167 |
* Write the header information in a single sitemap file. This includes the |
|
168 |
* XML prolog, the root element and namespace declaration, and the elements |
|
169 |
* leading up to the first URL entry. |
|
170 |
* |
|
171 |
* @param sitemap |
|
172 |
* the Writer to use for writing the header |
|
173 |
* @throws IOException |
|
174 |
* if there is a problem writing to the Writer |
|
175 |
*/ |
|
176 |
private void writeSitemapHeader(Writer sitemap) throws IOException { |
|
177 |
sitemap.write(PROLOG); |
|
178 |
String header = "<urlset xmlns=\"http://www.google.com/schemas/sitemap/0.84\">\n"; |
|
179 |
sitemap.write(header); |
|
180 |
sitemap.flush(); |
|
181 |
} |
|
182 |
|
|
183 |
/** |
|
184 |
* Write a URL entry to a single sitemap file. This includes the XML markup |
|
185 |
* surrounding a particular site URL. |
|
186 |
* |
|
187 |
* @param sitemap |
|
188 |
* the Writer to use for writing the URL |
|
189 |
* @param docid |
|
190 |
* the identifier to be written in the URL |
|
191 |
* @param urlRoot |
|
192 |
* the base URL to be used in constructing a URL |
|
193 |
* @param skin |
|
194 |
* the name of the skin to be used in constructing a URL |
|
195 |
* @throws IOException |
|
196 |
* if there is a problem writing to the Writer |
|
197 |
*/ |
|
198 |
private void writeSitemapEntry(Writer sitemap, String docid) |
|
199 |
throws IOException { |
|
200 |
if (sitemap != null && docid != null && urlRoot != null) { |
|
201 |
StringBuffer url = new StringBuffer(); |
|
202 |
url.append(urlRoot); |
|
203 |
if (!urlRoot.endsWith("/")) { |
|
204 |
url.append("/"); |
|
205 |
} |
|
206 |
url.append(docid); |
|
207 |
if (skin != null) { |
|
208 |
url.append("/"); |
|
209 |
url.append(skin); |
|
210 |
} |
|
211 |
sitemap.write("<url><loc>"); |
|
212 |
sitemap.write(url.toString()); |
|
213 |
sitemap.write("</loc>"); |
|
214 |
// <lastmod>2005-01-01</lastmod> |
|
215 |
// <changefreq>monthly</changefreq> |
|
216 |
// <priority>0.8</priority> |
|
217 |
sitemap.write("</url>"); |
|
218 |
sitemap.write("\n"); |
|
219 |
sitemap.flush(); |
|
220 |
} |
|
221 |
} |
|
222 |
|
|
223 |
/** |
|
224 |
* Write the footer information in a single sitemap file and close the file. |
|
225 |
* This includes the closing tag for the root element. |
|
226 |
* |
|
227 |
* @param sitemap |
|
228 |
* the Writer to use for writing the footer |
|
229 |
* @throws IOException |
|
230 |
* if there is a problem writing to the Writer |
|
231 |
*/ |
|
232 |
private void writeSitemapFooter(Writer sitemap) throws IOException { |
|
233 |
String footer = "</urlset>\n"; |
|
234 |
sitemap.write(footer); |
|
235 |
sitemap.close(); |
|
236 |
} |
|
237 |
|
|
238 |
// Member variables |
|
239 |
|
|
240 |
/** The directory in which sitemaps are written. */ |
|
241 |
private File directory; |
|
242 |
|
|
243 |
/** The root url for constructing sitemap URLs. */ |
|
244 |
private String urlRoot; |
|
245 |
|
|
246 |
/** The name of the format skin to be used in sitemap URLs. */ |
|
247 |
private String skin; |
|
248 |
|
|
249 |
/** Maximum number of URLs to write to a single sitemap file */ |
|
250 |
static final int MAX_URLS_IN_FILE = 25000; // 50,000 according to Google |
|
251 |
|
|
252 |
/** The root name to be used in naming sitemap files. */ |
|
253 |
static final String fileRoot = "metacat"; |
|
254 |
|
|
255 |
/** A String constant containing the XML prolog to be written in files. */ |
|
256 |
static final String PROLOG = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"; |
|
257 |
} |
|
0 | 258 |
Also available in: Unified diff
Added a new class called Sitemap that is used to generate a series of XML documents representing the URLs of metacat documents following the sitemap protocol. The Sitemap class extends TimerTask so that it can be scheduled to run once a day or so. New configuration options were added to metacat.properties to control where the sitemaps are written and how often they are updated. By default we do it once a day, as more often is overkill for search engines.
Included a JUnit unit test to test the Sitemap generation functionality.
Included changes to MetaCatServlet to schedule the Sitemap task the first time Metacat is called.