Project

General

Profile

Revision 429

Fixed document reading bug (bugzilla bug #111) so that reading documents
is no longer a power function of the number of nodes in the document
(which used to be the case). Now, reading a document occurs entirely
within DocumentImpl, by making a single SQL call to get the document data,
and then using the NodeComparator class to return a TreeSet of the nodes
sorted in a depth-first traversal order. This TreeSet is then processed
by the new DocumentImpl.toXml() methods, which formats and outputs a text
representation of the document to the Writer that is passed in. The
DocumentImpl.toString() method has been re-written to utilize
DocumentImpl.toXml() as well.

The old algorithm for searching (that utilized the ElementNode, textNode,
CommentNode, and PINode classes) is still implemented for comparison purposes,
and can be accessed by calling the readUsingSlowAlgorithm() method. A
timing option has been added to DocumentImpl.main() so that the methods can be
compared (see the -t and -old options). Although the difference in read
time is only a fraction of a second for small documents (< 1K), the new
method of reading is 72 times faster than the old method for a 34K
document (1.9 seconds versus 144 seconds). This difference continues to grow
as the node count increases.
BugID: 111

View differences:

src/edu/ucsb/nceas/metacat/DocumentImpl.java
14 14
package edu.ucsb.nceas.metacat;
15 15

  
16 16
import java.sql.*;
17
import java.io.File;
18
import java.io.FileReader;
19
import java.io.IOException;
17 20
import java.io.PrintWriter;
18
import java.util.TreeSet;
21
import java.io.Reader;
22
import java.io.StringWriter;
23
import java.io.Writer;
19 24

  
20
import java.io.*;
25
import java.util.Iterator;
21 26
import java.util.Stack;
27
import java.util.TreeSet;
22 28

  
23 29
import org.xml.sax.AttributeList;
24 30
import org.xml.sax.ContentHandler;
......
50 56
  private long rootnodeid;
51 57
  private ElementNode rootNode = null;
52 58
  private String doctitle = null;
59
  private TreeSet nodeRecordList = null;
53 60

  
54 61
  /**
55 62
   * Constructor, creates document from database connection, used 
......
68 75
      getDocumentInfo(docid);
69 76
      
70 77
      // Download all of the document nodes using a single SQL query
71
      TreeSet nodeRecordList = getNodeRecordList(rootnodeid);
78
      // The sort order of the records is determined by the NodeComparator
79
      // class, and needs to represent a depth-first traversal for the
80
      // toXml() method to work properly
81
      nodeRecordList = getNodeRecordList(rootnodeid);
72 82
  
73
      // Create the elements from the downloaded data in the TreeSet
74
      rootNode = new ElementNode(nodeRecordList, rootnodeid);
75

  
76 83
    } catch (McdbException ex) {
77 84
      throw ex;
78 85
    } catch (Throwable t) {
......
145 152
    return docid;
146 153
  }
147 154

  
155

  
148 156
  /**
149
   * Create an XML document from the database for the document with ID docid
157
   * Print a string representation of the XML document
150 158
   */
151 159
  public String toString()
152 160
  {
161
    StringWriter docwriter = new StringWriter();
162
    this.toXml(docwriter);
163
    String document = docwriter.toString();
164
    return document;
165
  }
166

  
167
  /**
168
   * Get a text representation of the XML document as a string
169
   * This older algorithm uses a recursive tree of Objects to represent the
170
   * nodes of the tree.  Each object is passed the data for the document 
171
   * and searches all of the document data to find its children nodes and
172
   * recursively build.  Thus, because each node reads the whole document,
173
   * this algorithm is extremely slow for larger documents, and the time
174
   * to completion is O(N^N) wrt the number of nodes.  See toXml() for a
175
   * better algorithm.
176
   */
177
  public String readUsingSlowAlgorithm()
178
  {
153 179
    StringBuffer doc = new StringBuffer();
154 180

  
181
    // Create the elements from the downloaded data in the TreeSet
182
    rootNode = new ElementNode(nodeRecordList, rootnodeid);
183

  
155 184
    // Append the resulting document to the StringBuffer and return it
156 185
    doc.append("<?xml version=\"1.0\"?>\n");
157 186
      
......
169 198
  }
170 199

  
171 200
  /**
201
   * Print a text representation of the XML document to a Writer
202
   *
203
   * @param pw the Writer to which we print the document
204
   */
205
  public void toXml(Writer pw)
206
  {
207
    PrintWriter out = null;
208
    if (pw instanceof PrintWriter) {
209
      out = (PrintWriter)pw;
210
    } else {
211
      out = new PrintWriter(pw);
212
    }
213

  
214
    MetaCatUtil util = new MetaCatUtil();
215
    
216
    Stack openElements = new Stack();
217
    boolean atRootElement = true;
218
    boolean previousNodeWasElement = false;
219

  
220
    // Step through all of the node records we were given
221
    Iterator it = nodeRecordList.iterator();
222
    while (it.hasNext()) {
223
      NodeRecord currentNode = (NodeRecord)it.next();
224
      //util.debugMessage("[Got Node ID: " + currentNode.nodeid +
225
                          //" (" + currentNode.parentnodeid +
226
                          //", " + currentNode.nodeindex + 
227
                          //", " + currentNode.nodetype + 
228
                          //", " + currentNode.nodename + 
229
                          //", " + currentNode.nodedata + ")]");
230

  
231
      // Print the end tag for the previous node if needed
232
      //
233
      // This is determined by inspecting the parent nodeid for the
234
      // currentNode.  If it is the same as the nodeid of the last element
235
      // that was pushed onto the stack, then we are still in that previous
236
      // parent element, and we do nothing.  However, if it differs, then we
237
      // have returned to a level above the previous parent, so we go into
238
      // a loop and pop off nodes and print out their end tags until we get
239
      // the node on the stack to match the currentNode parentnodeid
240
      //
241
      // So, this of course means that we rely on the list of elements
242
      // having been sorted in a depth first traversal of the nodes, which
243
      // is handled by the NodeComparator class used by the TreeSet
244
      if (!atRootElement) {
245
        NodeRecord currentElement = (NodeRecord)openElements.peek();
246
        if ( currentNode.parentnodeid != currentElement.nodeid ) {
247
          while ( currentNode.parentnodeid != currentElement.nodeid ) {
248
            currentElement = (NodeRecord)openElements.pop();
249
            util.debugMessage("\n POPPED: " + currentElement.nodename);
250
            out.print("</" + currentElement.nodename + ">" );
251
            currentElement = (NodeRecord)openElements.peek();
252
          }
253
        }
254
      }
255

  
256
      // Handle the DOCUMENT node
257
      if (currentNode.nodetype.equals("DOCUMENT")) {
258
        out.println("<?xml version=\"1.0\"?>");
259
      
260
        if (docname != null) {
261
          if ((doctype != null) && (system_id != null)) {
262
            out.println("<!DOCTYPE " + docname + " PUBLIC \"" + doctype + 
263
                       "\" \"" + system_id + "\">");
264
          } else {
265
            out.println("<!DOCTYPE " + docname + ">");
266
          }
267
        }
268

  
269
      // Handle the ELEMENT nodes
270
      } else if (currentNode.nodetype.equals("ELEMENT")) {
271
        if (atRootElement) {
272
          atRootElement = false;
273
        } else {
274
          if (previousNodeWasElement) {
275
            out.print(">");
276
          }
277
        }
278
        openElements.push(currentNode);
279
        util.debugMessage("\n PUSHED: " + currentNode.nodename);
280
        previousNodeWasElement = true;
281
        out.print("<" + currentNode.nodename);
282

  
283
      // Handle the ATTRIBUTE nodes
284
      } else if (currentNode.nodetype.equals("ATTRIBUTE")) {
285
        out.print(" " + currentNode.nodename + "=\""
286
                 + currentNode.nodedata + "\"");
287
      } else if (currentNode.nodetype.equals("TEXT")) {
288
        if (previousNodeWasElement) {
289
          out.print(">");
290
        }
291
        out.print(currentNode.nodedata);
292
        previousNodeWasElement = false;
293

  
294
      // Handle the COMMENT nodes
295
      } else if (currentNode.nodetype.equals("COMMENT")) {
296
        if (previousNodeWasElement) {
297
          out.print(">");
298
        }
299
        out.print("<!--" + currentNode.nodedata + "-->");
300
        previousNodeWasElement = false;
301

  
302
      // Handle the PI nodes
303
      } else if (currentNode.nodetype.equals("PI")) {
304
        if (previousNodeWasElement) {
305
          out.print(">");
306
        }
307
        out.print("<?" + currentNode.nodename + " " +
308
                        currentNode.nodedata + "?>");
309
        previousNodeWasElement = false;
310

  
311
      // Handle any other node type (do nothing)
312
      } else {
313
        // Any other types of nodes are not handled.
314
        // Probably should throw an exception here to indicate this
315
      }
316
      out.flush();
317
    }
318

  
319
    // Print the final end tag for the root element
320
    NodeRecord currentElement = (NodeRecord)openElements.pop();
321
    util.debugMessage("\n POPPED: " + currentElement.nodename);
322
    out.print("</" + currentElement.nodename + ">" );
323
    out.flush();
324
  }
325

  
326
  /**
172 327
   * Look up the document type information from the database
173 328
   *
174 329
   * @param docid the id of the document to look up
......
750 905
      String filename = null;
751 906
      String action   = null;
752 907
      String docid    = null;
908
      boolean showRuntime = false;
909
      boolean useOldReadAlgorithm = false;
753 910

  
754 911
      // Parse the command line arguments
755 912
      for ( int i=0 ; i < args.length; ++i ) {
......
759 916
          action =  args[++i];
760 917
        } else if ( args[i].equals( "-d" ) ) {
761 918
          docid =  args[++i];
919
        } else if ( args[i].equals( "-t" ) ) {
920
          showRuntime = true;
921
        } else if ( args[i].equals( "-old" ) ) {
922
          useOldReadAlgorithm = true;
762 923
        } else {
763 924
          System.err.println
764 925
            ( "   args[" +i+ "] '" +args[i]+ "' ignored." );
......
791 952
      if (!argsAreValid) {
792 953
        System.err.println("Wrong number of arguments!!!");
793 954
        System.err.println(
794
              "USAGE: java DocumentImpl <-a INSERT> [-d docid] <-f filename>");
955
          "USAGE: java DocumentImpl [-t] <-a INSERT> [-d docid] <-f filename>");
795 956
        System.err.println(
796
              "   OR: java DocumentImpl <-a UPDATE -d docid -f filename>");
957
          "   OR: java DocumentImpl [-t] <-a UPDATE -d docid -f filename>");
797 958
        System.err.println(
798
              "   OR: java DocumentImpl <-a DELETE -d docid>");
959
          "   OR: java DocumentImpl [-t] <-a DELETE -d docid>");
799 960
        System.err.println(
800
              "   OR: java DocumentImpl <-a READ -d docid>");
961
          "   OR: java DocumentImpl [-t] [-old] <-a READ -d docid>");
801 962
        return;
802 963
      }
803
 
964
      
965
      // Time the request if asked for
966
      double startTime = System.currentTimeMillis();
967
      
804 968
      // Open a connection to the database
805 969
      MetaCatUtil util = new MetaCatUtil();
806 970
      Connection dbconn = util.openDBConnection();
......
808 972
      // Execute the action requested (READ, INSERT, UPDATE, DELETE)
809 973
      if (action.equals("READ")) {
810 974
          DocumentImpl xmldoc = new DocumentImpl( dbconn, docid );
811
          System.out.println(xmldoc.toString());
975
          if (useOldReadAlgorithm) {
976
            System.out.println(xmldoc.readUsingSlowAlgorithm());
977
          } else {
978
            xmldoc.toXml(new PrintWriter(System.out));
979
          }
812 980
      } else if (action.equals("DELETE")) {
813 981
        DocumentImpl.delete(dbconn, docid, null, null);
814 982
        System.out.println("Document deleted: " + docid);
......
828 996
              + " (" + newdocid + ")");
829 997
      }
830 998

  
999
      double stopTime = System.currentTimeMillis();
1000
      double executionTime = (stopTime - startTime)/1000;
1001
      if (showRuntime) {
1002
        System.out.println("\n\nExecution time was: " + 
1003
                           executionTime + " seconds");
1004
      }
831 1005
    } catch (McdbException me) {
832 1006
      me.toXml(new PrintWriter(System.err));
833 1007
    } catch (AccessionNumberException ane) {
src/edu/ucsb/nceas/metacat/NodeComparator.java
16 16
import java.util.Comparator;
17 17

  
18 18
/**
19
 * A utility class that sorts two node records
19
 * A utility class that sorts two node records.  
20
 * <p>
21
 * The order of the records
22
 * determines how the XML document is printed from DocumentImpl.toXml(),
23
 * so it is important that the sort order specified here results in a depth
24
 * first traversal of the nodes in tree.  Currently, the nodes are inserted
25
 * into the database in this depth-forst order, so the nodeid identifiers
26
 * are a good indicator of the proper sort order.
27
 * <p>
28
 * However, if we modify data loading semantics to allow document nodes to
29
 * be rearranged, or otherwise change the nodeindex value, this current
30
 * sort algorithm will fail to work.
20 31
 */
21 32
public class NodeComparator implements Comparator {
22 33

  
......
49 60
  public int compare(NodeRecord o1, NodeRecord o2) {
50 61
    if (o1.nodeid == o2.nodeid) {
51 62
      return EQUALS;
63
    } else if (o1.nodeid < o2.nodeid) {
64
      return LESS;
65
    } else if (o1.nodeid > o2.nodeid) {
66
      return GREATER;
67

  
68
/*  // This is old code that used to sort the records into breadth-first
69
    // traversal order, based on the parentnodeid and the nodeindex.
70
    //
71
    if (o1.nodeid == o2.nodeid) {
72
      return EQUALS;
52 73
    } else if (o1.parentnodeid < o2.parentnodeid) {
53 74
      return LESS;
54 75
    } else if (o1.parentnodeid > o2.parentnodeid) {
......
62 83
        // this should never happen because (parentnodeid,nodeindex) is unique
63 84
        return EQUALS;
64 85
      }
86
*/
65 87
    } else {
66 88
      // this should never happen because parentnodeid is always <,>, or =
67 89
      return EQUALS;

Also available in: Unified diff