Java example - XmlParser.java

What this is

This file is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

The source code

// XmlParser.java: the main parser class.
// NO WARRANTY! See README, and copyright below.
// $Id: XmlParser.java,v 1.3 2004/02/22 20:00:47 spestov Exp $

package com.microstar.xml;

import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Stack;


/**
  * Parse XML documents and return parse events through call-backs.
  * You need to define a class implementing the XmlHandler
  * interface: an object belonging to this class will receive the
  * callbacks for the events.  (As an alternative to implementing
  * the full XmlHandler interface, you can simply extend the 
  * HandlerBase convenience class.)
  * 
  * <p>Usage (assuming that MyHandler is your implementation
  * of the XmlHandler interface):
  * 
  * <pre>
  * XmlHandler handler = new MyHandler();
  * XmlParser parser = new XmlParser();
  * parser.setHandler(handler);
  * try {
  *   parser.parse("http://www.host.com/doc.xml", null, null);
  * } catch (Exception e) {
  *   [do something interesting]
  * }
  * </pre>
  * 
  * Alternatively, you can use the standard SAX interfaces
  * with the SAXDriver class as your entry point.
  * @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
  * @author Written by David Megginson <dmeggins@microstar.com>
  * @version 1.1
  * @see XmlHandler
  * @see HandlerBase
  * @see SAXDriver
  */
public class XmlParser {

  //
  // Compile-time switch for speed-oriented shortcuts.
  //
  // Use special cheats that speed up the code (currently about 50%),
  // but may cause problems with future maintenance and add to the
  // class file size (about 500 bytes).
  //
  // NOTE(review): no use of this flag is visible in this part of the
  // file; the code it guards is presumably further down.
  //
  private final static boolean USE_CHEATS = true;



  //////////////////////////////////////////////////////////////////////
  // Constructors.
  ////////////////////////////////////////////////////////////////////////


  /**
    * Construct a new parser with no associated handler.
    * <p>The constructor itself does nothing; install a handler with
    * setHandler() before calling one of the parse() methods if you
    * want to receive events.
    * @see #setHandler
    * @see #parse
    */
  public XmlParser ()
  {
  }


  /**
    * Register the object that will be called back with parsing events.
    * <p>The reference is simply stored; the parsing code visible in this
    * file checks it for null before every callback, so a parser without
    * a handler still runs but reports nothing.
    * @param handler The handler to receive callback events.
    * @see #parse
    * @see XmlHandler
    */
  public void setHandler (XmlHandler handler) {
    this.handler = handler;
  }


  /**
    * Parse the XML document found at a URI.
    * <p>The same parser may be reused for several documents, but only
    * one thread may call this method for an object at one time.
    * @param systemId The URI of the document.
    * @param publicId The public identifier of the document, or null.
    * @param encoding The suggested encoding, or null if unknown.
    * @exception java.lang.Exception Any exception thrown by your
    *            own handlers, or any derivation of java.io.IOException
    *            thrown by the parser itself.
    */
  public void parse (String systemId, String publicId, String encoding)
    throws java.lang.Exception
  {
    // Neither a character stream nor a byte stream is supplied here.
    final Reader noReader = null;
    final InputStream noStream = null;

    doParse(systemId, publicId, noReader, noStream, encoding);
  }


  /**
    * Parse an XML document supplied as a byte stream.
    * <p>The URI that you supply will become the base URI for
    * resolving relative links, but the parser will actually read
    * the document from the supplied input stream.
    * <p>The same parser may be reused for several documents, but only
    * one thread may call this method for an object at one time.
    * @param systemId The base URI of the document, or null if not
    *                 known.
    * @param publicId The public identifier of the document, or null
    *                 if not known.
    * @param stream A byte input stream.
    * @param encoding The suggested encoding, or null if unknown.
    * @exception java.lang.Exception Any exception thrown by your
    *            own handlers, or any derivation of java.io.IOException
    *            thrown by the parser itself.
    */
  public void parse (String systemId, String publicId,
                     InputStream stream, String encoding)
    throws java.lang.Exception
  {
    // A byte stream was given, so there is no character stream.
    final Reader noReader = null;

    doParse(systemId, publicId, noReader, stream, encoding);
  }


  /**
    * Parse an XML document supplied as a character stream.
    * <p>The URI that you supply will become the base URI for
    * resolving relative links, but the parser will actually read
    * the document from the supplied reader.  No encoding is passed
    * on, because the reader already delivers characters.
    * <p>The same parser may be reused for several documents, but only
    * one thread may call this method for an object at one time.
    * @param systemId The base URI of the document, or null if not
    *                 known.
    * @param publicId The public identifier of the document, or null
    *                 if not known.
    * @param reader A character stream.
    * @exception java.lang.Exception Any exception thrown by your
    *            own handlers, or any derivation of java.io.IOException
    *            thrown by the parser itself.
    */
  public void parse (String systemId, String publicId, Reader reader)
    throws java.lang.Exception
  {
    final InputStream noStream = null;
    final String noEncoding = null;

    doParse(systemId, publicId, reader, noStream, noEncoding);
  }


  private synchronized void doParse (String systemId, String publicId,
				     Reader reader, InputStream stream,
				     String encoding)
    throws java.lang.Exception
  {
    basePublicId = publicId;
    baseURI = systemId;
    baseReader = reader;
    baseInputStream = stream;

    initializeVariables();

				// Set the default entities here.
    setInternalEntity(intern("amp"), "&");
    setInternalEntity(intern("lt"), "<");
    setInternalEntity(intern("gt"), ">");
    setInternalEntity(intern("apos"), "'");
    setInternalEntity(intern("quot"), """);

    if (handler != null) {
      handler.startDocument();
    }

    pushURL("[document]", basePublicId, baseURI, baseReader, baseInputStream,
	    encoding);

    parseDocument();

    if (handler != null) {
      handler.endDocument();
    }
    cleanupVariables();
  }



  ////////////////////////////////////////////////////////////////////////
  // Constants.
  ////////////////////////////////////////////////////////////////////////

  //
  // Constants for element content type.
  // These are the values that parseContentspec() records for a declared
  // element and that parseElement() reads back via getElementContentType().
  //

  /**
    * Constant: an element has not been declared.
    * <p>While parsing, parseElement() treats such an element as if it
    * had been declared with CONTENT_ANY.
    * @see #getElementContentType
    */
  public final static int CONTENT_UNDECLARED = 0;

  /**
    * Constant: the element has a content model of ANY.
    * @see #getElementContentType
    */
  public final static int CONTENT_ANY = 1;

  /**
    * Constant: the element has declared content of EMPTY.
    * @see #getElementContentType
    */
  public final static int CONTENT_EMPTY = 2;

  /**
    * Constant: the element has mixed content
    * (a content model that starts with #PCDATA).
    * @see #getElementContentType
    */
  public final static int CONTENT_MIXED = 3;

  /**
    * Constant: the element has element content
    * (a content model made of child elements only).
    * @see #getElementContentType
    */
  public final static int CONTENT_ELEMENTS = 4;


  //
  // Constants for the entity type.
  // parseEntityRef() and parsePEReference() switch on these values.
  //

  /**
    * Constant: the entity has not been declared.
    * <p>A reference to such an entity is reported as an error.
    * @see #getEntityType
    */
  public final static int ENTITY_UNDECLARED = 0;

  /**
    * Constant: the entity is internal
    * (its replacement text is a string held by the parser).
    * @see #getEntityType
    */
  public final static int ENTITY_INTERNAL = 1;

  /**
    * Constant: the entity is external, non-XML data.
    * <p>References to such entities are always reported as errors
    * by parseEntityRef().
    * @see #getEntityType
    */
  public final static int ENTITY_NDATA = 2;

  /**
    * Constant: the entity is external XML data
    * (read through its public/system identifier).
    * @see #getEntityType
    */
  public final static int ENTITY_TEXT = 3;


  //
  // Constants for attribute type.
  // readAttType() returns one of these for each attribute definition.
  //

  /**
    * Constant: the attribute has not been declared for this element type.
    * <p>Also returned by readAttType() after it has reported an
    * unrecognised type keyword.
    * @see #getAttributeType
    */
  public final static int ATTRIBUTE_UNDECLARED = 0;

  /**
    * Constant: the attribute value is a string value.
    * @see #getAttributeType
    */
  public final static int ATTRIBUTE_CDATA = 1;

  /**
    * Constant: the attribute value is a unique identifier.
    * @see #getAttributeType
    */
  public final static int ATTRIBUTE_ID = 2;

  /**
    * Constant: the attribute value is a reference to a unique identifier.
    * @see #getAttributeType
    */
  public final static int ATTRIBUTE_IDREF = 3;

  /**
    * Constant: the attribute value is a list of ID references.
    * @see #getAttributeType
    */
  public final static int ATTRIBUTE_IDREFS = 4;

  /**
    * Constant: the attribute value is the name of an entity.
    * @see #getAttributeType
    */
  public final static int ATTRIBUTE_ENTITY = 5;

  /**
    * Constant: the attribute value is a list of entity names.
    * @see #getAttributeType
    */
  public final static int ATTRIBUTE_ENTITIES = 6;

  /**
    * Constant: the attribute value is a name token.
    * @see #getAttributeType
    */
  public final static int ATTRIBUTE_NMTOKEN = 7;

  /**
    * Constant: the attribute value is a list of name tokens.
    * @see #getAttributeType
    */
  public final static int ATTRIBUTE_NMTOKENS = 8;

  /**
    * Constant: the attribute value is a token from an enumeration.
    * <p>This type has no keyword of its own: readAttType() returns it
    * when the type is written as a parenthesised list of tokens.
    * @see #getAttributeType
    */
  public final static int ATTRIBUTE_ENUMERATED = 9;

  /**
    * Constant: the attribute is the name of a notation.
    * @see #getAttributeType
    */
  public final static int ATTRIBUTE_NOTATION = 10;


  //
  // The keyword-to-type table is filled once, when the class is loaded.
  //

  /**
    * Hash table of attribute types: maps each type keyword that can
    * appear in an attribute definition to the matching ATTRIBUTE_*
    * constant, boxed as an Integer.  readAttType() looks keywords up
    * here.
    */
  private static Hashtable attributeTypeHash;
  static {
    // Parallel arrays: keywords[i] is registered with codes[i].
    final String keywords[] = {
      "CDATA", "ID", "IDREF", "IDREFS", "ENTITY",
      "ENTITIES", "NMTOKEN", "NMTOKENS", "NOTATION"
    };
    final int codes[] = {
      ATTRIBUTE_CDATA, ATTRIBUTE_ID, ATTRIBUTE_IDREF, ATTRIBUTE_IDREFS,
      ATTRIBUTE_ENTITY, ATTRIBUTE_ENTITIES, ATTRIBUTE_NMTOKEN,
      ATTRIBUTE_NMTOKENS, ATTRIBUTE_NOTATION
    };

    Hashtable table = new Hashtable();
    for (int i = 0; i < keywords.length; i++) {
      table.put(keywords[i], new Integer(codes[i]));
    }
    attributeTypeHash = table;
  }


  //
  // Constants for supported encodings.
  // checkEncoding() treats UTF_8 (which it may switch to ISO_8859_1) as
  // the 8-bit family, the two UCS_2 values as the 16-bit family and the
  // four UCS_4 values as the 32-bit family.
  // NOTE(review): the numeric suffixes presumably give the byte order
  // in which the octets of a character arrive -- confirm against the
  // encoding-detection code, which is not visible here.
  //
  private final static int ENCODING_UTF_8 = 1;
  private final static int ENCODING_ISO_8859_1 = 2;
  private final static int ENCODING_UCS_2_12 = 3;
  private final static int ENCODING_UCS_2_21 = 4;
  private final static int ENCODING_UCS_4_1234 = 5;
  private final static int ENCODING_UCS_4_4321 = 6;
  private final static int ENCODING_UCS_4_2143 = 7;
  private final static int ENCODING_UCS_4_3412 = 8;


  //
  // Constants for attribute default value.
  // parseDefault() chooses one of these for every attribute definition
  // and stores it through setAttribute().
  //

  /**
    * Constant: the attribute is not declared.
    * @see #getAttributeDefaultValueType
    */
  public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0;

  /**
    * Constant: the attribute has a literal default value specified.
    * <p>This is what parseDefault() records when the default does not
    * start with a '#' keyword.
    * @see #getAttributeDefaultValueType
    * @see #getAttributeDefaultValue
    */
  public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1;

  /**
    * Constant: the attribute was declared #IMPLIED.
    * @see #getAttributeDefaultValueType
    */
  public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2;

  /**
    * Constant: the attribute was declared #REQUIRED.
    * @see #getAttributeDefaultValueType
    */
  public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3;

  /**
    * Constant: the attribute was declared #FIXED
    * (the keyword is followed by the fixed literal value).
    * @see #getAttributeDefaultValueType
    * @see #getAttributeDefaultValue
    */
  public final static int ATTRIBUTE_DEFAULT_FIXED = 4;


  //
  // Constants for input.
  // NOTE(review): none of these is used in the part of the file visible
  // here; they presumably tag the kind of the current input source.
  //
  private final static int INPUT_NONE = 0;
  private final static int INPUT_INTERNAL = 1;
  private final static int INPUT_EXTERNAL = 2;
  private final static int INPUT_STREAM = 3;
  private final static int INPUT_BUFFER = 4;
  private final static int INPUT_READER = 5;


  //
  // Flags for reading literals.
  // These are bit flags: callers OR them together and pass the result
  // to readLiteral() (0 means "no flags").
  //
  private final static int LIT_CHAR_REF = 1;
  private final static int LIT_ENTITY_REF = 2;
  private final static int LIT_PE_REF = 4;
  private final static int LIT_NORMALIZE = 8;


  //
  // Flags for parsing context.
  // The parser stores one of these in its 'context' field around the
  // calls that read DTD markup and attribute default values.
  //
  private final static int CONTEXT_NONE = 0;
  private final static int CONTEXT_DTD = 1;
  private final static int CONTEXT_ENTITYVALUE = 2;
  private final static int CONTEXT_ATTRIBUTEVALUE = 3;


  //////////////////////////////////////////////////////////////////////
  // Error reporting.
  //////////////////////////////////////////////////////////////////////


  /**
    * Report an error to the handler.
    * <p>The error counter is always incremented.  The message is
    * extended with the offending and the expected text when they are
    * given, and is then passed to the handler (if there is one)
    * together with the URI of the current external entity (or null)
    * and the current line and column.
    * @param message The error message.
    * @param textFound The text that caused the error (or null).
    * @param textExpected The text that was expected instead (or null).
    * @exception java.lang.Exception Whatever the handler throws.
    * @see XmlHandler#error
    * @see #line
    */
  void error (String message, String textFound, String textExpected)
    throws java.lang.Exception
  {
    errorCount++;

    String report = message;
    if (textFound != null) {
      report = report + " (found \"" + textFound + "\")";
    }
    if (textExpected != null) {
      report = report + " (expected \"" + textExpected + "\")";
    }

    // Without a handler there is nobody to tell.
    if (handler == null) {
      return;
    }

    String uri = (externalEntity == null)
      ? null
      : externalEntity.getURL().toString();
    handler.error(report, uri, line, column);
  }


  /**
    * Report a serious error when the offending text is a single
    * character.  The character is turned into a one-character string
    * and the call is forwarded to the String-based variant.
    * @param message The error message.
    * @param textFound The character that caused the error.
    * @param textExpected The text that was expected instead (or null).
    * @exception java.lang.Exception Whatever the handler throws.
    */
  void error (String message, char textFound, String textExpected)
    throws java.lang.Exception
  {
    final String found = String.valueOf(textFound);
    error(message, found, textExpected);
  }



  //////////////////////////////////////////////////////////////////////
  // Major syntactic productions.
  //////////////////////////////////////////////////////////////////////


  /**
    * Parse an XML document.
    * <pre>
    * [1] document ::= prolog element Misc*
    * </pre>
    * <p>Top-level parsing function for one document: the prolog, then
    * the document element, then any trailing whitespace, processing
    * instructions and comments.  Reaching end of input after that is
    * the normal way out; any further character is reported as an
    * error.
    */
  void parseDocument ()
    throws java.lang.Exception
  {
    parseProlog();
    require('<');
    parseElement();

    try {
      // Skip all whitespace, PIs, and comments after the root element.
      parseMisc();
      // If this read succeeds, something is left over after the end.
      char leftover = readCh();
      error("unexpected characters after document end", leftover, null);
    } catch (EOFException e) {
      // End of input is exactly what we expect here.
    }
  }


  /**
    * Skip a comment: discard input up to and including the closing
    * delimiter.  No event is generated.
    * <pre>
    * [18] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "--&gt;"
    * </pre>
    * <p>(The &lt;!-- has already been read.)
    */
  void parseComment ()
    throws java.lang.Exception
  {
    final String commentClose = "-->";
    skipUntil(commentClose);
  }


  /**
    * Parse a processing instruction and do a call-back.
    * <pre>
    * [19] PI ::= '&lt;?' Name (S (Char* - (Char* '?&gt;' Char*)))? '?&gt;'
    * </pre>
    * <p>(The &lt;? has already been read.)
    * <p>An XML processing instruction must begin with a Name, which is
    * the instruction's target.  Unless the closing delimiter follows
    * the target directly, whitespace is required and the rest is
    * collected up to the closing delimiter.  The handler, if any,
    * receives the target and the contents of the data buffer.
    */
  void parsePI ()
    throws java.lang.Exception
  {
    final String target = readNmtoken(true);

    boolean closedImmediately = tryRead("?>");
    if (!closedImmediately) {
      requireWhitespace();
      parseUntil("?>");
    }

    if (handler == null) {
      return;
    }
    handler.processingInstruction(target, dataBufferToString());
  }


  /**
    * Parse a CDATA marked section.
    * <pre>
    * [20] CDSect ::= CDStart CData CDEnd
    * [21] CDStart ::= '&lt;![CDATA['
    * [22] CData ::= (Char* - (Char* ']]&gt;' Char*))
    * [23] CDEnd ::= ']]&gt;'
    * </pre>
    * <p>(The '&lt;![CDATA[' has already been read.)
    * <p>Note that this just appends characters to the dataBuffer,
    * without actually generating an event.
    */
  void parseCDSect ()
    throws java.lang.Exception
  {
    final String sectionClose = "]]>";
    parseUntil(sectionClose);
  }


  /**
    * Parse the prolog of an XML document.
    * 
    * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
    * 
    * There are a couple of tricks here.  First, it is necessary to
    * declare the XML default attributes after the DTD (if present)
    * has been read.  Second, it is not possible to expand general
    * references in attribute value literals until after the entire
    * DTD (if present) has been parsed.
    * 
We do not look for the XML declaration here, because it is
    * handled by pushURL().
    * @see pushURL
    */
  void parseProlog ()
    throws java.lang.Exception
  {
    parseMisc();

    if (tryRead("
    * [25] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
    * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
    * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
    *               | S 'standalone' Eq '"' ("yes" | "no") '"'
    * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
    *

([80] to [82] are also significant.) *

(The <?xml and whitespace have already been read.) *

TODO: validate value of standalone. * @see #parseTextDecl * @see #checkEncoding */ void parseXMLDecl (boolean ignoreEncoding) throws java.lang.Exception { String version; String encodingName = null; String standalone = null; // Read the version. require("version"); parseEq(); version = readLiteral(0); if (!version.equals("1.0")) { error("unsupported XML version", version, "1.0"); } // Try reading an encoding declaration. skipWhitespace(); if (tryRead("encoding")) { parseEq(); encodingName = readLiteral(0); checkEncoding(encodingName, ignoreEncoding); } // Try reading a standalone declaration skipWhitespace(); if (tryRead("standalone")) { parseEq(); standalone = readLiteral(0); } skipWhitespace(); require("?>"); } /** * Parse the Encoding PI. *

    * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
    * [79] EncodingPI ::= '<?xml' S 'encoding' Eq QEncoding S? '?>'
    * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
    * [81] Encoding ::= LatinName
    * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
    *

(The <?xml' and whitespace have already been read.) * @see #parseXMLDecl * @see #checkEncoding */ void parseTextDecl (boolean ignoreEncoding) throws java.lang.Exception { String encodingName = null; // Read an optional version. if (tryRead("version")) { String version; parseEq(); version = readLiteral(0); if (!version.equals("1.0")) { error("unsupported XML version", version, "1.0"); } requireWhitespace(); } // Read the encoding. require("encoding"); parseEq(); encodingName = readLiteral(0); checkEncoding(encodingName, ignoreEncoding); skipWhitespace(); require("?>"); } /** * Check that the encoding specified makes sense. *

Compare what the author has specified in the XML declaration * or encoding PI with what we have detected. *

This is also important for distinguishing among the various * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect * those). * @param encodingName The name of the encoding specified by the user. * @see #parseXMLDecl * @see #parseTextDecl */ void checkEncoding (String encodingName, boolean ignoreEncoding) throws java.lang.Exception { encodingName = encodingName.toUpperCase(); if (ignoreEncoding) { return; } switch (encoding) { // 8-bit encodings case ENCODING_UTF_8: if (encodingName.equals("ISO-8859-1")) { encoding = ENCODING_ISO_8859_1; } else if (!encodingName.equals("UTF-8")) { error("unsupported 8-bit encoding", encodingName, "UTF-8 or ISO-8859-1"); } break; // 16-bit encodings case ENCODING_UCS_2_12: case ENCODING_UCS_2_21: if (!encodingName.equals("ISO-10646-UCS-2") && !encodingName.equals("UTF-16")) { error("unsupported 16-bit encoding", encodingName, "ISO-10646-UCS-2"); } break; // 32-bit encodings case ENCODING_UCS_4_1234: case ENCODING_UCS_4_4321: case ENCODING_UCS_4_2143: case ENCODING_UCS_4_3412: if (!encodingName.equals("ISO-10646-UCS-4")) { error("unsupported 32-bit encoding", encodingName, "ISO-10646-UCS-4"); } } } /** * Parse miscellaneous markup outside the document element and DOCTYPE * declaration. *

    * [27] Misc ::= Comment | PI | S
    *

*/ void parseMisc () throws java.lang.Exception { while (true) { skipWhitespace(); if (tryRead(" * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? * ('[' %markupdecl* ']' S?)? '>' * *

(The <!DOCTYPE has already been read.) */ void parseDoctypedecl () throws java.lang.Exception { char c; String doctypeName, ids[]; // Read the document type name. requireWhitespace(); doctypeName = readNmtoken(true); // Read the ExternalIDs. skipWhitespace(); ids = readExternalIds(false); // Look for a declaration subset. skipWhitespace(); if (tryRead('[')) { // loop until the subset ends while (true) { context = CONTEXT_DTD; skipWhitespace(); context = CONTEXT_NONE; if (tryRead(']')) { break; // end of subset } else { context = CONTEXT_DTD; parseMarkupdecl(); context = CONTEXT_NONE; } } } // Read the external subset, if any if (ids[1] != null) { pushURL("[external subset]", ids[0], ids[1], null, null, null); // Loop until we end up back at '>' while (true) { context = CONTEXT_DTD; skipWhitespace(); context = CONTEXT_NONE; if (tryRead('>')) { break; } else { context = CONTEXT_DTD; parseMarkupdecl(); context = CONTEXT_NONE; } } } else { // No external subset. skipWhitespace(); require('>'); } if (handler != null) { handler.doctypeDecl(doctypeName, ids[0], ids[1]); } // Expand general entities in // default values of attributes. // (Do this after the doctypeDecl // event!). // expandAttributeDefaultValues(); } /** * Parse a markup declaration in the internal or external DTD subset. *

    * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl |
    *                       %NotationDecl | %PI | %S | %Comment |
    *                       InternalPERef )
    * [30] InternalPERef ::= PEReference
    * [31] extSubset ::= (%markupdecl | %conditionalSect)*
    *

*/ void parseMarkupdecl () throws java.lang.Exception { if (tryRead(" * [33] STag ::= '<' Name (S Attribute)* S? '>' [WFC: unique Att spec] * [38] element ::= EmptyElement | STag content ETag * [39] EmptyElement ::= '<' Name (S Attribute)* S? '/>' * [WFC: unique Att spec] * *

(The '<' has already been read.) *

NOTE: this method actually chains onto parseContent(), if necessary, * and parseContent() will take care of calling parseETag(). */ void parseElement () throws java.lang.Exception { String gi; char c; int oldElementContent = currentElementContent; String oldElement = currentElement; // This is the (global) counter for the // array of specified attributes. tagAttributePos = 0; // Read the element type name. gi = readNmtoken(true); // Determine the current content type. currentElement = gi; currentElementContent = getElementContentType(gi); if (currentElementContent == CONTENT_UNDECLARED) { currentElementContent = CONTENT_ANY; } // Read the attributes, if any. // After this loop, we should be just // in front of the closing delimiter. skipWhitespace(); c = readCh(); while (c != '/' && c != '>') { unread(c); parseAttribute(gi); skipWhitespace(); c = readCh(); } unread(c); // Supply any defaulted attributes. Enumeration atts = declaredAttributes(gi); if (atts != null) { String aname; loop: while (atts.hasMoreElements()) { aname = (String)atts.nextElement(); // See if it was specified. for (int i = 0; i < tagAttributePos; i++) { if (tagAttributes[i] == aname) { continue loop; } } // I guess not... if (handler != null) { handler.attribute(aname, getAttributeExpandedValue(gi, aname), false); } } } // Figure out if this is a start tag // or an empty element, and dispatch an // event accordingly. c = readCh(); switch (c) { case '>': if (handler != null) { handler.startElement(gi); } parseContent(); break; case '/': require('>'); if (handler != null) { handler.startElement(gi); handler.endElement(gi); } break; } // Restore the previous state. currentElement = oldElement; currentElementContent = oldElementContent; } /** * Parse an attribute assignment. *

    * [34] Attribute ::= Name Eq AttValue
    *

* @param name The name of the attribute's element. * @see XmlHandler#attribute */ void parseAttribute (String name) throws java.lang.Exception { String aname; int type; String value; // Read the attribute name. aname = readNmtoken(true).intern(); type = getAttributeDefaultValueType(name, aname); // Parse '=' parseEq(); // Read the value, normalizing whitespace // if it is not CDATA. if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) { value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF); } else { value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF | LIT_NORMALIZE); } // Inform the handler about the // attribute. if (handler != null) { handler.attribute(aname, value, true); } dataBufferPos = 0; // Note that the attribute has been // specified. if (tagAttributePos == tagAttributes.length) { String newAttrib[] = new String[tagAttributes.length * 2]; System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos); tagAttributes = newAttrib; } tagAttributes[tagAttributePos++] = aname; } /** * Parse an equals sign surrounded by optional whitespace. * [35] Eq ::= S? '=' S? */ void parseEq () throws java.lang.Exception { skipWhitespace(); require('='); skipWhitespace(); } /** * Parse an end tag. * [36] ETag ::= '' * *NOTE: parseContent() chains to here. */ void parseETag () throws java.lang.Exception { String name; name = readNmtoken(true); if (name != currentElement) { error("mismatched end tag", name, currentElement); } skipWhitespace(); require('>'); if (handler != null) { handler.endElement(name); } } /** * Parse the content of an element. 
* [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)* * [68] Reference ::= EntityRef | CharRef */ void parseContent () throws java.lang.Exception { String data; char c; while (true) { switch (currentElementContent) { case CONTENT_ANY: case CONTENT_MIXED: parsePCData(); break; case CONTENT_ELEMENTS: parseWhitespace(); break; } // Handle delimiters c = readCh(); switch (c) { case '&': // Found "&" c = readCh(); if (c == '#') { parseCharRef(); } else { unread(c); parseEntityRef(true); } break; case '<': // Found "<" c = readCh(); switch (c) { case '!': // Found "' * [VC: Unique Element Declaration] * *NOTE: the ''); }
  // NOTE(review): the line above is extraction damage and is kept
  // verbatim.  The rest of parseContent() (everything after the '<!'
  // case) and the whole element-declaration method that followed it
  // were stripped as HTML and cannot be rebuilt from this file; restore
  // them from the upstream source.


  /**
    * Content specification.
    * [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
    * @param name The name of the element type being declared.
    */
  void parseContentspec (String name)
    throws java.lang.Exception
  {
    if (tryRead("EMPTY")) {
      setElement(name, CONTENT_EMPTY, null, null);
      return;
    } else if (tryRead("ANY")) {
      setElement(name, CONTENT_ANY, null, null);
      return;
    } else {
      // The model text is rebuilt in the data buffer as it is parsed.
      require('(');
      dataBufferAppend('(');
      skipWhitespace();
      if (tryRead("#PCDATA")) {
        dataBufferAppend("#PCDATA");
        parseMixed();
        setElement(name, CONTENT_MIXED, dataBufferToString(), null);
      } else {
        parseElements();
        setElement(name, CONTENT_ELEMENTS, dataBufferToString(), null);
      }
    }
  }


  /**
    * Parse an element-content model.
    * [42] elements ::= (choice | seq) ('?' | '*' | '+')?
    * [44] cps ::= S? %cp S?
    * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')'
    * [46] ctokplus ::= cps ('|' cps)+
    * [47] ctoks ::= cps ('|' cps)*
    * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')'
    * [49] stoks ::= cps (',' cps)*
    * *NOTE: the opening '(' and S have already been read.
    * *TODO: go over parameter entity boundaries more carefully.
    */
  void parseElements ()
    throws java.lang.Exception
  {
    char c;
    char sep;

    // Parse the first content particle
    skipWhitespace();
    parseCp();

    // Check for end or for a separator.
    skipWhitespace();
    c = readCh();
    switch (c) {
    case ')':
      dataBufferAppend(')');
      c = readCh();
      switch (c) {
      case '*':
      case '+':
      case '?':
        dataBufferAppend(c);
        break;
      default:
        unread(c);
      }
      return;
    case ',':                   // Register the separator.
    case '|':
      sep = c;
      dataBufferAppend(c);
      break;
    default:
      error("bad separator in content model", c, null);
      return;
    }

    // Parse the rest of the content model; every later separator
    // must be the same as the first one.
    while (true) {
      skipWhitespace();
      parseCp();
      skipWhitespace();
      c = readCh();
      if (c == ')') {
        dataBufferAppend(')');
        break;
      } else if (c != sep) {
        error("bad separator in content model", c, null);
        return;
      } else {
        dataBufferAppend(c);
      }
    }

    // Check for the occurrence indicator.
    c = readCh();
    switch (c) {
    case '?':
    case '*':
    case '+':
      dataBufferAppend(c);
      return;
    default:
      unread(c);
      return;
    }
  }


  /**
    * Parse a content particle.
    * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+')
    * *NOTE: I actually use a slightly different production here:
    *        cp ::= (elements | (Name ('?' | '*' | '+')?))
    */
  void parseCp ()
    throws java.lang.Exception
  {
    char c;

    if (tryRead('(')) {
      dataBufferAppend('(');
      parseElements();
    } else {
      dataBufferAppend(readNmtoken(true));
      c = readCh();
      switch (c) {
      case '?':
      case '*':
      case '+':
        dataBufferAppend(c);
        break;
      default:
        unread(c);
        break;
      }
    }
  }


  /**
    * Parse mixed content.
    * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*'
    *              | '(' S? %('#PCDATA') S? ')'
    * [51] Mtoks ::= %Name (S? '|' S? %Name)*
    * *NOTE: the S and '#PCDATA' have already been read.
    */
  void parseMixed ()
    throws java.lang.Exception
  {
    // Check for PCDATA alone.  The recorded model always ends in ")*",
    // whether or not the optional '*' is present in the input.
    skipWhitespace();
    if (tryRead(')')) {
      dataBufferAppend(")*");
      tryRead('*');
      return;
    }

    // Parse mixed content.
    skipWhitespace();
    while (!tryRead(")*")) {
      require('|');
      dataBufferAppend('|');
      skipWhitespace();
      dataBufferAppend(readNmtoken(true));
      skipWhitespace();
    }
    dataBufferAppend(")*");
  }


  /**
    * Parse an attribute list declaration.
* [52] AttlistDecl ::= '' * *NOTE: the '')) { parseAttDef(elementName); skipWhitespace(); } } /** * Parse a single attribute definition. * [53] AttDef ::= S %Name S %AttType S %Default */ void parseAttDef (String elementName) throws java.lang.Exception { String name; int type; String enumeration = null; // Read the attribute name. name = readNmtoken(true); // Read the attribute type. requireWhitespace(); type = readAttType(); // Get the string of enumerated values // if necessary. if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) { enumeration = dataBufferToString(); } // Read the default value. requireWhitespace(); parseDefault(elementName, name, type, enumeration); } /** * Parse the attribute type. * [54] AttType ::= StringType | TokenizedType | EnumeratedType * [55] StringType ::= 'CDATA' * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' | * 'NMTOKEN' | 'NMTOKENS' * [57] EnumeratedType ::= NotationType | Enumeration * *TODO: validate the type!! */ int readAttType () throws java.lang.Exception { String typeString; Integer type; if (tryRead('(')) { parseEnumeration(); return ATTRIBUTE_ENUMERATED; } else { typeString = readNmtoken(true); if (typeString.equals("NOTATION")) { parseNotationType(); } type = (Integer)attributeTypeHash.get(typeString); if (type == null) { error("illegal attribute type", typeString, null); return ATTRIBUTE_UNDECLARED; } else { return type.intValue(); } } } /** * Parse an enumeration. * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')' * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)* * *NOTE: the '(' has already been read. */ void parseEnumeration () throws java.lang.Exception { char c; dataBufferAppend('('); // Read the first token. skipWhitespace(); dataBufferAppend(readNmtoken(true)); // Read the remaining tokens. 
skipWhitespace(); while (!tryRead(')')) { require('|'); dataBufferAppend('|'); skipWhitespace(); dataBufferAppend(readNmtoken(true)); skipWhitespace(); } dataBufferAppend(')'); } /** * Parse a notation type for an attribute. * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)* * S? ')' * [59] Ntoks ::= %Name (S? '|' S? %Name) * *NOTE: the 'NOTATION' has already been read */ void parseNotationType () throws java.lang.Exception { requireWhitespace(); require('('); parseEnumeration(); } /** * Parse the default value for an attribute. * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? %AttValue */ void parseDefault (String elementName, String name, int type, String enumeration) throws java.lang.Exception { int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; String value = null; boolean normalizeWSFlag; if (tryRead('#')) { if (tryRead("FIXED")) { valueType = ATTRIBUTE_DEFAULT_FIXED; requireWhitespace(); context = CONTEXT_ATTRIBUTEVALUE; value = readLiteral(LIT_CHAR_REF); context = CONTEXT_DTD; } else if (tryRead("REQUIRED")) { valueType = ATTRIBUTE_DEFAULT_REQUIRED; } else if (tryRead("IMPLIED")) { valueType = ATTRIBUTE_DEFAULT_IMPLIED; } else { error("illegal keyword for attribute default value", null, null); } } else { context = CONTEXT_ATTRIBUTEVALUE; value = readLiteral(LIT_CHAR_REF); context = CONTEXT_DTD; } setAttribute(elementName, name, type, enumeration, value, valueType); } /** * Parse a conditional section. 
* [63] conditionalSect ::= includeSect || ignoreSect * [64] includeSect ::= '' * [65] ignoreSect ::= '' * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]>')) * | ('') * | (Char - (']' | [<'"])) * | ('")) { parseMarkupdecl(); skipWhitespace(); } } else if (tryRead("IGNORE")) { skipWhitespace(); require('['); int nesting = 1; char c; for (int nest = 1; nest > 0; ) { c = readCh(); switch (c) { case '<': if (tryRead("![")) { nest++; } case ']': if (tryRead("]>")) { nest--; } } } } else { error("conditional section must begin with INCLUDE or IGNORE", null, null); } } /** * Read a character reference. * [67] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' * *NOTE: the '&#' has already been read. */ void parseCharRef () throws java.lang.Exception { int value = 0; char c; if (tryRead('x')) { loop1: while (true) { c = readCh(); switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case 'a': case 'A': case 'b': case 'B': case 'c': case 'C': case 'd': case 'D': case 'e': case 'E': case 'f': case 'F': value *= 16; value += Integer.parseInt(new Character(c).toString(), 16); break; case ';': break loop1; default: error("illegal character in character reference", c, null); break loop1; } } } else { loop2: while (true) { c = readCh(); switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': value *= 10; value += Integer.parseInt(new Character(c).toString(), 10); break; case ';': break loop2; default: error("illegal character in character reference", c, null); break loop2; } } } // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: if (value <= 0x0000ffff) { // no surrogates needed dataBufferAppend((char)value); } else if (value <= 0x000fffff) { // > 16 bits, surrogate needed dataBufferAppend((char)(0xd8 | ((value & 0x000ffc00) >> 10))); dataBufferAppend((char)(0xdc | (value & 0x0003ff))); } 
else { // too big for surrogate error("character reference " + value + " is too large for UTF-16", new Integer(value).toString(), null); } } /** * Parse a reference. * [69] EntityRef ::= '&' Name ';' * *NOTE: the '&' has already been read. * @param externalAllowed External entities are allowed here. */ void parseEntityRef (boolean externalAllowed) throws java.lang.Exception { String name; name = readNmtoken(true); require(';'); switch (getEntityType(name)) { case ENTITY_UNDECLARED: error("reference to undeclared entity", name, null); break; case ENTITY_INTERNAL: pushString(name, getEntityValue(name)); break; case ENTITY_TEXT: if (externalAllowed) { pushURL(name, getEntityPublicId(name), getEntitySystemId(name), null, null, null); } else { error("reference to external entity in attribute value.", name, null); } break; case ENTITY_NDATA: if (externalAllowed) { error("data entity reference in content", name, null); } else { error("reference to external entity in attribute value.", name, null); } break; } } /** * Parse a parameter entity reference. * [70] PEReference ::= '%' Name ';' * *NOTE: the '%' has already been read. */ void parsePEReference (boolean isEntityValue) throws java.lang.Exception { String name; name = "%" + readNmtoken(true); require(';'); switch (getEntityType(name)) { case ENTITY_UNDECLARED: error("reference to undeclared parameter entity", name, null); break; case ENTITY_INTERNAL: if (isEntityValue) { pushString(name, getEntityValue(name)); } else { pushString(name, " " + getEntityValue(name) + ' '); } break; case ENTITY_TEXT: if (isEntityValue) { pushString(null, " "); } pushURL(name, getEntityPublicId(name), getEntitySystemId(name), null, null, null); if (isEntityValue) { pushString(null, " "); } break; } } /** * Parse an entity declaration. * [71] EntityDecl ::= '' * | '' * [72] EntityDef ::= EntityValue | ExternalDef * [73] ExternalDef ::= ExternalID %NDataDecl? 
* [74] ExternalID ::= 'SYSTEM' S SystemLiteral * | 'PUBLIC' S PubidLiteral S SystemLiteral * [75] NDataDecl ::= S %'NDATA' S %Name * *NOTE: the ''); } /** * Parse a notation declaration. * [81] NotationDecl ::= '' * *NOTE: the ''); } /** * Parse PCDATA. *

    * [16] PCData ::= [^<&]*
    *

The trick here is that the data stays in the dataBuffer without * necessarily being converted to a string right away. */ void parsePCData () throws java.lang.Exception { char c; // Start with a little cheat -- in most // cases, the entire sequence of // character data will already be in // the readBuffer; if not, fall through to // the normal approach. if (USE_CHEATS) { int lineAugment = 0; int columnAugment = 0; loop: for (int i = readBufferPos; i < readBufferLength; i++) { switch (readBuffer[i]) { case '\n': lineAugment++; columnAugment = 0; break; case '&': case '<': int start = readBufferPos; columnAugment++; readBufferPos = i; if (lineAugment > 0) { line += lineAugment; column = columnAugment; } else { column += columnAugment; } dataBufferAppend(readBuffer, start, i-start); return; default: columnAugment++; } } } // OK, the cheat didn't work; start over // and do it by the book. while (true) { c = readCh(); switch (c) { case '<': case '&': unread(c); return; default: dataBufferAppend(c); break; } } } ////////////////////////////////////////////////////////////////////// // High-level reading and scanning methods. ////////////////////////////////////////////////////////////////////// /** * Require whitespace characters. * [1] S ::= (#x20 | #x9 | #xd | #xa)+ */ void requireWhitespace () throws java.lang.Exception { char c = readCh(); if (isWhitespace(c)) { skipWhitespace(); } else { error("whitespace expected", c, null); } } /** * Parse whitespace characters, and leave them in the data buffer. */ void parseWhitespace () throws java.lang.Exception { char c = readCh(); while (isWhitespace(c)) { dataBufferAppend(c); c = readCh(); } unread(c); } /** * Skip whitespace characters. * [1] S ::= (#x20 | #x9 | #xd | #xa)+ */ void skipWhitespace () throws java.lang.Exception { // Start with a little cheat. Most of // the time, the white space will fall // within the current read buffer; if // not, then fall through. 
if (USE_CHEATS) { int lineAugment = 0; int columnAugment = 0; loop: for (int i = readBufferPos; i < readBufferLength; i++) { switch (readBuffer[i]) { case ' ': case '\t': case '\r': columnAugment++; break; case '\n': lineAugment++; columnAugment = 0; break; case '%': if (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE) { break loop; } // else fall through... default: readBufferPos = i; if (lineAugment > 0) { line += lineAugment; column = columnAugment; } else { column += columnAugment; } return; } } } // OK, do it by the book. char c = readCh(); while (isWhitespace(c)) { c = readCh(); } unread(c); } /** * Read a name or name token. * [5] Name ::= (Letter | '_' | ':') (NameChar)* * [7] Nmtoken ::= (NameChar)+ * *NOTE: [6] is implemented implicitly where required. */ String readNmtoken (boolean isName) throws java.lang.Exception { char c; if (USE_CHEATS) { loop: for (int i = readBufferPos; i < readBufferLength; i++) { switch (readBuffer[i]) { case '%': if (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE) { break loop; } // else fall through... case '<': case '>': case '&': case ',': case '|': case '*': case '+': case '?': case ')': case '=': case '\'': case '"': case '[': case ' ': case '\t': case '\r': case '\n': case ';': case '/': case '#': int start = readBufferPos; if (i == start) { error("name expected", readBuffer[i], null); } readBufferPos = i; return intern(readBuffer, start, i - start); } } } nameBufferPos = 0; // Read the first character. 
loop: while (true) { c = readCh(); switch (c) { case '%': case '<': case '>': case '&': case ',': case '|': case '*': case '+': case '?': case ')': case '=': case '\'': case '"': case '[': case ' ': case '\t': case '\n': case '\r': case ';': case '/': unread(c); if (nameBufferPos == 0) { error("name expected", null, null); } String s = intern(nameBuffer,0,nameBufferPos); nameBufferPos = 0; return s; default: nameBuffer = (char[])extendArray(nameBuffer, nameBuffer.length, nameBufferPos); nameBuffer[nameBufferPos++] = c; } } } /** * Read a literal. * [10] AttValue ::= '"' ([^<&"] | Reference)* '"' * | "'" ([^<&'] | Reference)* "'" * [11] SystemLiteral ::= '"' URLchar* '"' | "'" (URLchar - "'")* "'" * [13] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" * [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' * | "'" ([^%&'] | PEReference | Reference)* "'" */ String readLiteral (int flags) throws java.lang.Exception { char delim, c; int startLine = line; // Find the delimiter. delim = readCh(); if (delim != '"' && delim != '\'' && delim != (char)0) { error("expected '\"' or \"'\"", delim, null); return null; } // Read the literal. try { c = readCh(); loop: while (c != delim) { switch (c) { // Literals never have line ends case '\n': case '\r': c = ' '; break; // References may be allowed case '&': if ((flags & LIT_CHAR_REF) > 0) { c = readCh(); if (c == '#') { parseCharRef(); c = readCh(); continue loop; // check the next character } else if ((flags & LIT_ENTITY_REF) > 0) { unread(c); parseEntityRef(false); c = readCh(); continue loop; } else { dataBufferAppend('&'); } } break; default: break; } dataBufferAppend(c); c = readCh(); } } catch (EOFException e) { error("end of input while looking for delimiter (started on line " + startLine + ')', null, new Character(delim).toString()); } // Normalise whitespace if necessary. if ((flags & LIT_NORMALIZE) > 0) { dataBufferNormalize(); } // Return the value. 
return dataBufferToString(); } /** * Try reading external identifiers. *

The system identifier is not required for notations. * @param inNotation Are we in a notation? * @return A two-member String array containing the identifiers. */ String[] readExternalIds (boolean inNotation) throws java.lang.Exception { char c; String ids[] = new String[2]; if (tryRead("PUBLIC")) { requireWhitespace(); ids[0] = readLiteral(LIT_NORMALIZE); // public id if (inNotation) { skipWhitespace(); if (tryRead('"') || tryRead('\'')) { ids[1] = readLiteral(0); } } else { requireWhitespace(); ids[1] = readLiteral(0); // system id } } else if (tryRead("SYSTEM")) { requireWhitespace(); ids[1] = readLiteral(0); // system id } return ids; } /** * Test if a character is whitespace. *

    * [1] S ::= (#x20 | #x9 | #xd | #xa)+
    *

* @param c The character to test. * @return true if the character is whitespace. */ final boolean isWhitespace (char c) { switch ((int)c) { case 0x20: case 0x09: case 0x0d: case 0x0a: return true; default: return false; } } ////////////////////////////////////////////////////////////////////// // Utility routines. ////////////////////////////////////////////////////////////////////// /** * Add a character to the data buffer. */ void dataBufferAppend (char c) { // Expand buffer if necessary. dataBuffer = (char[])extendArray(dataBuffer, dataBuffer.length, dataBufferPos); dataBuffer[dataBufferPos++] = c; } /** * Add a string to the data buffer. */ void dataBufferAppend (String s) { dataBufferAppend(s.toCharArray(), 0, s.length()); } /** * Append (part of) a character array to the data buffer. */ void dataBufferAppend (char ch[], int start, int length) { dataBuffer = (char[])extendArray(dataBuffer, dataBuffer.length, dataBufferPos + length); System.arraycopy((Object)ch, start, (Object)dataBuffer, dataBufferPos, length); dataBufferPos += length; } /** * Normalise whitespace in the data buffer. */ void dataBufferNormalize () { int i = 0; int j = 0; int end = dataBufferPos; // Skip whitespace at the start. while (j < end && isWhitespace(dataBuffer[j])) { j++; } // Skip whitespace at the end. while (end > j && isWhitespace(dataBuffer[end - 1])) { end --; } // Start copying to the left. while (j < end) { char c = dataBuffer[j++]; // Normalise all other whitespace to // a single space. if (isWhitespace(c)) { while (j < end && isWhitespace(dataBuffer[j++])) { } dataBuffer[i++] = ' '; dataBuffer[i++] = dataBuffer[j-1]; } else { dataBuffer[i++] = c; } } // The new length is <= the old one. dataBufferPos = i; } /** * Convert the data buffer to a string. * @param internFlag true if the contents should be interned. 
* @see #intern(char[],int,int) */ String dataBufferToString () { String s = new String(dataBuffer, 0, dataBufferPos); dataBufferPos = 0; return s; } /** * Flush the contents of the data buffer to the handler, if * appropriate, and reset the buffer for new input. */ void dataBufferFlush () throws java.lang.Exception { if (dataBufferPos > 0) { switch (currentElementContent) { case CONTENT_UNDECLARED: case CONTENT_EMPTY: // do nothing break; case CONTENT_MIXED: case CONTENT_ANY: if (handler != null) { handler.charData(dataBuffer, 0, dataBufferPos); } break; case CONTENT_ELEMENTS: if (handler != null) { handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos); } break; } dataBufferPos = 0; } } /** * Require a string to appear, or throw an exception. */ void require (String delim) throws java.lang.Exception { char ch[] = delim.toCharArray(); for (int i = 0; i < ch.length; i++) { require(ch[i]); } } /** * Require a character to appear, or throw an exception. */ void require (char delim) throws java.lang.Exception { char c = readCh(); if (c != delim) { error("expected character", c, new Character(delim).toString()); } } /** * Return an internalised version of a string. *

Ælfred uses this method to create an internalised version * of all names and attribute values, so that it can test equality * with == instead of String.equals(). *

If you want to be able to test for equality in the same way, * you can use this method to internalise your own strings first: *

    * String PARA = handler.intern("PARA");
    *

Note that this will not return the same results as String.intern().
   * @param s The string to internalise.
   * @return An internalised version of the string.
   * @see #intern(char[],int,int)
   * @see java.lang.String#intern
   */
  public String intern (String s)
  {
    // Delegate to the array-based lookup over the string's characters.
    return intern(s.toCharArray(), 0, s.length());
  }


  /**
   * Create an internalised string from a character array.
   *

This is much more efficient than constructing a non-internalised * string first, and then internalising it. *

Note that this will not return the same results as String.intern(). * @param ch an array of characters for building the string. * @param start the starting position in the array. * @param length the number of characters to place in the string. * @return an internalised string. * @see #intern(String) * @see java.lang.String#intern */ public String intern (char ch[], int start, int length) { int index; int hash = 0; // Generate a hash code. for (int i = start; i < start + length; i++) { hash = ((hash << 1) & 0xffffff) + (int)ch[i]; } hash = hash % SYMBOL_TABLE_LENGTH; // Get the bucket. Object bucket[] = (Object[])symbolTable[hash]; if (bucket == null) { symbolTable[hash] = bucket = new Object[8]; } // Search for a matching tuple, and // return the string if we find one. for (index = 0; index < bucket.length; index += 2) { char chFound[] = (char[])bucket[index]; // Stop when we hit a null index. if (chFound == null) { break; } // If they're the same length, // check for a match. // If the loop finishes, 'index' will // contain the current bucket // position. if (chFound.length == length) { for (int i = 0; i < chFound.length; i++) { // Stop if there are no more tuples. if (ch[start+i] != chFound[i]) { break; } else if (i == length-1) { // That's it, we have a match! return (String)bucket[index+1]; } } } } // Not found -- we'll have to add it. // Do we have to grow the bucket? bucket = (Object[])extendArray(bucket, bucket.length, index); // OK, add it to the end of the // bucket. String s = new String(ch, start, length); bucket[index] = s.toCharArray(); bucket[index+1] = s; symbolTable[hash] = bucket; return s; } /** * Ensure the capacity of an array, allocating a new one if * necessary. 
*/ Object extendArray (Object array, int currentSize, int requiredSize) { if (requiredSize < currentSize) { return array; } else { Object newArray = null; int newSize = currentSize * 2; if (newSize <= requiredSize) { newSize = requiredSize + 1; } if (array instanceof char[]) { newArray = new char[newSize]; } else if (array instanceof Object[]) { newArray = new Object[newSize]; } System.arraycopy(array, 0, newArray, 0, currentSize); return newArray; } } ////////////////////////////////////////////////////////////////////// // XML query routines. ////////////////////////////////////////////////////////////////////// // // Elements // /** * Get the declared elements for an XML document. *

The results will be valid only after the DTD (if any) has been * parsed. * @return An enumeration of all element types declared for this * document (as Strings). * @see #getElementContentType * @see #getElementContentModel */ public Enumeration declaredElements () { return elementInfo.keys(); } /** * Look up the content type of an element. * @param name The element type name. * @return An integer constant representing the content type. * @see #getElementContentModel * @see #CONTENT_UNDECLARED * @see #CONTENT_ANY * @see #CONTENT_EMPTY * @see #CONTENT_MIXED * @see #CONTENT_ELEMENTS */ public int getElementContentType (String name) { Object element[] = (Object[])elementInfo.get(name); if (element == null) { return CONTENT_UNDECLARED; } else { return ((Integer)element[0]).intValue(); } } /** * Look up the content model of an element. *

The result will always be null unless the content type is * CONTENT_ELEMENTS or CONTENT_MIXED. * @param name The element type name. * @return The normalised content model, as a string. * @see #getElementContentType */ public String getElementContentModel (String name) { Object element[] = (Object[])elementInfo.get(name); if (element == null) { return null; } else { return (String)element[1]; } } /** * Register an element. * Array format: * element type * attribute hash table */ void setElement (String name, int contentType, String contentModel, Hashtable attributes) throws java.lang.Exception { Object element[]; // Try looking up the element element = (Object[])elementInfo.get(name); // Make a new one if necessary. if (element == null) { element = new Object[3]; element[0] = new Integer(CONTENT_UNDECLARED); element[1] = null; element[2] = null; } else if (contentType != CONTENT_UNDECLARED && ((Integer)element[0]).intValue() != CONTENT_UNDECLARED) { error("multiple declarations for element type", name, null); return; } // Insert the content type, if any. if (contentType != CONTENT_UNDECLARED) { element[0] = new Integer(contentType); } // Insert the content model, if any. if (contentModel != null) { element[1] = contentModel; } // Insert the attributes, if any. if (attributes != null) { element[2] =attributes; } // Save the element info. elementInfo.put(name,element); } /** * Look up the attribute hash table for an element. * The hash table is the second item in the element array. */ Hashtable getElementAttributes (String name) { Object element[] = (Object[])elementInfo.get(name); if (element == null) { return null; } else { return (Hashtable)element[2]; } } // // Attributes // /** * Get the declared attributes for an element type. * @param elname The name of the element type. * @return An Enumeration of all the attributes declared for * a specific element type. The results will be valid only * after the DTD (if any) has been parsed. 
* @see #getAttributeType * @see #getAttributeEnumeration * @see #getAttributeDefaultValueType * @see #getAttributeDefaultValue * @see #getAttributeExpandedValue */ public Enumeration declaredAttributes (String elname) { Hashtable attlist = getElementAttributes(elname); if (attlist == null) { return null; } else { return attlist.keys(); } } /** * Retrieve the declared type of an attribute. * @param name The name of the associated element. * @param aname The name of the attribute. * @return An integer constant representing the attribute type. * @see #ATTRIBUTE_UNDECLARED * @see #ATTRIBUTE_CDATA * @see #ATTRIBUTE_ID * @see #ATTRIBUTE_IDREF * @see #ATTRIBUTE_IDREFS * @see #ATTRIBUTE_ENTITY * @see #ATTRIBUTE_ENTITIES * @see #ATTRIBUTE_NMTOKEN * @see #ATTRIBUTE_NMTOKENS * @see #ATTRIBUTE_ENUMERATED * @see #ATTRIBUTE_NOTATION */ public int getAttributeType (String name, String aname) { Object attribute[] = getAttribute(name, aname); if (attribute == null) { return ATTRIBUTE_UNDECLARED; } else { return ((Integer)attribute[0]).intValue(); } } /** * Retrieve the allowed values for an enumerated attribute type. * @param name The name of the associated element. * @param aname The name of the attribute. * @return A string containing the token list. * @see #ATTRIBUTE_ENUMERATED * @see #ATTRIBUTE_NOTATION */ public String getAttributeEnumeration (String name, String aname) { Object attribute[] = getAttribute(name, aname); if (attribute == null) { return null; } else { return (String)attribute[3]; } } /** * Retrieve the default value of a declared attribute. * @param name The name of the associated element. * @param aname The name of the attribute. * @return The default value, or null if the attribute was * #IMPLIED or simply undeclared and unspecified. 
* @see #getAttributeExpandedValue
   */
  public String getAttributeDefaultValue (String name, String aname)
  {
    // Slot 1 of the attribute record holds the literal default value.
    final Object[] decl = getAttribute(name, aname);
    return (decl == null) ? null : (String)decl[1];
  }


  /**
   * Retrieve the expanded value of a declared attribute.
   *

All general entities will be expanded. * @param name The name of the associated element. * @param aname The name of the attribute. * @return The expanded default value, or null if the attribute was * #IMPLIED or simply undeclared * @see #getAttributeDefaultValue */ public String getAttributeExpandedValue (String name, String aname) { Object attribute[] = getAttribute(name, aname); if (attribute == null) { return null; } else if (attribute[4] == null && attribute[1] != null) { try { pushString(null, (char)0 + (String)attribute[1] + (char)0); attribute[4] = readLiteral(LIT_NORMALIZE | LIT_CHAR_REF | LIT_ENTITY_REF); } catch (Exception e) {} } return (String)attribute[4]; } /** * Retrieve the default value type of a declared attribute. * @see #ATTRIBUTE_DEFAULT_SPECIFIED * @see #ATTRIBUTE_DEFAULT_IMPLIED * @see #ATTRIBUTE_DEFAULT_REQUIRED * @see #ATTRIBUTE_DEFAULT_FIXED */ public int getAttributeDefaultValueType (String name, String aname) { Object attribute[] = getAttribute(name, aname); if (attribute == null) { return ATTRIBUTE_DEFAULT_UNDECLARED; } else { return ((Integer)attribute[2]).intValue(); } } /** * Register an attribute declaration for later retrieval. * Format: * - String type * - String default value * - int value type * *TODO: do something with attribute types. */ void setAttribute (String elName, String name, int type, String enumeration, String value, int valueType) throws java.lang.Exception { Hashtable attlist; Object attribute[]; // Create a new hashtable if necessary. attlist = getElementAttributes(elName); if (attlist == null) { attlist = new Hashtable(); } // Check that the attribute doesn't // already exist! if (attlist.get(name) != null) { return; } else { attribute = new Object[5]; attribute[0] = new Integer(type); attribute[1] = value; attribute[2] = new Integer(valueType); attribute[3] = enumeration; attribute[4] = null; attlist.put(name.intern(), attribute); // Use CONTENT_UNDECLARED to avoid overwriting // existing element declaration. 
setElement(elName,CONTENT_UNDECLARED, null, attlist); } } /** * Retrieve the three-member array representing an * attribute declaration. */ Object[] getAttribute (String elName, String name) { Hashtable attlist; Object attribute[]; attlist = getElementAttributes(elName); if (attlist == null) { return null; } attribute = (Object[])attlist.get(name); return attribute; } // // Entities // /** * Get declared entities. * @return An Enumeration of all the entities declared for * this XML document. The results will be valid only * after the DTD (if any) has been parsed. * @see #getEntityType * @see #getEntityPublicId * @see #getEntitySystemId * @see #getEntityValue * @see #getEntityNotationName */ public Enumeration declaredEntities () { return entityInfo.keys(); } /** * Find the type of an entity. * @returns An integer constant representing the entity type. * @see #ENTITY_UNDECLARED * @see #ENTITY_INTERNAL * @see #ENTITY_NDATA * @see #ENTITY_TEXT */ public int getEntityType (String ename) { Object entity[] = (Object[])entityInfo.get(ename); if (entity == null) { return ENTITY_UNDECLARED; } else { return ((Integer)entity[0]).intValue(); } } /** * Return an external entity's public identifier, if any. * @param ename The name of the external entity. * @return The entity's system identifier, or null if the * entity was not declared, if it is not an * external entity, or if no public identifier was * provided. * @see #getEntityType */ public String getEntityPublicId (String ename) { Object entity[] = (Object[])entityInfo.get(ename); if (entity == null) { return null; } else { return (String)entity[1]; } } /** * Return an external entity's system identifier. * @param ename The name of the external entity. * @return The entity's system identifier, or null if the * entity was not declared, or if it is not an * external entity. 
* @see #getEntityType */ public String getEntitySystemId (String ename) { Object entity[] = (Object[])entityInfo.get(ename); if (entity == null) { return null; } else { return (String)entity[2]; } } /** * Return the value of an internal entity. * @param ename The name of the internal entity. * @return The entity's value, or null if the entity was * not declared, or if it is not an internal entity. * @see #getEntityType */ public String getEntityValue (String ename) { Object entity[] = (Object[])entityInfo.get(ename); if (entity == null) { return null; } else { return (String)entity[3]; } } /** * Get the notation name associated with an NDATA entity. * @param ename The NDATA entity name. * @return The associated notation name, or null if the * entity was not declared, or if it is not an * NDATA entity. * @see #getEntityType */ public String getEntityNotationName (String eName) { Object entity[] = (Object[])entityInfo.get(eName); if (entity == null) { return null; } else { return (String)entity[4]; } } /** * Register an entity declaration for later retrieval. */ void setInternalEntity (String eName, String value) { setEntity(eName, ENTITY_INTERNAL, null, null, value, null); } /** * Register an external data entity. */ void setExternalDataEntity (String eName, String pubid, String sysid, String nName) { setEntity(eName, ENTITY_NDATA, pubid, sysid, null, nName); } /** * Register an external text entity. */ void setExternalTextEntity (String eName, String pubid, String sysid) { setEntity(eName, ENTITY_TEXT, pubid, sysid, null, null); } /** * Register an entity declaration for later retrieval. */ void setEntity (String eName, int eClass, String pubid, String sysid, String value, String nName) { Object entity[]; if (entityInfo.get(eName) == null) { entity = new Object[5]; entity[0] = new Integer(eClass); entity[1] = pubid; entity[2] = sysid; entity[3] = value; entity[4] = nName; entityInfo.put(eName,entity); } } // // Notations. // /** * Get declared notations. 
* @return An Enumeration of all the notations declared for * this XML document. The results will be valid only * after the DTD (if any) has been parsed. * @see #getNotationPublicId * @see #getNotationSystemId */ public Enumeration declaredNotations () { return notationInfo.keys(); } /** * Look up the public identifier for a notation. * You will normally use this method to look up a notation * that was provided as an attribute value or for an NDATA entity. * @param nname The name of the notation. * @return A string containing the public identifier, or null * if none was provided or if no such notation was * declared. * @see #getNotationSystemId */ public String getNotationPublicId (String nname) { Object notation[] = (Object[])notationInfo.get(nname); if (notation == null) { return null; } else { return (String)notation[0]; } } /** * Look up the system identifier for a notation. * You will normally use this method to look up a notation * that was provided as an attribute value or for an NDATA entity. * @param nname The name of the notation. * @return A string containing the system identifier, or null * if no such notation was declared. * @see #getNotationPublicId */ public String getNotationSystemId (String nname) { Object notation[] = (Object[])notationInfo.get(nname); if (notation == null) { return null; } else { return (String)notation[1]; } } /** * Register a notation declaration for later retrieval. * Format: * - public id * - system id */ void setNotation (String nname, String pubid, String sysid) throws java.lang.Exception { Object notation[]; if (notationInfo.get(nname) == null) { notation = new Object[2]; notation[0] = pubid; notation[1] = sysid; notationInfo.put(nname,notation); } else { error("multiple declarations of notation", nname, null); } } // // Location. // /** * Return the current line number. */ public int getLineNumber () { return line; } /** * Return the current column number. 
*/ public int getColumnNumber () { return column; } ////////////////////////////////////////////////////////////////////// // High-level I/O. ////////////////////////////////////////////////////////////////////// /** * Read a single character from the readBuffer. *

The readDataChunk() method maintains the buffer. *

If we hit the end of an entity, try to pop the stack and * keep going. *

(This approach doesn't really enforce XML's rules about * entity boundaries, but this is not currently a validating * parser). *

This routine also attempts to keep track of the current
   * position in external entities, but it's not entirely accurate.
   *
   * In DTD and entity-value contexts a '%' that is not followed by
   * whitespace is treated as the start of a parameter entity
   * reference: the reference is parsed and the next character after
   * it is returned instead.
   * @return The next character from the current input source.
   * @see #unread(char)
   * @see #unread(String)
   * @see #readDataChunk
   * @see #readBuffer
   * @see #line
   */
  char readCh ()
    throws java.lang.Exception
  {
    char c;

    // As long as there's nothing in the
    // read buffer, try reading more data
    // (for an external entity) or popping
    // the entity stack (for either).
    while (readBufferPos >= readBufferLength) {
      switch (sourceType) {
      case INPUT_READER:
      case INPUT_EXTERNAL:
      case INPUT_STREAM:
        // Stream-backed source: ask for another chunk; if that
        // yields nothing, pop to the enclosing input and, if that
        // one is empty too, try reading a chunk for it.
        readDataChunk();
        while (readBufferLength < 1) {
          popInput();
          if (readBufferLength <1) {
            readDataChunk();
          }
        }
        break;

      default:
        // In-memory source is exhausted: return to the enclosing
        // input.
        popInput();
        break;
      }
    }

    c = readBuffer[readBufferPos++];

    // This is a particularly nasty bit
    // of code, that checks for a parameter
    // entity reference but peeks ahead to
    // catch the '%' in parameter entity
    // declarations.
    if ( c == '%' && (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE) ) {
      // Peek at the following character without consuming it.
      char c2 = readCh();
      unread(c2);
      if (!isWhitespace(c2)) {
        parsePEReference(context == CONTEXT_ENTITYVALUE);
        return readCh();
      }
    }

    // Position bookkeeping for the character being returned.
    if (c == '\n') {
      line++;
      column = 0;
    } else {
      column++;
    }

    return c;
  }


  /**
   * Push a single character back onto the current input stream.
   *

This method usually pushes the character back onto * the readBuffer, while the unread(String) method treats the * string as a new internal entity. *

I don't think that this would ever be called with * readBufferPos = 0, because the methods always reads a character * before unreading it, but just in case, I've added a boundary * condition. * @param c The character to push back. * @see #readCh * @see #unread(String) * @see #unread(char[]) * @see #readBuffer */ void unread (char c) throws java.lang.Exception { // Normal condition. if (c == '\n') { line--; column = -1; } if (readBufferPos > 0) { readBuffer[--readBufferPos] = c; } else { pushString(null, new Character(c).toString()); } } /** * Push a char array back onto the current input stream. *

NOTE: you must never push back characters that you * haven't actually read: use pushString() instead. * @see #readCh * @see #unread(char) * @see #unread(String) * @see #readBuffer * @see #pushString */ void unread (char ch[], int length) throws java.lang.Exception { for (int i = 0; i < length; i++) { if (ch[i] == '\n') {line--;column = -1;} } if (length < readBufferPos) {readBufferPos -= length;} else { pushCharArray(null, ch, 0, length); sourceType = INPUT_BUFFER; } } /** * Push a new external input source. *

The source will be either an external text entity, or the DTD * external subset. *

TO DO: Right now, this method always attempts to autodetect * the encoding; in the future, it should allow the caller to * request an encoding explicitly, and it should also look at the * headers with an HTTP connection. * @param url The java.net.URL object for the entity. * @see XmlHandler#resolveEntity * @see #pushString * @see #sourceType * @see #pushInput * @see #detectEncoding * @see #sourceType * @see #readBuffer */ void pushURL (String ename, String publicId, String systemId, Reader reader, InputStream stream, String encoding) throws java.lang.Exception { URL url; boolean ignoreEncoding = false; // Push the existing status. pushInput(ename); // Create a new read buffer. // (Note the four-character margin) readBuffer = new char[READ_BUFFER_MAX+4]; readBufferPos = 0; readBufferLength = 0; readBufferOverflow = -1; is = null; line = 1; currentByteCount = 0; // Flush any remaining data. dataBufferFlush(); // Make the URL absolute. if (systemId != null && externalEntity != null) { systemId = new URL(externalEntity.getURL(), systemId).toString(); } else if (baseURI != null) { try { systemId = new URL(new URL(baseURI), systemId).toString(); } catch (Exception e) {} } // See if the application wants to // redirect the system ID and/or // supply its own character stream. if (systemId != null && handler != null) { Object input = handler.resolveEntity(publicId, systemId); if (input != null) { if (input instanceof String) { systemId = (String)input; } else if (input instanceof InputStream) { stream = (InputStream)input; } else if (input instanceof Reader) { reader = (Reader)input; } } } // Start the entity. if (handler != null) { if (systemId != null) { handler.startExternalEntity(systemId); } else { handler.startExternalEntity("[external stream]"); } } // Figure out what we're reading from. if (reader != null) { // There's an explicit character stream. 
sourceType = INPUT_READER; this.reader = reader; tryEncodingDecl(true); return; } else if (stream != null) { sourceType = INPUT_STREAM; is = stream; } else { // We have to open our own stream // to the URL. // Set the new status sourceType = INPUT_EXTERNAL; url = new URL(systemId); externalEntity = url.openConnection(); externalEntity.connect(); is = externalEntity.getInputStream(); } // If we get to here, there must be // an InputStream available. if (!is.markSupported()) { is = new BufferedInputStream(is); } // Attempt to detect the encoding. if (encoding == null && externalEntity != null) { encoding = externalEntity.getContentEncoding(); } if (encoding != null) { checkEncoding(encoding, false); ignoreEncoding = true; } else { detectEncoding(); ignoreEncoding = false; } // Read an XML or text declaration. tryEncodingDecl(ignoreEncoding); } /** * Check for an encoding declaration. */ void tryEncodingDecl (boolean ignoreEncoding) throws java.lang.Exception { // Read the XML/Encoding declaration. if (tryRead(" 0) { parseTextDecl(ignoreEncoding); } else { parseXMLDecl(ignoreEncoding); } } else { unread("xml".toCharArray(), 3); parsePI(); } } } /** * Attempt to detect the encoding of an entity. *

The trick here (as suggested in the XML standard) is that * any entity not in UTF-8, or in UCS-2 with a byte-order mark, * must begin with an XML declaration or an encoding * declaration; we simply have to look for "<?XML" in various * encodings. *

This method has no way to distinguish among 8-bit encodings. * Instead, it assumes UTF-8, then (possibly) revises its assumption * later in checkEncoding(). Any ASCII-derived 8-bit encoding * should work, but most will be rejected later by checkEncoding(). *

I don't currently detect EBCDIC, since I'm concerned that it * could also be a valid UTF-8 sequence; I'll have to do more checking * later. * @see #tryEncoding(byte[], byte, byte, byte, byte) * @see #tryEncoding(byte[], byte, byte) * @see #checkEncoding * @see #read8bitEncodingDeclaration */ void detectEncoding () throws java.lang.Exception { byte signature[] = new byte[4]; // Read the first four bytes for // autodetection. is.mark(4); is.read(signature); is.reset(); // Look for a known signature. if (tryEncoding(signature, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x3c)) { // UCS-4 must begin with "Utility routine for detectEncoding(). *

Always looks for some part of "Looks for a UCS-2 byte-order mark. *

Utility routine for detectEncoding(). * @param sig The first four bytes read. * @param b1 The first byte of the signature * @param b2 The second byte of the signature * @see #detectEncoding */ boolean tryEncoding (byte sig[], byte b1, byte b2) { return ((sig[0] == b1) && (sig[1] == b2)); } /** * This method pushes a string back onto input. *

It is useful either as the expansion of an internal entity, * or for backtracking during the parse. *

Call pushCharArray() to do the actual work.
    * @param ename The name of the entity (if any) causing the new
    *        input; passed through to pushCharArray().
    * @param s The string to push back onto input.
    * @see #pushCharArray
    */
  void pushString (String ename, String s)
    throws java.lang.Exception
  {
    final char chars[] = s.toCharArray();
    final int total = chars.length;
    pushCharArray(ename, chars, 0, total);
  }


  /**
    * Push a new internal input source.
    *

This method is useful for expanding an internal entity, * or for unreading a string of characters. It creates a new * readBuffer containing the characters in the array, instead * of characters converted from an input byte stream. *

I've added a couple of optimisations: don't push zero- * length strings, and just push back a single character * for 1-character strings; this should save some time and memory. * @param ch The char array to push. * @see #pushString * @see #pushURL * @see #readBuffer * @see #sourceType * @see #pushInput */ void pushCharArray (String ename, char ch[], int start, int length) throws java.lang.Exception { // Push the existing status pushInput(ename); sourceType = INPUT_INTERNAL; readBuffer = ch; readBufferPos = start; readBufferLength = length; readBufferOverflow = -1; } /** * Save the current input source onto the stack. *

This method saves all of the global variables associated with * the current input source, so that they can be restored when a new * input source has finished. It also tests for entity recursion. *

The method saves the following global variables onto a stack * using a fixed-length array: *

sourceType *
externalEntity *
readBuffer *
readBufferPos *
readBufferLength *
line *
encoding *

* @param ename The name of the entity (if any) causing the new input. * @see #popInput * @see #sourceType * @see #externalEntity * @see #readBuffer * @see #readBufferPos * @see #readBufferLength * @see #line * @see #encoding */ void pushInput (String ename) throws java.lang.Exception { Object input[] = new Object[12]; // Check for entity recursion. if (ename != null) { Enumeration entities = entityStack.elements(); while (entities.hasMoreElements()) { String e = (String)entities.nextElement(); if (e == ename) { error("recursive reference to entity", ename, null); } } } entityStack.push(ename); // Don't bother if there is no input. if (sourceType == INPUT_NONE) { return; } // Set up a snapshot of the current // input source. input[0] = new Integer(sourceType); input[1] = externalEntity; input[2] = readBuffer; input[3] = new Integer(readBufferPos); input[4] = new Integer(readBufferLength); input[5] = new Integer(line); input[6] = new Integer(encoding); input[7] = new Integer(readBufferOverflow); input[8] = is; input[9] = new Integer(currentByteCount); input[10] = new Integer(column); input[11] = reader; // Push it onto the stack. inputStack.push(input); } /** * Restore a previous input source. *

This method restores all of the global variables associated with * the current input source. * @exception java.io.EOFException * If there are no more entries on the input stack. * @see #pushInput * @see #sourceType * @see #externalEntity * @see #readBuffer * @see #readBufferPos * @see #readBufferLength * @see #line * @see #encoding */ void popInput () throws java.lang.Exception { Object input[]; switch (sourceType) { case INPUT_EXTERNAL: dataBufferFlush(); if (handler != null && externalEntity != null) { handler.endExternalEntity(externalEntity.getURL().toString()); } break; case INPUT_STREAM: dataBufferFlush(); if (baseURI != null) { if (handler != null) { handler.endExternalEntity(baseURI); } } break; case INPUT_READER: dataBufferFlush(); if (baseURI != null) { if (handler != null) { handler.endExternalEntity(baseURI); } } break; } // Throw an EOFException if there // is nothing else to pop. if (inputStack.isEmpty()) { throw new EOFException(); } else { String s; input = (Object[])inputStack.pop(); s = (String)entityStack.pop(); } sourceType = ((Integer)input[0]).intValue(); externalEntity = (URLConnection)input[1]; readBuffer = (char[])input[2]; readBufferPos = ((Integer)input[3]).intValue(); readBufferLength = ((Integer)input[4]).intValue(); line = ((Integer)input[5]).intValue(); encoding = ((Integer)input[6]).intValue(); readBufferOverflow = ((Integer)input[7]).intValue(); is = (InputStream)input[8]; currentByteCount = ((Integer)input[9]).intValue(); column = ((Integer)input[10]).intValue(); reader = (Reader)input[11]; } /** * Return true if we can read the expected character. *

Note that the character will be removed from the input stream * on success, but will be put back on failure. Do not attempt to * read the character again if the method succeeds. * @param delim The character that should appear next. For a * insensitive match, you must supply this in upper-case. * @return true if the character was successfully read, or false if * it was not. * @see #tryRead(String) */ boolean tryRead (char delim) throws java.lang.Exception { char c; // Read the character c = readCh(); // Test for a match, and push the character // back if the match fails. if (c == delim) { return true; } else { unread(c); return false; } } /** * Return true if we can read the expected string. *

This is simply a convenience method. *

Note that the string will be removed from the input stream * on success, but will be put back on failure. Do not attempt to * read the string again if the method succeeds. *

This method will push back a character rather than an * array whenever possible (probably the majority of cases). *

NOTE: This method currently has a hard-coded limit * of 100 characters for the delimiter. * @param delim The string that should appear next. * @return true if the string was successfully read, or false if * it was not. * @see #tryRead(char) */ boolean tryRead (String delim) throws java.lang.Exception { char ch[] = delim.toCharArray(); char c; // Compare the input, character- // by character. for (int i = 0; i < ch.length; i++) { c=readCh(); if (c!=ch[i]) { unread(c); if (i!=0) {unread(ch,i);} return false; } } return true; } /** * Return true if we can read some whitespace. *

This is simply a convenience method. *

This method will push back a character rather than an * array whenever possible (probably the majority of cases). * @return true if whitespace was found. */ boolean tryWhitespace () throws java.lang.Exception { char c; c = readCh(); if (isWhitespace(c)) { skipWhitespace(); return true; } else { unread(c); return false; } } /** * Read all data until we find the specified string. *

This is especially useful for scanning marked sections. *

This is a a little inefficient right now, since it calls tryRead() * for every character. * @param delim The string delimiter * @see #tryRead(String, boolean) * @see #readCh */ void parseUntil (String delim) throws java.lang.Exception { char c; int startLine = line; try { while (!tryRead(delim)) { c = readCh(); dataBufferAppend(c); } } catch (EOFException e) { error("end of input while looking for delimiter (started on line " + startLine + ')', null, delim); } } /** * Skip all data until we find the specified string. *

This is especially useful for scanning comments. *

This is a a little inefficient right now, since it calls tryRead() * for every character. * @param delim The string delimiter * @see #tryRead(String, boolean) * @see #readCh */ void skipUntil (String delim) throws java.lang.Exception { while (!tryRead(delim)) { readCh(); } } /** * Read just the encoding declaration (or XML declaration) at the * start of an external entity. * When this method is called, we know that the declaration is * present (or appears to be). We also know that the entity is * in some sort of ASCII-derived 8-bit encoding. * The idea of this is to let us read what the 8-bit encoding is * before we've committed to converting any more of the file; the * XML or encoding declaration must be in 7-bit ASCII, so we're * safe as long as we don't go past it. */ void read8bitEncodingDeclaration () throws java.lang.Exception { int ch; readBufferPos = readBufferLength = 0; while (true) { ch = is.read(); readBuffer[readBufferLength++] = (char)ch; switch (ch) { case (int)'>': return; case -1: error("end of file before end of XML or encoding declaration.", null, "?>"); return; } if (readBuffer.length == readBufferLength) { error("unfinished XML or encoding declaration", null, null); } } } ////////////////////////////////////////////////////////////////////// // Low-level I/O. ////////////////////////////////////////////////////////////////////// /** * Read a chunk of data from an external input source. *

This is simply a front-end that fills the rawReadBuffer * with bytes, then calls the appropriate encoding handler. * @see #encoding * @see #rawReadBuffer * @see #readBuffer * @see #filterCR * @see #copyUtf8ReadBuffer * @see #copyIso8859_1ReadBuffer * @see #copyUcs_2ReadBuffer * @see #copyUcs_4ReadBuffer */ void readDataChunk () throws java.lang.Exception { int count, i, j; // See if we have any overflow. if (readBufferOverflow > -1) { readBuffer[0] = (char)readBufferOverflow; readBufferOverflow = -1; readBufferPos = 1; sawCR = true; } else { readBufferPos = 0; sawCR = false; } // Special situation -- we're taking // input from a character stream. if (sourceType == INPUT_READER) { count = reader.read(readBuffer, readBufferPos, READ_BUFFER_MAX-1); if (count < 0) {readBufferLength = -1;} else { readBufferLength = readBufferPos+count; filterCR(); sawCR = false; } return; } // Read as many bytes as possible // into the read buffer. count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX); // Dispatch to an encoding-specific // reader method to populate the // readBuffer. switch (encoding) { case ENCODING_UTF_8: copyUtf8ReadBuffer(count); break; case ENCODING_ISO_8859_1: copyIso8859_1ReadBuffer(count); break; case ENCODING_UCS_2_12: copyUcs2ReadBuffer(count, 8, 0); break; case ENCODING_UCS_2_21: copyUcs2ReadBuffer(count, 0, 8); break; case ENCODING_UCS_4_1234: copyUcs4ReadBuffer(count, 24, 16, 8, 0); break; case ENCODING_UCS_4_4321: copyUcs4ReadBuffer(count, 0, 8, 16, 24); break; case ENCODING_UCS_4_2143: copyUcs4ReadBuffer(count, 16, 24, 0, 8); break; case ENCODING_UCS_4_3412: copyUcs4ReadBuffer(count, 8, 0, 24, 16); break; } // Filter out all carriage returns // if we've seen any. if (sawCR) { filterCR(); sawCR = false; } // Reset the position. readBufferPos = 0; currentByteCount += count; } /** * Filter carriage returns in the read buffer. *

CRLF becomes LF; CR becomes LF. * @see #readDataChunk * @see #readBuffer * @see #readBufferOverflow */ void filterCR () { int i, j; readBufferOverflow = -1; loop: for (i = 0, j = 0; j < readBufferLength; i++, j++) { switch (readBuffer[j]) { case '\r': if (j == readBufferLength - 1) { readBufferOverflow = '\r'; readBufferLength--; break loop; } else if (readBuffer[j+1] == '\n') {j++;} readBuffer[i] = '\n'; break; case '\n': default: readBuffer[i] = readBuffer[j]; break; } } readBufferLength = i; } /** * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters. *

When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. *

The tricky part of this is dealing with UTF-8 multi-byte * sequences, but it doesn't seem to slow things down too much. * @param count The number of bytes to convert. * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer * @see #getNextUtf8Byte */ void copyUtf8ReadBuffer (int count) throws java.lang.Exception { int i = 0; int j = readBufferPos; int b1; boolean isSurrogate = false; while (i < count) { b1 = rawReadBuffer[i++]; isSurrogate = false; // Determine whether we are dealing // with a one-, two-, three-, or four- // byte sequence. if ((b1 & 0x80) == 0) { // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx readBuffer[j++] = (char)b1; } else if ((b1 & 0xe0) == 0xc0) { // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx readBuffer[j++] = (char)(((b1 & 0x1f) << 6) | getNextUtf8Byte(i++, count)); } else if ((b1 & 0xf0) == 0xe0) { // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx readBuffer[j++] = (char)(((b1 & 0x0f) << 12) | (getNextUtf8Byte(i++, count) << 6) | getNextUtf8Byte(i++, count)); } else if ((b1 & 0xf8) == 0xf0) { // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx // (uuuuu = wwww + 1) isSurrogate = true; int b2 = getNextUtf8Byte(i++, count); int b3 = getNextUtf8Byte(i++, count); int b4 = getNextUtf8Byte(i++, count); readBuffer[j++] = (char)(0xd800 | ((((b1 & 0x07) << 2) | ((b2 & 0x30) >> 4) - 1) << 6) | ((b2 & 0x0f) << 2) | ((b3 & 0x30) >> 4)); readBuffer[j++] = (char)(0xdc | ((b3 & 0x0f) << 6) | b4); // TODO: test that surrogate value is legal. } else { // Otherwise, the 8th bit may not be set in UTF-8 encodingError("bad start for UTF-8 multi-byte sequence", b1, i); } if (readBuffer[j-1] == '\r') { sawCR = true; } } // How many characters have we read? readBufferLength = j; } /** * Return the next byte value in a UTF-8 sequence. * If it is not possible to get a byte from the current * entity, throw an exception. * @param pos The current position in the rawReadBuffer. 
* @param count The number of bytes in the rawReadBuffer * @return The significant six bits of a non-initial byte in * a UTF-8 sequence. * @exception EOFException If the sequence is incomplete. */ int getNextUtf8Byte (int pos, int count) throws java.lang.Exception { int val; // Take a character from the buffer // or from the actual input stream. if (pos < count) { val = rawReadBuffer[pos]; } else { val = is.read(); if (val == -1) { encodingError("unfinished multi-byte UTF-8 sequence at EOF", -1, pos); } } // Check for the correct bits at the // start. if ((val & 0xc0) != 0x80) { encodingError("bad continuation of multi-byte UTF-8 sequence", val, pos + 1); } // Return the significant bits. return (val & 0x3f); } /** * Convert a buffer of ISO-8859-1-encoded bytes into UTF-16 characters. *

When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. *

This is a direct conversion, with no tricks. * @param count The number of bytes to convert. * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer */ void copyIso8859_1ReadBuffer (int count) { int i, j; for (i = 0, j = readBufferPos; i < count; i++, j++) { readBuffer[j] = (char)(rawReadBuffer[i] & 0xff); if (readBuffer[j] == '\r') { sawCR = true; } } readBufferLength = j; } /** * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters. *

When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. * @param count The number of bytes to convert. * @param shift1 The number of bits to shift byte 1. * @param shift2 The number of bits to shift byte 2 * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer */ void copyUcs2ReadBuffer (int count, int shift1, int shift2) throws java.lang.Exception { int j = readBufferPos; if (count > 0 && (count % 2) != 0) { encodingError("odd number of bytes in UCS-2 encoding", -1, count); } for (int i = 0; i < count; i+=2) { readBuffer[j++] = (char)(((rawReadBuffer[i] & 0xff) << shift1) | ((rawReadBuffer[i+1] & 0xff) << shift2)); if (readBuffer[j-1] == '\r') { sawCR = true; } } readBufferLength = j; } /** * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters. *

When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. *

Java has 16-bit chars, but this routine will attempt to use * surrogates to encoding values between 0x00010000 and 0x000fffff. * @param count The number of bytes to convert. * @param shift1 The number of bits to shift byte 1. * @param shift2 The number of bits to shift byte 2 * @param shift3 The number of bits to shift byte 2 * @param shift4 The number of bits to shift byte 2 * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer */ void copyUcs4ReadBuffer (int count, int shift1, int shift2, int shift3, int shift4) throws java.lang.Exception { int j = readBufferPos; int value; if (count > 0 && (count % 4) != 0) { encodingError("number of bytes in UCS-4 encoding not divisible by 4", -1, count); } for (int i = 0; i < count; i+=4) { value = (((rawReadBuffer[i] & 0xff) << shift1) | ((rawReadBuffer[i+1] & 0xff) << shift2) | ((rawReadBuffer[i+2] & 0xff) << shift3) | ((rawReadBuffer[i+3] & 0xff) << shift4)); if (value < 0x0000ffff) { readBuffer[j++] = (char)value; if (value == (int)'\r') { sawCR = true; } } else if (value < 0x000fffff) { readBuffer[j++] = (char)(0xd8 | ((value & 0x000ffc00) >> 10)); readBuffer[j++] = (char)(0xdc | (value & 0x0003ff)); } else { encodingError("value cannot be represented in UTF-16", value, i); } } readBufferLength = j; } /** * Report a character encoding error. */ void encodingError (String message, int value, int offset) throws java.lang.Exception { String uri; if (value >= 0) { message = message + " (byte value: 0x" + Integer.toHexString(value) + ')'; } if (externalEntity != null) { uri = externalEntity.getURL().toString(); } else { uri = baseURI; } handler.error(message, uri, -1, offset + currentByteCount); } ////////////////////////////////////////////////////////////////////// // Local Variables. ////////////////////////////////////////////////////////////////////// /** * Re-initialize the variables for each parse. 
*/ void initializeVariables () { // No errors; first line errorCount = 0; line = 1; column = 0; // Set up the buffers for data and names dataBufferPos = 0; dataBuffer = new char[DATA_BUFFER_INITIAL]; nameBufferPos = 0; nameBuffer = new char[NAME_BUFFER_INITIAL]; // Set up the DTD hash tables elementInfo = new Hashtable(); entityInfo = new Hashtable(); notationInfo = new Hashtable(); // Set up the variables for the current // element context. currentElement = null; currentElementContent = CONTENT_UNDECLARED; // Set up the input variables sourceType = INPUT_NONE; inputStack = new Stack(); entityStack = new Stack(); externalEntity = null; tagAttributePos = 0; tagAttributes = new String[100]; rawReadBuffer = new byte[READ_BUFFER_MAX]; readBufferOverflow = -1; context = CONTEXT_NONE; symbolTable = new Object[SYMBOL_TABLE_LENGTH]; } /** * Clean up after the parse to allow some garbage collection. * Leave around anything that might be useful for queries. */ void cleanupVariables () { errorCount = -1; line = -1; column = -1; dataBuffer = null; nameBuffer = null; currentElement = null; currentElementContent = CONTENT_UNDECLARED; sourceType = INPUT_NONE; inputStack = null; externalEntity = null; entityStack = null; } // // The current XML handler interface. // XmlHandler handler; // // I/O information. // private Reader reader; // current reader private InputStream is; // current input stream private int line; // current line number private int column; // current column number private int sourceType; // type of input source private Stack inputStack; // stack of input soruces private URLConnection externalEntity; // current external entity private int encoding; // current character encoding. private int currentByteCount; // how many bytes read from current source. // // Maintain a count of errors. // private int errorCount; // // Buffers for decoded but unparsed character input. 
// private final static int READ_BUFFER_MAX = 16384; private char readBuffer[]; private int readBufferPos; private int readBufferLength; private int readBufferOverflow; // overflow character from last data chunk. // // Buffer for undecoded raw byte input. // private byte rawReadBuffer[]; // // Buffer for parsed character data. // private static int DATA_BUFFER_INITIAL = 4096; private char dataBuffer[]; private int dataBufferPos; // // Buffer for parsed names. // private static int NAME_BUFFER_INITIAL = 1024; private char nameBuffer[]; private int nameBufferPos; // // Hashtables for DTD information on elements, entities, and notations. // private Hashtable elementInfo; private Hashtable entityInfo; private Hashtable notationInfo; // // Element type currently in force. // private String currentElement; private int currentElementContent; // // Base external identifiers for resolution. // private String basePublicId; private String baseURI; private int baseEncoding; private Reader baseReader; private InputStream baseInputStream; private char baseInputBuffer[]; private int baseInputBufferStart; private int baseInputBufferLength; // // Stack of entity names, to help detect recursion. // private Stack entityStack; // // Are we in a context where PEs are allowed? // private int context; // // Symbol table, for internalising names. // private Object symbolTable[]; private final static int SYMBOL_TABLE_LENGTH = 1087; // // Hash table of attributes found in current start tag. // private String tagAttributes[]; private int tagAttributePos; // // Utility flag: have we noticed a CR while reading the last // data chunk? If so, we will have to go back and normalise // CR/LF. // private boolean sawCR; }

Copyright 1998-2024 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.

What this is

Other links

The source code

new blog posts

... this post is sponsored by my books ...
#1 New Release!	FP Best Seller