// What this is
// This file is included in the DevDaily.com "Java Source Code Warehouse"
// project. The intent of this project is to help you "Learn Java by
// Example" TM.
// Other links
// The source code
// XmlParser.java: the main parser class.
// NO WARRANTY! See README, and copyright below.
// $Id: XmlParser.java,v 1.3 2004/02/22 20:00:47 spestov Exp $
package com.microstar.xml;
import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Stack;
/**
* Parse XML documents and return parse events through call-backs.
* You need to define a class implementing the XmlHandler
* interface: an object belonging to this class will receive the
* callbacks for the events. (As an alternative to implementing
* the full XmlHandler interface, you can simply extend the
* HandlerBase convenience class.)
* Usage (assuming that MyHandler is your implementation
* of the XmlHandler interface):
*
* XmlHandler handler = new MyHandler();
* XmlParser parser = new XmlParser();
* parser.setHandler(handler);
* try {
* parser.parse("http://www.host.com/doc.xml", null, (String) null);
* } catch (Exception e) {
* [do something interesting]
* }
*
* Alternatively, you can use the standard SAX interfaces
* with the SAXDriver class as your entry point.
* @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
* @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
* @version 1.1
* @see XmlHandler
* @see HandlerBase
* @see SAXDriver
*/
public class XmlParser {
//
// Use special cheats that speed up the code (currently about 50%),
// but may cause problems with future maintenance and add to the
// class file size (about 500 bytes).
//
private final static boolean USE_CHEATS = true;
//////////////////////////////////////////////////////////////////////
// Constructors.
////////////////////////////////////////////////////////////////////////
/**
 * Create a parser that has no handler attached yet.
 * Attach one with setHandler before parsing if callbacks are wanted.
 * @see #setHandler
 * @see #parse
 */
public XmlParser () {
    // Nothing to initialise here.
}
/**
 * Register the object that receives the parser's callbacks.
 * The reference is stored as-is, replacing any handler set earlier.
 * @param handler The handler to receive callback events.
 * @see #parse
 * @see XmlHandler
 */
public void setHandler (XmlHandler handler) {
    this.handler = handler;
}
/**
 * Parse the XML document found at a URI.
 * No reader or byte stream is supplied, so the work is delegated to
 * doParse with both left null.
 * The same parser object may be reused for several documents, but
 * only one thread may be inside this method for a given object at
 * any one time.
 * @param systemId The URI of the document.
 * @param publicId The public identifier of the document, or null.
 * @param encoding The suggested encoding, or null if unknown.
 * @exception java.lang.Exception Any exception thrown by your
 * own handlers, or any derivation of java.io.IOException
 * thrown by the parser itself.
 */
public void parse (String systemId, String publicId, String encoding)
    throws Exception
{
    final Reader noReader = null;
    final InputStream noStream = null;
    doParse(systemId, publicId, noReader, noStream, encoding);
}
/**
 * Parse an XML document supplied as a stream of bytes.
 * The URI you pass serves as the base URI for resolving relative
 * links; the document itself is read by Ælfred from the given
 * input stream.
 * The same parser object may be reused for several documents, but
 * only one thread may be inside this method for a given object at
 * any one time.
 * @param systemId The base URI of the document, or null if not
 * known.
 * @param publicId The public identifier of the document, or null
 * if not known.
 * @param stream A byte input stream.
 * @param encoding The suggested encoding, or null if unknown.
 * @exception java.lang.Exception Any exception thrown by your
 * own handlers, or any derivation of java.io.IOException
 * thrown by the parser itself.
 */
public void parse (String systemId, String publicId,
                   InputStream stream, String encoding)
    throws Exception
{
    final Reader noReader = null;
    doParse(systemId, publicId, noReader, stream, encoding);
}
/**
 * Parse an XML document supplied as a stream of characters.
 * The URI you pass serves as the base URI for resolving relative
 * links; the document itself is read by Ælfred from the given
 * reader. No byte stream and no encoding are passed on to doParse.
 * The same parser object may be reused for several documents, but
 * only one thread may be inside this method for a given object at
 * any one time.
 * @param systemId The base URI of the document, or null if not
 * known.
 * @param publicId The public identifier of the document, or null
 * if not known.
 * @param reader A character stream.
 * @exception java.lang.Exception Any exception thrown by your
 * own handlers, or any derivation of java.io.IOException
 * thrown by the parser itself.
 */
public void parse (String systemId, String publicId, Reader reader)
    throws Exception
{
    final InputStream noStream = null;
    final String noEncoding = null;
    doParse(systemId, publicId, reader, noStream, noEncoding);
}
private synchronized void doParse (String systemId, String publicId,
Reader reader, InputStream stream,
String encoding)
throws java.lang.Exception
{
basePublicId = publicId;
baseURI = systemId;
baseReader = reader;
baseInputStream = stream;
initializeVariables();
// Set the default entities here.
setInternalEntity(intern("amp"), "&");
setInternalEntity(intern("lt"), "<");
setInternalEntity(intern("gt"), ">");
setInternalEntity(intern("apos"), "'");
setInternalEntity(intern("quot"), """);
if (handler != null) {
handler.startDocument();
}
pushURL("[document]", basePublicId, baseURI, baseReader, baseInputStream,
encoding);
parseDocument();
if (handler != null) {
handler.endDocument();
}
cleanupVariables();
}
////////////////////////////////////////////////////////////////////////
// Constants.
////////////////////////////////////////////////////////////////////////
//
// Element content types.
//
/**
 * Content type: no declaration has been seen for the element.
 * @see #getElementContentType
 */
public static final int CONTENT_UNDECLARED = 0;
/**
 * Content type: the element is declared with a content model of ANY.
 * @see #getElementContentType
 */
public static final int CONTENT_ANY = 1;
/**
 * Content type: the element is declared EMPTY.
 * @see #getElementContentType
 */
public static final int CONTENT_EMPTY = 2;
/**
 * Content type: the element is declared with mixed content.
 * @see #getElementContentType
 */
public static final int CONTENT_MIXED = 3;
/**
 * Content type: the element is declared with element content.
 * @see #getElementContentType
 */
public static final int CONTENT_ELEMENTS = 4;
//
// Entity types.
//
/**
 * Entity type: no declaration has been seen for the entity.
 * @see #getEntityType
 */
public static final int ENTITY_UNDECLARED = 0;
/**
 * Entity type: an internal entity.
 * @see #getEntityType
 */
public static final int ENTITY_INTERNAL = 1;
/**
 * Entity type: an external entity holding non-XML data.
 * @see #getEntityType
 */
public static final int ENTITY_NDATA = 2;
/**
 * Entity type: an external entity holding XML data.
 * @see #getEntityType
 */
public static final int ENTITY_TEXT = 3;
//
// Attribute types.
//
/**
 * Attribute type: the attribute is not declared for this element type.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_UNDECLARED = 0;
/**
 * Attribute type: the value is a plain string (CDATA).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_CDATA = 1;
/**
 * Attribute type: the value is a unique identifier (ID).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_ID = 2;
/**
 * Attribute type: the value refers to a unique identifier (IDREF).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_IDREF = 3;
/**
 * Attribute type: the value is a list of ID references (IDREFS).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_IDREFS = 4;
/**
 * Attribute type: the value is the name of an entity (ENTITY).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_ENTITY = 5;
/**
 * Attribute type: the value is a list of entity names (ENTITIES).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_ENTITIES = 6;
/**
 * Attribute type: the value is a name token (NMTOKEN).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_NMTOKEN = 7;
/**
 * Attribute type: the value is a list of name tokens (NMTOKENS).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_NMTOKENS = 8;
/**
 * Attribute type: the value is one token out of an enumeration.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_ENUMERATED = 9;
/**
 * Attribute type: the value is the name of a notation (NOTATION).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_NOTATION = 10;
//
// When the class is loaded, populate the hash table of
// attribute types.
//
/**
 * Hash table of attribute types: maps an attribute type keyword
 * (for example "CDATA" or "IDREF") to the matching ATTRIBUTE_*
 * constant, boxed as an Integer.
 * ATTRIBUTE_ENUMERATED and ATTRIBUTE_UNDECLARED have no keyword
 * entry here.
 */
private static Hashtable attributeTypeHash;
static {
    attributeTypeHash = new Hashtable();
    // Integer.valueOf is used rather than the Integer(int)
    // constructor, which is deprecated for removal.
    attributeTypeHash.put("CDATA", Integer.valueOf(ATTRIBUTE_CDATA));
    attributeTypeHash.put("ID", Integer.valueOf(ATTRIBUTE_ID));
    attributeTypeHash.put("IDREF", Integer.valueOf(ATTRIBUTE_IDREF));
    attributeTypeHash.put("IDREFS", Integer.valueOf(ATTRIBUTE_IDREFS));
    attributeTypeHash.put("ENTITY", Integer.valueOf(ATTRIBUTE_ENTITY));
    attributeTypeHash.put("ENTITIES", Integer.valueOf(ATTRIBUTE_ENTITIES));
    attributeTypeHash.put("NMTOKEN", Integer.valueOf(ATTRIBUTE_NMTOKEN));
    attributeTypeHash.put("NMTOKENS", Integer.valueOf(ATTRIBUTE_NMTOKENS));
    attributeTypeHash.put("NOTATION", Integer.valueOf(ATTRIBUTE_NOTATION));
}
//
// Identifiers for the character encodings the parser tells apart.
// NOTE(review): the numeric suffixes on the UCS names presumably
// spell out the byte order -- confirm against the detection code.
//
private static final int ENCODING_UTF_8 = 1;
private static final int ENCODING_ISO_8859_1 = 2;
private static final int ENCODING_UCS_2_12 = 3;
private static final int ENCODING_UCS_2_21 = 4;
private static final int ENCODING_UCS_4_1234 = 5;
private static final int ENCODING_UCS_4_4321 = 6;
private static final int ENCODING_UCS_4_2143 = 7;
private static final int ENCODING_UCS_4_3412 = 8;
//
// Kinds of attribute default value.
//
/**
 * Default value kind: the attribute has not been declared.
 * @see #getAttributeDefaultValueType
 */
public static final int ATTRIBUTE_DEFAULT_UNDECLARED = 0;
/**
 * Default value kind: a literal default value was specified.
 * @see #getAttributeDefaultValueType
 * @see #getAttributeDefaultValue
 */
public static final int ATTRIBUTE_DEFAULT_SPECIFIED = 1;
/**
 * Default value kind: the attribute was declared #IMPLIED.
 * @see #getAttributeDefaultValueType
 */
public static final int ATTRIBUTE_DEFAULT_IMPLIED = 2;
/**
 * Default value kind: the attribute was declared #REQUIRED.
 * @see #getAttributeDefaultValueType
 */
public static final int ATTRIBUTE_DEFAULT_REQUIRED = 3;
/**
 * Default value kind: the attribute was declared #FIXED.
 * @see #getAttributeDefaultValueType
 * @see #getAttributeDefaultValue
 */
public static final int ATTRIBUTE_DEFAULT_FIXED = 4;
//
// Kinds of input source.
//
private static final int INPUT_NONE = 0;
private static final int INPUT_INTERNAL = 1;
private static final int INPUT_EXTERNAL = 2;
private static final int INPUT_STREAM = 3;
private static final int INPUT_BUFFER = 4;
private static final int INPUT_READER = 5;
//
// Bit flags controlling how a literal is read; each value is a
// distinct power of two.
//
private static final int LIT_CHAR_REF = 1;
private static final int LIT_ENTITY_REF = 2;
private static final int LIT_PE_REF = 4;
private static final int LIT_NORMALIZE = 8;
//
// Parsing contexts.
//
private static final int CONTEXT_NONE = 0;
private static final int CONTEXT_DTD = 1;
private static final int CONTEXT_ENTITYVALUE = 2;
private static final int CONTEXT_ATTRIBUTEVALUE = 3;
//////////////////////////////////////////////////////////////////////
// Error reporting.
//////////////////////////////////////////////////////////////////////
/**
 * Report an error to the handler.
 * The error count is incremented on every call. The text found
 * and the text expected, when supplied, are appended to the
 * message. If a handler is set, it is called with the message,
 * the URL of the current external entity (null when there is
 * none), and the current line and column.
 * @param message The error message.
 * @param textFound The text that caused the error (or null).
 * @param textExpected The text that was expected (or null).
 * @exception java.lang.Exception Anything thrown by the handler.
 * @see XmlHandler#error
 * @see #line
 */
void error (String message, String textFound, String textExpected)
    throws java.lang.Exception
{
    errorCount++;

    if (textFound != null) {
        message += " (found \"" + textFound + "\")";
    }
    if (textExpected != null) {
        message += " (expected \"" + textExpected + "\")";
    }

    // Without a handler there is nobody to tell.
    if (handler == null) {
        return;
    }

    String uri = (externalEntity == null)
        ? null
        : externalEntity.getURL().toString();
    handler.error(message, uri, line, column);
}
/**
 * Report an error whose offending text is a single character.
 * The character is converted to a one-character string and the
 * call is forwarded to the String-based error method.
 * @param message The error message.
 * @param textFound The character that caused the error.
 * @param textExpected The text that was expected (or null).
 * @exception java.lang.Exception Anything thrown by the handler.
 */
void error (String message, char textFound, String textExpected)
    throws java.lang.Exception
{
    String found = String.valueOf(textFound);
    error(message, found, textExpected);
}
//////////////////////////////////////////////////////////////////////
// Major syntactic productions.
//////////////////////////////////////////////////////////////////////
/**
 * Parse a complete XML document.
 *
 * [1] document ::= prolog element Misc*
 *
 * Top-level parsing routine for one document: the prolog, then
 * the document element, then any trailing Misc. As a minimum, a
 * well-formed document must have a document element, and a valid
 * document must have a prolog as well.
 * Reaching end of input after the trailing Misc is the normal way
 * out; if another character can still be read, it is reported as
 * an error.
 * @exception java.lang.Exception Anything thrown by the parsing
 * routines or the handler.
 */
void parseDocument ()
    throws java.lang.Exception
{
    parseProlog();
    require('<');
    parseElement();

    try {
        parseMisc();                    // skip all white, PIs, and comments
        char extra = readCh();          // if this doesn't throw an exception...
        error("unexpected characters after document end", extra, null);
    } catch (EOFException endOfInput) {
        // End of input here simply means the document is finished.
    }
}
/**
 * Skip over a comment.
 *
 * [18] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
 *
 * The opening delimiter has already been read by the caller; this
 * method skips input up to the closing "-->" delimiter.
 * @exception java.lang.Exception Anything thrown while skipping.
 */
void parseComment ()
    throws java.lang.Exception
{
    final String commentEnd = "-->";
    skipUntil(commentEnd);
}
/**
 * Parse a processing instruction and pass it to the handler.
 *
 * [19] PI ::= '<?' Name (S (Char* - (Char* '?>' Char*)))? '?>'
 *
 * The opening '<?' has already been read by the caller.
 * A processing instruction must start with a Name, which is its
 * target. If the closing '?>' does not follow the target at once,
 * whitespace is required and the rest is parsed up to '?>'.
 * @exception java.lang.Exception Anything thrown by the parsing
 * routines or the handler.
 */
void parsePI ()
    throws java.lang.Exception
{
    String target = readNmtoken(true);

    boolean closedImmediately = tryRead("?>");
    if (!closedImmediately) {
        requireWhitespace();
        parseUntil("?>");
    }

    if (handler == null) {
        return;
    }
    handler.processingInstruction(target, dataBufferToString());
}
/**
 * Parse a CDATA marked section.
 *
 * [20] CDSect ::= CDStart CData CDEnd
 * [21] CDStart ::= '<![CDATA['
 * [22] CData ::= (Char* - (Char* ']]>' Char*))
 * [23] CDEnd ::= ']]>'
 *
 * The opening '<![CDATA[' has already been read by the caller.
 * As the original author notes, the characters are only appended
 * to the dataBuffer here; no event is generated.
 * @exception java.lang.Exception Anything thrown while parsing.
 */
void parseCDSect ()
    throws java.lang.Exception
{
    final String sectionEnd = "]]>";
    parseUntil(sectionEnd);
}
/**
* Parse the prolog of an XML document.
*
* [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
*
* There are a couple of tricks here. First, it is necessary to
* declare the XML default attributes after the DTD (if present)
* has been read. Second, it is not possible to expand general
* references in attribute value literals until after the entire
* DTD (if present) has been parsed.
* We do not look for the XML declaration here, because it is
* handled by pushURL().
* @see pushURL
*/
void parseProlog ()
throws java.lang.Exception
{
parseMisc();
if (tryRead("
* [25] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
* [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
* [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
* | S 'standalone' Eq '"' ("yes" | "no") '"'
* [78] EncodingDecl ::= S 'encoding' Eq QEncoding
*
* ([80] to [82] are also significant.)
* (The <?xml and whitespace have already been read.)
* TODO: validate value of standalone.
* @see #parseTextDecl
* @see #checkEncoding
*/
void parseXMLDecl (boolean ignoreEncoding)
throws java.lang.Exception
{
String version;
String encodingName = null;
String standalone = null;
// Read the version.
require("version");
parseEq();
version = readLiteral(0);
if (!version.equals("1.0")) {
error("unsupported XML version", version, "1.0");
}
// Try reading an encoding declaration.
skipWhitespace();
if (tryRead("encoding")) {
parseEq();
encodingName = readLiteral(0);
checkEncoding(encodingName, ignoreEncoding);
}
// Try reading a standalone declaration
skipWhitespace();
if (tryRead("standalone")) {
parseEq();
standalone = readLiteral(0);
}
skipWhitespace();
require("?>");
}
/**
 * Parse the encoding PI (text declaration).
 *
 * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
 * [79] EncodingPI ::= '<?xml' S 'encoding' Eq QEncoding S? '?>'
 * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
 * [81] Encoding ::= LatinName
 * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
 *
 * The '<?xml' and the whitespace after it have already been read.
 * A version may come first; when present it must be "1.0" and be
 * followed by whitespace. The encoding is mandatory.
 * @param ignoreEncoding Passed through to checkEncoding together
 * with the declared encoding name.
 * @exception java.lang.Exception Anything thrown by the parsing
 * routines or the handler.
 * @see #parseXMLDecl
 * @see #checkEncoding
 */
void parseTextDecl (boolean ignoreEncoding)
    throws java.lang.Exception
{
    // Read an optional version.
    if (tryRead("version")) {
        parseEq();
        String declaredVersion = readLiteral(0);
        if (!declaredVersion.equals("1.0")) {
            error("unsupported XML version", declaredVersion, "1.0");
        }
        requireWhitespace();
    }

    // Read the encoding.
    require("encoding");
    parseEq();
    String declaredEncoding = readLiteral(0);
    checkEncoding(declaredEncoding, ignoreEncoding);

    skipWhitespace();
    require("?>");
}
/**
 * Check that the encoding specified makes sense.
 * Compare what the author has specified in the XML declaration
 * or encoding PI with what we have detected.
 * This is also important for distinguishing among the various
 * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect
 * those).
 * The declared name is compared case-insensitively. When the
 * detected encoding is UTF-8 and the declared name is ISO-8859-1,
 * the current encoding is switched to ISO-8859-1; any other
 * mismatch is reported through error().
 * @param encodingName The name of the encoding specified by the user.
 * @param ignoreEncoding If true, the declared name is not checked
 * at all and the method returns at once.
 * @exception java.lang.Exception Anything thrown by error().
 * @see #parseXMLDecl
 * @see #parseTextDecl
 */
void checkEncoding (String encodingName, boolean ignoreEncoding)
    throws java.lang.Exception
{
    if (ignoreEncoding) {
        return;
    }

    // Upper-case with a fixed locale: under the default locale the
    // result can differ (in Turkish, "i" becomes a dotted capital I),
    // and "iso-8859-1" would then fail to match below.
    encodingName = encodingName.toUpperCase(java.util.Locale.ENGLISH);

    switch (encoding) {
    // 8-bit encodings
    case ENCODING_UTF_8:
        if (encodingName.equals("ISO-8859-1")) {
            encoding = ENCODING_ISO_8859_1;
        } else if (!encodingName.equals("UTF-8")) {
            error("unsupported 8-bit encoding",
                  encodingName,
                  "UTF-8 or ISO-8859-1");
        }
        break;
    // 16-bit encodings
    case ENCODING_UCS_2_12:
    case ENCODING_UCS_2_21:
        if (!encodingName.equals("ISO-10646-UCS-2") &&
            !encodingName.equals("UTF-16")) {
            error("unsupported 16-bit encoding",
                  encodingName,
                  "ISO-10646-UCS-2");
        }
        break;
    // 32-bit encodings
    case ENCODING_UCS_4_1234:
    case ENCODING_UCS_4_4321:
    case ENCODING_UCS_4_2143:
    case ENCODING_UCS_4_3412:
        if (!encodingName.equals("ISO-10646-UCS-4")) {
            error("unsupported 32-bit encoding",
                  encodingName,
                  "ISO-10646-UCS-4");
        }
        break;
    }
}
/**
* Parse miscellaneous markup outside the document element and DOCTYPE
* declaration.
*
* [27] Misc ::= Comment | PI | S
*
*/
void parseMisc ()
throws java.lang.Exception
{
while (true)
{
skipWhitespace();
if (tryRead(""))
{parsePI();}
else if (tryRead(" |