alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Java example source code file (Parser.java)

This example Java source code file (Parser.java) is included in the alvinalexander.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Learn more about this Java project at its project page.

Java - Java tags/keywords

cdata, dtd, eos, exception, input, ioexception, notation, pair, ph_doc_start, string, suppresswarnings, utf-16, util, xmlns

The Parser.java Java example source code

/*
 * Copyright (c) 2012, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package jdk.internal.util.xml.impl;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import jdk.internal.org.xml.sax.InputSource;
import jdk.internal.org.xml.sax.SAXException;

/**
 * XML non-validating parser engine.
 */
public abstract class Parser {

    // Message handed to panic(...) whenever the parser detects malformed input.
    public final static String FAULT = "";
    // Size passed to new Input(...) when an external DTD subset is pushed.
    protected final static int BUFFSIZE_READER = 512;
    protected final static int BUFFSIZE_PARSER = 128;
    /**
     * The end of stream character.
     */
    public final static char EOS = 0xffff;
    private Pair mNoNS; // there is no namespace
    private Pair mXml;  // the xml namespace
    private Map<String, Input> mEnt;  // the entities look up table
    private Map<String, Input> mPEnt; // the parameter entities look up table
    protected boolean mIsSAlone;     // xml decl standalone flag
    protected boolean mIsSAloneSet;  // standalone is explicitly set
    protected boolean mIsNSAware;    // if true - namespace aware mode
    protected int mPh;  // current phase of document processing
    protected final static int PH_BEFORE_DOC = -1;  // before parsing
    protected final static int PH_DOC_START = 0;   // document start
    protected final static int PH_MISC_DTD = 1;   // misc before DTD
    protected final static int PH_DTD = 2;   // DTD
    protected final static int PH_DTD_MISC = 3;   // misc after DTD
    protected final static int PH_DOCELM = 4;   // document's element
    protected final static int PH_DOCELM_MISC = 5;   // misc after element
    protected final static int PH_AFTER_DOC = 6;   // after parsing
    protected int mEvt;  // current event type
    protected final static int EV_NULL = 0;   // unknown
    protected final static int EV_ELM = 1;   // empty element
    protected final static int EV_ELMS = 2;   // start element
    protected final static int EV_ELME = 3;   // end element
    protected final static int EV_TEXT = 4;   // textual content
    protected final static int EV_WSPC = 5;   // white space content
    protected final static int EV_PI = 6;   // processing instruction
    protected final static int EV_CDAT = 7;   // character data
    protected final static int EV_COMM = 8;   // comment
    protected final static int EV_DTD = 9;   // document type definition
    protected final static int EV_ENT = 10;  // skipped entity
    private char mESt; // built-in entity recognizer state
    // mESt values:
    //   0x100   : the initial state
    //   > 0x100 : unrecognized name
    //   < 0x100 : replacement character
    protected char[] mBuff;       // parser buffer
    protected int mBuffIdx;    // index of the last char
    protected Pair mPref;       // stack of prefixes
    protected Pair mElm;        // stack of elements
    // Layout of the mAttL structure declared below:
    // mAttL.chars - element qname
    // mAttL.next  - next element
    // mAttL.list  - list of attributes defined on this element
    // mAttL.list.chars - attribute qname
    // mAttL.list.id    - a char representing attribute's type see below
    // mAttL.list.next  - next attribute defined on the element
    // mAttL.list.list  - default value structure or null
    // mAttL.list.list.chars - "name='value' " chars array for Input
    //
    // Attribute type character values:
    // 'i' - "ID"
    // 'r' - "IDREF"
    // 'R' - "IDREFS"
    // 'n' - "ENTITY"
    // 'N' - "ENTITIES"
    // 't' - "NMTOKEN"
    // 'T' - "NMTOKENS"
    // 'u' - enumeration type
    // 'o' - "NOTATION"
    // 'c' - "CDATA"
    // see also: bkeyword() and atype()
    //
    protected Pair mAttL;       // list of defined attrs by element name
    protected Input mDoc;        // document entity
    protected Input mInp;        // stack of entities
    private char[] mChars;      // reading buffer
    private int mChLen;      // current capacity
    private int mChIdx;      // index to the next char
    protected Attrs mAttrs;      // attributes of the curr. element
    private String[] mItems;      // attributes array of the curr. element
    private char mAttrIdx;    // attributes counter/index
    private String mUnent;  // unresolved entity name
    private Pair mDltd;   // deleted objects for reuse
    /**
     * Default prefixes.
     *
     * Each array starts with a numeric header character (0, 4 and 6
     * respectively) which is followed by the letters of the prefix, if any.
     */
    private final static char NONS[];
    private final static char XML[];
    private final static char XMLNS[];

    static {
        // Header character first, then the prefix letters.
        NONS = new char[] {(char) 0};
        XML = new char[] {(char) 4, 'x', 'm', 'l'};
        XMLNS = new char[] {(char) 6, 'x', 'm', 'l', 'n', 's'};
    }
    /**
     * ASCII character type array.
     *
     * This array maps an ASCII (7 bit) character to the character type.<br />
     * Possible character type values are:<br />
     * - ' ' for any kind of white space character;<br />
     * - 'a' for any lower case alphabetical character value;<br />
     * - 'A' for any upper case alphabetical character value;<br />
     * - 'd' for any decimal digit character value;<br />
     * - 'z' for any character less than ' ' except '\t', '\n', '\r';<br />
     * An ASCII (7 bit) character which does not fall in any category listed
     * above is mapped to itself.
     */
    private static final byte asctyp[];
    /**
     * NMTOKEN character type array.
     *
     * This array maps an ASCII (7 bit) character to the character type.<br />
     * Possible character type values are:<br />
     * - 0 for underscore ('_') or any lower and upper case alphabetical
     * character value;<br />
     * - 1 for colon (':') character;<br />
     * - 2 for dash ('-') and dot ('.') or any decimal digit character
     * value;<br />
     * - 3 for any kind of white space character
' { panic(FAULT); } mEvt = EV_ELM; break; default: panic(FAULT); } break; } break; case 1: // read white space switch (ch) { case ' ': case '\t': case '\n': bappend(ch); break; case '\r': // EOL processing [#2.11] if (getch() != '\n') { bkch(); } bappend('\n'); break; case '<': mEvt = EV_WSPC; bkch(); bflash_ws(); break; default: bkch(); st = 2; break; } break; case 2: // read the text content of the element switch (ch) { case '&': if (mUnent == null) { // There was no unresolved entity on previous step. if ((mUnent = ent('x')) != null) { mEvt = EV_TEXT; bkch(); // move back to ';' after entity name setch('&'); // parser must be back on next step bflash(); } } else { // There was unresolved entity on previous step. mEvt = EV_ENT; skippedEnt(mUnent); mUnent = null; } break; case '<': mEvt = EV_TEXT; bkch(); bflash(); break; case '\r': // EOL processing [#2.11] if (getch() != '\n') { bkch(); } bappend('\n'); break; case EOS: panic(FAULT); default: bappend(ch); break; } break; default: panic(FAULT); } } return mEvt; } /** * Parses the document type declaration. * * @exception Exception is parser specific exception form panic method. 
* @exception IOException */ private void dtd() throws Exception { char ch; String str = null; String name = null; Pair psid = null; // read 'DOCTYPE' if ("DOCTYPE".equals(name(false)) != true) { panic(FAULT); } mPh = PH_DTD; // DTD for (short st = 0; st >= 0;) { ch = getch(); switch (st) { case 0: // read the document type name if (chtyp(ch) != ' ') { bkch(); name = name(mIsNSAware); wsskip(); st = 1; // read 'PUPLIC' or 'SYSTEM' } break; case 1: // read 'PUPLIC' or 'SYSTEM' switch (chtyp(ch)) { case 'A': bkch(); psid = pubsys(' '); st = 2; // skip spaces before internal subset docType(name, psid.name, psid.value); break; case '[': bkch(); st = 2; // skip spaces before internal subset docType(name, null, null); break; case '>': bkch(); st = 3; // skip spaces after internal subset docType(name, null, null); break; default: panic(FAULT); } break; case 2: // skip spaces before internal subset switch (chtyp(ch)) { case '[': // Process internal subset dtdsub(); st = 3; // skip spaces after internal subset break; case '>': // There is no internal subset bkch(); st = 3; // skip spaces after internal subset break; case ' ': // skip white spaces break; default: panic(FAULT); } break; case 3: // skip spaces after internal subset switch (chtyp(ch)) { case '>': if (psid != null) { // Report the DTD external subset InputSource is = resolveEnt(name, psid.name, psid.value); if (is != null) { if (mIsSAlone == false) { // Set the end of DTD external subset char bkch(); setch(']'); // Set the DTD external subset InputSource push(new Input(BUFFSIZE_READER)); setinp(is); mInp.pubid = psid.name; mInp.sysid = psid.value; // Parse the DTD external subset dtdsub(); } else { // Unresolved DTD external subset skippedEnt("[dtd]"); // Release reader and stream if (is.getCharacterStream() != null) { try { is.getCharacterStream().close(); } catch (IOException ioe) { } } if (is.getByteStream() != null) { try { is.getByteStream().close(); } catch (IOException ioe) { } } } } else { // Unresolved 
DTD external subset skippedEnt("[dtd]"); } del(psid); } st = -1; // end of DTD break; case ' ': // skip white spaces break; default: panic(FAULT); } break; default: panic(FAULT); } } } /** * Parses the document type declaration subset. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void dtdsub() throws Exception { char ch; for (short st = 0; st >= 0;) { ch = getch(); switch (st) { case 0: // skip white spaces before a declaration switch (chtyp(ch)) { case '<': ch = getch(); switch (ch) { case '?': pi(); break; case '!': ch = getch(); bkch(); if (ch == '-') { comm(); break; } // A markup or an entity declaration bntok(); switch (bkeyword()) { case 'n': dtdent(); break; case 'a': dtdattl(); // parse attributes declaration break; case 'e': dtdelm(); // parse element declaration break; case 'o': dtdnot(); // parse notation declaration break; default: panic(FAULT); // unsupported markup declaration break; } st = 1; // read the end of declaration break; default: panic(FAULT); break; } break; case '%': // A parameter entity reference pent(' '); break; case ']': // End of DTD subset st = -1; break; case ' ': // Skip white spaces break; case 'Z': // End of stream if (getch() != ']') { panic(FAULT); } st = -1; break; default: panic(FAULT); } break; case 1: // read the end of declaration switch (ch) { case '>': // there is no notation st = 0; // skip white spaces before a declaration break; case ' ': case '\n': case '\r': case '\t': // Skip white spaces break; default: panic(FAULT); break; } break; default: panic(FAULT); } } } /** * Parses an entity declaration. This method fills the general ( * <code>mEnt) and parameter * ( * <code>mPEnt) entity look up table. * * @exception Exception is parser specific exception form panic method. 
* @exception IOException */ @SuppressWarnings("fallthrough") private void dtdent() throws Exception { String str = null; char[] val = null; Input inp = null; Pair ids = null; char ch; for (short st = 0; st >= 0;) { ch = getch(); switch (st) { case 0: // skip white spaces before entity name switch (chtyp(ch)) { case ' ': // Skip white spaces break; case '%': // Parameter entity or parameter entity declaration. ch = getch(); bkch(); if (chtyp(ch) == ' ') { // Parameter entity declaration. wsskip(); str = name(false); switch (chtyp(wsskip())) { case 'A': // Read the external identifier ids = pubsys(' '); if (wsskip() == '>') { // External parsed entity if (mPEnt.containsKey(str) == false) { // [#4.2] inp = new Input(); inp.pubid = ids.name; inp.sysid = ids.value; mPEnt.put(str, inp); } } else { panic(FAULT); } del(ids); st = -1; // the end of declaration break; case '\"': case '\'': // Read the parameter entity value bqstr('d'); // Create the parameter entity value val = new char[mBuffIdx + 1]; System.arraycopy(mBuff, 1, val, 1, val.length - 1); // Add surrounding spaces [#4.4.8] val[0] = ' '; // Add the entity to the entity look up table if (mPEnt.containsKey(str) == false) { // [#4.2] inp = new Input(val); inp.pubid = mInp.pubid; inp.sysid = mInp.sysid; inp.xmlenc = mInp.xmlenc; inp.xmlver = mInp.xmlver; mPEnt.put(str, inp); } st = -1; // the end of declaration break; default: panic(FAULT); break; } } else { // Parameter entity reference. 
pent(' '); } break; default: bkch(); str = name(false); st = 1; // read entity declaration value break; } break; case 1: // read entity declaration value switch (chtyp(ch)) { case '\"': // internal entity case '\'': bkch(); bqstr('d'); // read a string into the buffer if (mEnt.get(str) == null) { // Create general entity value val = new char[mBuffIdx]; System.arraycopy(mBuff, 1, val, 0, val.length); // Add the entity to the entity look up table if (mEnt.containsKey(str) == false) { // [#4.2] inp = new Input(val); inp.pubid = mInp.pubid; inp.sysid = mInp.sysid; inp.xmlenc = mInp.xmlenc; inp.xmlver = mInp.xmlver; mEnt.put(str, inp); } } st = -1; // the end of declaration break; case 'A': // external entity bkch(); ids = pubsys(' '); switch (wsskip()) { case '>': // external parsed entity if (mEnt.containsKey(str) == false) { // [#4.2] inp = new Input(); inp.pubid = ids.name; inp.sysid = ids.value; mEnt.put(str, inp); } break; case 'N': // external general unparsed entity if ("NDATA".equals(name(false)) == true) { wsskip(); unparsedEntDecl(str, ids.name, ids.value, name(false)); break; } default: panic(FAULT); break; } del(ids); st = -1; // the end of declaration break; case ' ': // Skip white spaces break; default: panic(FAULT); break; } break; default: panic(FAULT); } } } /** * Parses an element declaration. * * This method parses the declaration up to the closing angle bracket. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private void dtdelm() throws Exception { // This is stub implementation which skips an element // declaration. wsskip(); name(mIsNSAware); char ch; while (true) { ch = getch(); switch (ch) { case '>': bkch(); return; case EOS: panic(FAULT); default: break; } } } /** * Parses an attribute list declaration. * * This method parses the declaration up to the closing angle bracket. * * @exception Exception is parser specific exception form panic method. 
* @exception IOException */ private void dtdattl() throws Exception { char elmqn[] = null; Pair elm = null; char ch; for (short st = 0; st >= 0;) { ch = getch(); switch (st) { case 0: // read the element name switch (chtyp(ch)) { case 'a': case 'A': case '_': case 'X': case ':': bkch(); // Get the element from the list or add a new one. elmqn = qname(mIsNSAware); elm = find(mAttL, elmqn); if (elm == null) { elm = pair(mAttL); elm.chars = elmqn; mAttL = elm; } st = 1; // read an attribute declaration break; case ' ': break; case '%': pent(' '); break; default: panic(FAULT); break; } break; case 1: // read an attribute declaration switch (chtyp(ch)) { case 'a': case 'A': case '_': case 'X': case ':': bkch(); dtdatt(elm); if (wsskip() == '>') { return; } break; case ' ': break; case '%': pent(' '); break; default: panic(FAULT); break; } break; default: panic(FAULT); break; } } } /** * Parses an attribute declaration. * * The attribute uses the following fields of Pair object: chars - characters * of qualified name id - the type identifier of the attribute list - a pair * which holds the default value (chars field) * * @param elm An object which represents all defined attributes on an * element. * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private void dtdatt(Pair elm) throws Exception { char attqn[] = null; Pair att = null; char ch; for (short st = 0; st >= 0;) { ch = getch(); switch (st) { case 0: // the attribute name switch (chtyp(ch)) { case 'a': case 'A': case '_': case 'X': case ':': bkch(); // Get the attribute from the list or add a new one. 
attqn = qname(mIsNSAware); att = find(elm.list, attqn); if (att == null) { // New attribute declaration att = pair(elm.list); att.chars = attqn; elm.list = att; } else { // Do not override the attribute declaration [#3.3] att = pair(null); att.chars = attqn; att.id = 'c'; } wsskip(); st = 1; break; case '%': pent(' '); break; case ' ': break; default: panic(FAULT); break; } break; case 1: // the attribute type switch (chtyp(ch)) { case '(': att.id = 'u'; // enumeration type st = 2; // read the first element of the list break; case '%': pent(' '); break; case ' ': break; default: bkch(); bntok(); // read type id att.id = bkeyword(); switch (att.id) { case 'o': // NOTATION if (wsskip() != '(') { panic(FAULT); } ch = getch(); st = 2; // read the first element of the list break; case 'i': // ID case 'r': // IDREF case 'R': // IDREFS case 'n': // ENTITY case 'N': // ENTITIES case 't': // NMTOKEN case 'T': // NMTOKENS case 'c': // CDATA wsskip(); st = 4; // read default declaration break; default: panic(FAULT); break; } break; } break; case 2: // read the first element of the list switch (chtyp(ch)) { case 'a': case 'A': case 'd': case '.': case ':': case '-': case '_': case 'X': bkch(); switch (att.id) { case 'u': // enumeration type bntok(); break; case 'o': // NOTATION mBuffIdx = -1; bname(false); break; default: panic(FAULT); break; } wsskip(); st = 3; // read next element of the list break; case '%': pent(' '); break; case ' ': break; default: panic(FAULT); break; } break; case 3: // read next element of the list switch (ch) { case ')': wsskip(); st = 4; // read default declaration break; case '|': wsskip(); switch (att.id) { case 'u': // enumeration type bntok(); break; case 'o': // NOTATION mBuffIdx = -1; bname(false); break; default: panic(FAULT); break; } wsskip(); break; case '%': pent(' '); break; default: panic(FAULT); break; } break; case 4: // read default declaration switch (ch) { case '#': bntok(); switch (bkeyword()) { case 'F': // FIXED switch 
(wsskip()) { case '\"': case '\'': st = 5; // read the default value break; case EOS: panic(FAULT); default: st = -1; break; } break; case 'Q': // REQUIRED case 'I': // IMPLIED st = -1; break; default: panic(FAULT); break; } break; case '\"': case '\'': bkch(); st = 5; // read the default value break; case ' ': case '\n': case '\r': case '\t': break; case '%': pent(' '); break; default: bkch(); st = -1; break; } break; case 5: // read the default value switch (ch) { case '\"': case '\'': bkch(); bqstr('d'); // the value in the mBuff now att.list = pair(null); // Create a string like "attqname='value' " att.list.chars = new char[att.chars.length + mBuffIdx + 3]; System.arraycopy( att.chars, 1, att.list.chars, 0, att.chars.length - 1); att.list.chars[att.chars.length - 1] = '='; att.list.chars[att.chars.length] = ch; System.arraycopy( mBuff, 1, att.list.chars, att.chars.length + 1, mBuffIdx); att.list.chars[att.chars.length + mBuffIdx + 1] = ch; att.list.chars[att.chars.length + mBuffIdx + 2] = ' '; st = -1; break; default: panic(FAULT); break; } break; default: panic(FAULT); break; } } } /** * Parses a notation declaration. * * This method parses the declaration up to the closing angle bracket. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void dtdnot() throws Exception { wsskip(); String name = name(false); wsskip(); Pair ids = pubsys('N'); notDecl(name, ids.name, ids.value); del(ids); } /** * Parses an attribute. * * This recursive method is responsible for prefix addition * ( * <code>mPref) on the way down. The element's start tag end triggers * the return process. The method then on it's way back resolves prefixes * and accumulates attributes. * * <p>att.num carries attribute flags where: 0x1 - attribute is * declared in DTD (attribute decalration had been read); 0x2 - attribute's * default value is used.</p> * * @param att An object which reprecents current attribute. 
* @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private void attr(Pair att) throws Exception { switch (wsskip()) { case '/': case '>': if ((att.num & 0x2) == 0) { // all attributes have been read att.num |= 0x2; // set default attribute flag Input inp = mInp; // Go through all attributes defined on current element. for (Pair def = mElm.list; def != null; def = def.next) { if (def.list == null) // no default value { continue; } // Go through all attributes defined on current // element and add defaults. Pair act = find(att.next, def.chars); if (act == null) { push(new Input(def.list.chars)); } } if (mInp != inp) { // defaults have been added attr(att); return; } } // Ensure the attribute string array capacity mAttrs.setLength(mAttrIdx); mItems = mAttrs.mItems; return; case EOS: panic(FAULT); default: // Read the attribute name and value att.chars = qname(mIsNSAware); att.name = att.local(); String type = atype(att); // sets attribute's type on att.id wsskip(); if (getch() != '=') { panic(FAULT); } bqstr((char) att.id); // read the value with normalization. String val = new String(mBuff, 1, mBuffIdx); Pair next = pair(att); next.num = (att.num & ~0x1); // inherit attribute flags // Put a namespace declaration on top of the prefix stack if ((mIsNSAware == false) || (isdecl(att, val) == false)) { // An ordinary attribute mAttrIdx++; attr(next); // recursive call to parse the next attribute mAttrIdx--; // Add the attribute to the attributes string array char idx = (char) (mAttrIdx << 3); mItems[idx + 1] = att.qname(); // attr qname mItems[idx + 2] = (mIsNSAware) ? 
att.name : ""; // attr local name mItems[idx + 3] = val; // attr value mItems[idx + 4] = type; // attr type switch (att.num & 0x3) { case 0x0: mItems[idx + 5] = null; break; case 0x1: // declared attribute mItems[idx + 5] = "d"; break; default: // 0x2, 0x3 - default attribute always declared mItems[idx + 5] = "D"; break; } // Resolve the prefix if any and report the attribute // NOTE: The attribute does not accept the default namespace. mItems[idx + 0] = (att.chars[0] != 0) ? rslv(att.chars) : ""; } else { // A namespace declaration. mPref.name contains prefix and // mPref.value contains namespace URI set by isdecl method. // Report a start of the new mapping newPrefix(); // Recursive call to parse the next attribute attr(next); // NOTE: The namespace declaration is not reported. } del(next); break; } } /** * Retrieves attribute type. * * This method sets the type of normalization in the attribute * <code>id field and returns the name of attribute type. * * @param att An object which represents current attribute. * @return The name of the attribute type. * @exception Exception is parser specific exception form panic method. */ private String atype(Pair att) throws Exception { Pair attr; // CDATA-type normalization by default [#3.3.3] att.id = 'c'; if (mElm.list == null || (attr = find(mElm.list, att.chars)) == null) { return "CDATA"; } att.num |= 0x1; // attribute is declared // Non-CDATA normalization except when the attribute type is CDATA. att.id = 'i'; switch (attr.id) { case 'i': return "ID"; case 'r': return "IDREF"; case 'R': return "IDREFS"; case 'n': return "ENTITY"; case 'N': return "ENTITIES"; case 't': return "NMTOKEN"; case 'T': return "NMTOKENS"; case 'u': return "NMTOKEN"; case 'o': return "NOTATION"; case 'c': att.id = 'c'; return "CDATA"; default: panic(FAULT); } return null; } /** * Parses a comment. * * The '<!' part is read in dispatcher so the method starts * with first '-' after '<!'. 
* * @exception Exception is parser specific exception form panic method. */ @SuppressWarnings("fallthrough") private void comm() throws Exception { if (mPh == PH_DOC_START) { mPh = PH_MISC_DTD; // misc before DTD } // '<!' has been already read by dispetcher. char ch; mBuffIdx = -1; for (short st = 0; st >= 0;) { ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch(); if (ch == EOS) { panic(FAULT); } switch (st) { case 0: // first '-' of the comment open if (ch == '-') { st = 1; } else { panic(FAULT); } break; case 1: // secind '-' of the comment open if (ch == '-') { st = 2; } else { panic(FAULT); } break; case 2: // skip the comment body switch (ch) { case '-': st = 3; break; default: bappend(ch); break; } break; case 3: // second '-' of the comment close switch (ch) { case '-': st = 4; break; default: bappend('-'); bappend(ch); st = 2; break; } break; case 4: // '>' of the comment close if (ch == '>') { comm(mBuff, mBuffIdx + 1); st = -1; break; } // else - panic [#2.5 compatibility note] default: panic(FAULT); } } } /** * Parses a processing instruction. * * The '<?' is read in dispatcher so the method starts with * first character of PI target name after '<?'. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void pi() throws Exception { // '<?' has been already read by dispetcher. 
char ch; String str = null; mBuffIdx = -1; for (short st = 0; st >= 0;) { ch = getch(); if (ch == EOS) { panic(FAULT); } switch (st) { case 0: // read the PI target name switch (chtyp(ch)) { case 'a': case 'A': case '_': case ':': case 'X': bkch(); str = name(false); // PI target name may not be empty string [#2.6] // PI target name 'XML' is reserved [#2.6] if ((str.length() == 0) || (mXml.name.equals(str.toLowerCase()) == true)) { panic(FAULT); } // This is processing instruction if (mPh == PH_DOC_START) // the begining of the document { mPh = PH_MISC_DTD; // misc before DTD } wsskip(); // skip spaces after the PI target name st = 1; // accumulate the PI body mBuffIdx = -1; break; default: panic(FAULT); } break; case 1: // accumulate the PI body switch (ch) { case '?': st = 2; // end of the PI body break; default: bappend(ch); break; } break; case 2: // end of the PI body switch (ch) { case '>': // PI has been read. pi(str, new String(mBuff, 0, mBuffIdx + 1)); st = -1; break; case '?': bappend('?'); break; default: bappend('?'); bappend(ch); st = 1; // accumulate the PI body break; } break; default: panic(FAULT); } } } /** * Parses a character data. * * The '<!' part is read in dispatcher so the method starts * with first '[' after '<!'. * * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void cdat() throws Exception { // '<!' has been already read by dispetcher. 
char ch; mBuffIdx = -1; for (short st = 0; st >= 0;) { ch = getch(); switch (st) { case 0: // the first '[' of the CDATA open if (ch == '[') { st = 1; } else { panic(FAULT); } break; case 1: // read "CDATA" if (chtyp(ch) == 'A') { bappend(ch); } else { if ("CDATA".equals( new String(mBuff, 0, mBuffIdx + 1)) != true) { panic(FAULT); } bkch(); st = 2; } break; case 2: // the second '[' of the CDATA open if (ch != '[') { panic(FAULT); } mBuffIdx = -1; st = 3; break; case 3: // read data before the first ']' if (ch != ']') { bappend(ch); } else { st = 4; } break; case 4: // read the second ']' or continue to read the data if (ch != ']') { bappend(']'); bappend(ch); st = 3; } else { st = 5; } break; case 5: // read '>' or continue to read the data switch (ch) { case ']': bappend(']'); break; case '>': bflash(); st = -1; break; default: bappend(']'); bappend(']'); bappend(ch); st = 3; break; } break; default: panic(FAULT); } } } /** * Reads a xml name. * * The xml name must conform "Namespaces in XML" specification. Therefore * the ':' character is not allowed in the name. This method should be used * for PI and entity names which may not have a namespace according to the * specification mentioned above. * * @param ns The true value turns namespace conformance on. * @return The name has been read. * @exception Exception When incorrect character appear in the name. * @exception IOException */ protected String name(boolean ns) throws Exception { mBuffIdx = -1; bname(ns); return new String(mBuff, 1, mBuffIdx); } /** * Reads a qualified xml name. * * The characters of a qualified name is an array of characters. The first * (chars[0]) character is the index of the colon character which separates * the prefix from the local name. If the index is zero, the name does not * contain separator or the parser works in the namespace unaware mode. The * length of qualified name is the length of the array minus one. * * @param ns The true value turns namespace conformance on. 
* @return The characters of a qualified name. * @exception Exception When incorrect character appear in the name. * @exception IOException */ protected char[] qname(boolean ns) throws Exception { mBuffIdx = -1; bname(ns); char chars[] = new char[mBuffIdx + 1]; System.arraycopy(mBuff, 0, chars, 0, mBuffIdx + 1); return chars; } /** * Reads the public or/and system identifiers. * * @param inp The input object. * @exception Exception is parser specific exception form panic method. * @exception IOException */ private void pubsys(Input inp) throws Exception { Pair pair = pubsys(' '); inp.pubid = pair.name; inp.sysid = pair.value; del(pair); } /** * Reads the public or/and system identifiers. * * @param flag The 'N' allows public id be without system id. * @return The public or/and system identifiers pair. * @exception Exception is parser specific exception form panic method. * @exception IOException */ @SuppressWarnings("fallthrough") private Pair pubsys(char flag) throws Exception { Pair ids = pair(null); String str = name(false); if ("PUBLIC".equals(str) == true) { bqstr('i'); // non-CDATA normalization [#4.2.2] ids.name = new String(mBuff, 1, mBuffIdx); switch (wsskip()) { case '\"': case '\'': bqstr(' '); ids.value = new String(mBuff, 1, mBuffIdx); break; case EOS: panic(FAULT); default: if (flag != 'N') // [#4.7] { panic(FAULT); } ids.value = null; break; } return ids; } else if ("SYSTEM".equals(str) == true) { ids.name = null; bqstr(' '); ids.value = new String(mBuff, 1, mBuffIdx); return ids; } panic(FAULT); return null; } /** * Reads an attribute value. * * The grammar which this method can read is:<br /> * <code>eqstr := S "=" qstr
* <code>qstr := S ("'" string "'") | * ('"' string '"')</code>
* This method resolves entities inside a string unless the parser parses
     * DTD.
     *
     * @param flag The '=' character forces the method to accept the '='
     * character before quoted string and read the following string as not an
     * attribute ('-'), 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization;
     * '-' - not an attribute value; 'd' - in DTD context.
     * @return The content of the quoted string as a string.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    protected String eqstr(char flag) throws Exception {
        if (flag == '=') {
            wsskip();
            if (getch() != '=') {
                panic(FAULT);
            }
        }
        // With '=' the quoted string is read in '-' mode; otherwise flag is
        // passed through to bqstr unchanged.
        bqstr((flag == '=') ? '-' : flag);
        // mBuff[0] is the placeholder bqstr writes first; the value follows it
        return new String(mBuff, 1, mBuffIdx);
    }

    /**
     * Resolves an entity.
     *
     * This method resolves built-in and character entity references. It
     * also reports external entities to the application.
     *
     * @param flag The 'x' character forces the method to report a skipped
     * entity; 'i' character - indicates non-CDATA normalization.
     * @return Name of unresolved entity or <code>null</code> if entity had
     * been resolved successfully.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private String ent(char flag) throws Exception {
        char ch;
        // Buffer position of the '&' appended below; used to roll the buffer
        // back once the reference has been read.
        int idx = mBuffIdx + 1;
        Input inp = null;
        String str = null;
        mESt = 0x100;  // reset the built-in entity recognizer
        bappend('&');
        for (short st = 0; st >= 0;) {
            ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
            switch (st) {
                case 0:     // the first character of the entity name
                case 1:     // read built-in entity name
                    switch (chtyp(ch)) {
                        case 'd':
                        case '.':
                        case '-':
                            // these characters may not start a name
                            if (st != 1) {
                                panic(FAULT);
                            }
                        // fall through: append as a name character
                        case 'a':
                        case 'A':
                        case '_':
                        case 'X':
                            bappend(ch);
                            eappend(ch);
                            st = 1;
                            break;

                        case ':':
                            if (mIsNSAware != false) {
                                panic(FAULT);
                            }
                            bappend(ch);
                            eappend(ch);
                            st = 1;
                            break;

                        case ';':
                            if (mESt < 0x100) {
                                // The entity is a built-in entity
                                mBuffIdx = idx - 1;
                                bappend(mESt);
                                st = -1;
                                break;
                            } else if (mPh == PH_DTD) {
                                // In DTD entity declaration has to resolve character
                                // entities and include "as is" others. [#4.4.7]
                                bappend(';');
                                st = -1;
                                break;
                            }
                            // Convert an entity name to a string
                            str = new String(mBuff, idx + 1, mBuffIdx - idx);
                            inp = mEnt.get(str);
                            // Restore the buffer offset
                            mBuffIdx = idx - 1;
                            if (inp != null) {
                                if (inp.chars == null) {
                                    // External entity
                                    InputSource is = resolveEnt(str, inp.pubid, inp.sysid);
                                    if (is != null) {
                                        push(new Input(BUFFSIZE_READER));
                                        setinp(is);
                                        mInp.pubid = inp.pubid;
                                        mInp.sysid = inp.sysid;
                                        str = null;  // the entity is resolved
                                    } else {
                                        // Unresolved external entity
                                        if (flag != 'x') {
                                            panic(FAULT);  // unknown entity within markup
                                        }
                                        // str is name of unresolved entity
                                    }
                                } else {
                                    // Internal entity
                                    push(inp);
                                    str = null;  // the entity is resolved
                                }
                            } else {
                                // Unknown or general unparsed entity
                                if (flag != 'x') {
                                    panic(FAULT);  // unknown entity within markup
                                }
                                // str is name of unresolved entity
                            }
                            st = -1;
                            break;

                        case '#':
                            // '#' is only valid right after the '&'
                            if (st != 0) {
                                panic(FAULT);
                            }
                            st = 2;
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 2:     // read character entity
                    switch (chtyp(ch)) {
                        case 'd':
                            bappend(ch);
                            break;

                        case ';':
                            // Convert the character entity to a character
                            try {
                                int i = Integer.parseInt(
                                        new String(mBuff, idx + 1, mBuffIdx - idx), 10);
                                if (i >= 0xffff) {
                                    panic(FAULT);
                                }
                                ch = (char) i;
                            } catch (NumberFormatException nfe) {
                                panic(FAULT);
                            }
                            // Restore the buffer offset
                            mBuffIdx = idx - 1;
                            if (ch == ' ' || mInp.next != null) {
                                bappend(ch, flag);
                            } else {
                                bappend(ch);
                            }
                            st = -1;
                            break;

                        case 'a':
                            // If the entity buffer is empty and ch == 'x'
                            if ((mBuffIdx == idx) && (ch == 'x')) {
                                st = 3;
                                break;
                            }
                        // fall through: any other letter is an error
                        default:
                            panic(FAULT);
                    }
                    break;

                case 3:     // read hex character entity
                    switch (chtyp(ch)) {
                        case 'A':
                        case 'a':
                        case 'd':
                            bappend(ch);
                            break;

                        case ';':
                            // Convert the character entity to a character
                            try {
                                int i = Integer.parseInt(
                                        new String(mBuff, idx + 1, mBuffIdx - idx), 16);
                                if (i >= 0xffff) {
                                    panic(FAULT);
                                }
                                ch = (char) i;
                            } catch (NumberFormatException nfe) {
                                panic(FAULT);
                            }
                            // Restore the buffer offset
                            mBuffIdx = idx - 1;
                            if (ch == ' ' || mInp.next != null) {
                                bappend(ch, flag);
                            } else {
                                bappend(ch);
                            }
                            st = -1;
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }

        return str;
    }

    /**
     * Resolves a parameter entity.
     *
     * This method resolves a parameter entity reference. It also reports
     * external entities to the application.
     *
     * @param flag The '-' instructs the method not to set up surrounding
     * spaces [#4.4.8].
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private void pent(char flag) throws Exception {
        char ch;
        // Buffer position of the '%' appended below.
        int idx = mBuffIdx + 1;
        Input inp = null;
        String str = null;
        bappend('%');
        if (mPh != PH_DTD) // the DTD internal subset
        {
            return;         // Not Recognized [#4.4.1]
        }
        // Read entity name
        bname(false);
        str = new String(mBuff, idx + 2, mBuffIdx - idx - 1);
        if (getch() != ';') {
            panic(FAULT);
        }
        inp = mPEnt.get(str);
        // Restore the buffer offset
        mBuffIdx = idx - 1;
        if (inp != null) {
            if (inp.chars == null) {
                // External parameter entity
                InputSource is = resolveEnt(str, inp.pubid, inp.sysid);
                if (is != null) {
                    if (flag != '-') {
                        bappend(' ');  // tail space
                    }
                    push(new Input(BUFFSIZE_READER));
                    // BUG: there is no leading space! [#4.4.8]
                    setinp(is);
                    mInp.pubid = inp.pubid;
                    mInp.sysid = inp.sysid;
                } else {
                    // Unresolved external parameter entity
                    skippedEnt("%" + str);
                }
            } else {
                // Internal parameter entity
                if (flag == '-') {
                    // No surrounding spaces
                    inp.chIdx = 1;
                } else {
                    // Insert surrounding spaces
                    bappend(' ');  // tail space
                    inp.chIdx = 0;
                }
                push(inp);
            }
        } else {
            // Unknown parameter entity
            skippedEnt("%" + str);
        }
    }

    /**
     * Recognizes and handles a namespace declaration.
     *
     * This method identifies a type of namespace declaration if any and puts
     * new mapping on top of prefix stack.
     *
     * @param name The attribute qualified name (<code>name.value</code> is a
     * <code>String</code> object which represents the attribute prefix).
     * @param value The attribute value.
     * @return <code>true</code> if a namespace declaration is recognized.
     */
    private boolean isdecl(Pair name, String value) {
        if (name.chars[0] == 0) {
            if ("xmlns".equals(name.name) == true) {
                // New default namespace declaration
                mPref = pair(mPref);
                mPref.list = mElm;  // prefix owner element
                mPref.value = value;
                mPref.name = "";
                mPref.chars = NONS;
                mElm.num++;  // namespace counter
                return true;
            }
        } else {
            if (name.eqpref(XMLNS) == true) {
                // New prefix declaration
                int len = name.name.length();
                mPref = pair(mPref);
                mPref.list = mElm;  // prefix owner element
                mPref.value = value;
                mPref.name = name.name;
                // chars[0] is set to len + 1, followed by the name characters
                mPref.chars = new char[len + 1];
                mPref.chars[0] = (char) (len + 1);
                name.name.getChars(0, len, mPref.chars, 1);
                mElm.num++;  // namespace counter
                return true;
            }
        }
        return false;
    }

    /**
     * Resolves a prefix.
     *
     * @return The namespace assigned to the prefix.
     * @exception Exception When mapping for specified prefix is not found.
*/ private String rslv(char[] qname) throws Exception { for (Pair pref = mPref; pref != null; pref = pref.next) { if (pref.eqpref(qname) == true) { return pref.value; } } if (qname[0] == 1) { // QNames like ':local' for (Pair pref = mPref; pref != null; pref = pref.next) { if (pref.chars[0] == 0) { return pref.value; } } } panic(FAULT); return null; } /** * Skips xml white space characters. * * This method skips white space characters (' ', '\t', '\n', '\r') and * looks ahead not white space character. * * @return The first not white space look ahead character. * @exception IOException */ protected char wsskip() throws IOException { char ch; while (true) { // Read next character ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch(); if (ch < 0x80) { if (nmttyp[ch] != 3) // [ \t\n\r] { break; } } else { break; } } mChIdx--; // bkch(); return ch; } /** * Reports document type. * * @param name The name of the entity. * @param pubid The public identifier of the entity or <code>null. * @param sysid The system identifier of the entity or <code>null. */ protected abstract void docType(String name, String pubid, String sysid) throws SAXException; /** * Reports a comment. * * @param text The comment text starting from first charcater. * @param length The number of characters in comment. */ protected abstract void comm(char[] text, int length); /** * Reports a processing instruction. * * @param target The processing instruction target name. * @param body The processing instruction body text. */ protected abstract void pi(String target, String body) throws Exception; /** * Reports new namespace prefix. The Namespace prefix ( * <code>mPref.name) being declared and the Namespace URI ( * <code>mPref.value) the prefix is mapped to. An empty string is * used for the default element namespace, which has no prefix. */ protected abstract void newPrefix() throws Exception; /** * Reports skipped entity name. * * @param name The entity name. 
*/
    protected abstract void skippedEnt(String name) throws Exception;

    /**
     * Returns an <code>InputSource</code> for specified entity or
     * <code>null</code>.
     *
     * @param name The name of the entity.
     * @param pubid The public identifier of the entity.
     * @param sysid The system identifier of the entity.
     * @return The input source to read the entity from, or <code>null</code>.
     */
    protected abstract InputSource resolveEnt(
            String name, String pubid, String sysid) throws Exception;

    /**
     * Reports notation declaration.
     *
     * @param name The notation's name.
     * @param pubid The notation's public identifier, or null if none was given.
     * @param sysid The notation's system identifier, or null if none was given.
     */
    protected abstract void notDecl(String name, String pubid, String sysid)
            throws Exception;

    /**
     * Reports unparsed entity name.
     *
     * @param name The unparsed entity's name.
     * @param pubid The entity's public identifier, or null if none was given.
     * @param sysid The entity's system identifier.
     * @param notation The name of the associated notation.
     */
    protected abstract void unparsedEntDecl(
            String name, String pubid, String sysid, String notation)
            throws Exception;

    /**
     * Notifies the handler about fatal parsing error.
     *
     * @param msg The problem description message.
     */
    protected abstract void panic(String msg) throws Exception;

    /**
     * Reads a qualified xml name.
     *
     * This is low level routine which leaves a qName in the buffer. The
     * characters of a qualified name is an array of characters. The first
     * (chars[0]) character is the index of the colon character which separates
     * the prefix from the local name. If the index is zero, the name does not
     * contain separator or the parser works in the namespace unaware mode. The
     * length of qualified name is the length of the array minus one.
     *
     * @param ns The true value turns namespace conformance on.
     * @exception Exception is parser specific exception from panic method.
* @exception IOException
     */
    private void bname(boolean ns) throws Exception {
        char ch;
        char type;
        mBuffIdx++;  // allocate a char for colon offset
        int bqname = mBuffIdx;      // buffer slot that receives the colon offset
        int bcolon = bqname;        // stays equal to bqname until a colon is seen
        int bchidx = bqname + 1;    // buffer index the next name char would occupy
        int bstart = bchidx;        // buffer index the pending run is copied to
        int cstart = mChIdx;        // input index where the pending run starts
        short st = (short) ((ns == true) ? 0 : 2);
        while (true) {
            // Read next character
            if (mChIdx >= mChLen) {
                // The input buffer is exhausted: save the characters accepted
                // so far before getch() refills it.
                bcopy(cstart, bstart);
                getch();
                mChIdx--;  // bkch();
                cstart = mChIdx;
                bstart = bchidx;
            }
            ch = mChars[mChIdx++];
            type = (char) 0;  // [X]
            if (ch < 0x80) {
                type = (char) nmttyp[ch];
            } else if (ch == EOS) {
                panic(FAULT);
            }
            // Parse QName
            switch (st) {
                case 0:     // read the first char of the prefix
                case 2:     // read the first char of the suffix
                    switch (type) {
                        case 0:  // [aA_X]
                            bchidx++;  // append char to the buffer
                            st++;      // (st == 0)? 1: 3;
                            break;

                        case 1:  // [:]
                            mChIdx--;  // bkch();
                            st++;      // (st == 0)? 1: 3;
                            break;

                        default:
                            panic(FAULT);
                    }
                    break;

                case 1:     // read the prefix
                case 3:     // read the suffix
                    switch (type) {
                        case 0:  // [aA_X]
                        case 2:  // [.-d]
                            bchidx++;  // append char to the buffer
                            break;

                        case 1:  // [:]
                            bchidx++;  // append char to the buffer
                            if (ns == true) {
                                if (bcolon != bqname) {
                                    panic(FAULT);  // it must be only one colon
                                }
                                bcolon = bchidx - 1;
                                if (st == 1) {
                                    st = 2;
                                }
                            }
                            break;

                        default:
                            // End of the name: put the character back, copy the
                            // pending run and store the colon offset.
                            mChIdx--;  // bkch();
                            bcopy(cstart, bstart);
                            mBuff[bqname] = (char) (bcolon - bqname);
                            return;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
    }

    /**
     * Reads a nmtoken.
     *
     * This is low level routine which leaves a nmtoken in the buffer.
     *
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private void bntok() throws Exception {
        char ch;
        mBuffIdx = -1;
        bappend((char) 0);  // default offset to the colon char
        while (true) {
            ch = getch();
            switch (chtyp(ch)) {
                case 'a':
                case 'A':
                case 'd':
                case '.':
                case ':':
                case '-':
                case '_':
                case 'X':
                    bappend(ch);
                    break;

                case 'Z':
                    // end of stream inside a token
                    panic(FAULT);

                default:
                    // first character which is not part of the token
                    bkch();
                    return;
            }
        }
    }

    /**
     * Recognizes a keyword.
* * This is low level routine which recognizes one of keywords in the buffer. * Keyword Id ID - i IDREF - r IDREFS - R ENTITY - n ENTITIES - N NMTOKEN - * t NMTOKENS - T ELEMENT - e ATTLIST - a NOTATION - o CDATA - c REQUIRED - * Q IMPLIED - I FIXED - F * * @return an id of a keyword or '?'. * @exception Exception is parser specific exception form panic method. * @exception IOException */ private char bkeyword() throws Exception { String str = new String(mBuff, 1, mBuffIdx); switch (str.length()) { case 2: // ID return ("ID".equals(str) == true) ? 'i' : '?'; case 5: // IDREF, CDATA, FIXED switch (mBuff[1]) { case 'I': return ("IDREF".equals(str) == true) ? 'r' : '?'; case 'C': return ("CDATA".equals(str) == true) ? 'c' : '?'; case 'F': return ("FIXED".equals(str) == true) ? 'F' : '?'; default: break; } break; case 6: // IDREFS, ENTITY switch (mBuff[1]) { case 'I': return ("IDREFS".equals(str) == true) ? 'R' : '?'; case 'E': return ("ENTITY".equals(str) == true) ? 'n' : '?'; default: break; } break; case 7: // NMTOKEN, IMPLIED, ATTLIST, ELEMENT switch (mBuff[1]) { case 'I': return ("IMPLIED".equals(str) == true) ? 'I' : '?'; case 'N': return ("NMTOKEN".equals(str) == true) ? 't' : '?'; case 'A': return ("ATTLIST".equals(str) == true) ? 'a' : '?'; case 'E': return ("ELEMENT".equals(str) == true) ? 'e' : '?'; default: break; } break; case 8: // ENTITIES, NMTOKENS, NOTATION, REQUIRED switch (mBuff[2]) { case 'N': return ("ENTITIES".equals(str) == true) ? 'N' : '?'; case 'M': return ("NMTOKENS".equals(str) == true) ? 'T' : '?'; case 'O': return ("NOTATION".equals(str) == true) ? 'o' : '?'; case 'E': return ("REQUIRED".equals(str) == true) ? 'Q' : '?'; default: break; } break; default: break; } return '?'; } /** * Reads a single or double quotted string in to the buffer. * * This method resolves entities inside a string unless the parser parses * DTD. 
*
     * @param flag 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization; '-' -
     * not an attribute value; 'd' - in DTD context.
     * @exception Exception is parser specific exception from panic method.
     * @exception IOException
     */
    @SuppressWarnings("fallthrough")
    private void bqstr(char flag) throws Exception {
        Input inp = mInp;  // remember the original input
        mBuffIdx = -1;
        bappend((char) 0);  // default offset to the colon char
        char ch;
        for (short st = 0; st >= 0;) {
            ch = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
            switch (st) {
                case 0:     // read a single or double quote
                    switch (ch) {
                        case ' ':
                        case '\n':
                        case '\r':
                        case '\t':
                            // skip white space before the opening quote
                            break;

                        case '\'':
                            st = 2;  // read a single quoted string
                            break;

                        case '\"':
                            st = 3;  // read a double quoted string
                            break;

                        default:
                            panic(FAULT);
                            break;
                    }
                    break;

                case 2:     // read a single quoted string
                case 3:     // read a double quoted string
                    switch (ch) {
                        case '\'':
                            // The quote closes the string only when it matches the
                            // opening quote and the current input is the one the
                            // string started in.
                            if ((st == 2) && (mInp == inp)) {
                                st = -1;
                            } else {
                                bappend(ch);
                            }
                            break;

                        case '\"':
                            if ((st == 3) && (mInp == inp)) {
                                st = -1;
                            } else {
                                bappend(ch);
                            }
                            break;

                        case '&':
                            // general entity references are not resolved in 'd' mode
                            if (flag != 'd') {
                                ent(flag);
                            } else {
                                bappend(ch);
                            }
                            break;

                        case '%':
                            // parameter entity references are resolved only in 'd' mode
                            if (flag == 'd') {
                                pent('-');
                            } else {
                                bappend(ch);
                            }
                            break;

                        case '<':
                            if ((flag == '-') || (flag == 'd')) {
                                bappend(ch);
                            } else {
                                panic(FAULT);
                            }
                            break;

                        case EOS:  // EOS before single/double quote
                            panic(FAULT);

                        case '\r':  // EOL processing [#2.11 & #3.3.3]
                            if (flag != ' ' && mInp.next == null) {
                                // "\r\n" and a lone "\r" both become "\n"
                                if (getch() != '\n') {
                                    bkch();
                                }
                                ch = '\n';
                            }
                        // fall through: append with normalization

                        default:
                            bappend(ch, flag);
                            break;
                    }
                    break;

                default:
                    panic(FAULT);
            }
        }
        // There is maximum one space at the end of the string in
        // i-mode (non CDATA normalization) and it has to be removed.
        if ((flag == 'i') && (mBuff[mBuffIdx] == ' ')) {
            mBuffIdx -= 1;
        }
    }

    /**
     * Reports characters and empties the parser's buffer. This method is called
     * only if parser is going to return control to the main loop.
This means * that this method may use parser buffer to report white space without * copeing characters to temporary buffer. */ protected abstract void bflash() throws Exception; /** * Reports white space characters and empties the parser's buffer. This * method is called only if parser is going to return control to the main * loop. This means that this method may use parser buffer to report white * space without copeing characters to temporary buffer. */ protected abstract void bflash_ws() throws Exception; /** * Appends a character to parser's buffer with normalization. * * @param ch The character to append to the buffer. * @param mode The normalization mode. */ private void bappend(char ch, char mode) { // This implements attribute value normalization as // described in the XML specification [#3.3.3]. switch (mode) { case 'i': // non CDATA normalization switch (ch) { case ' ': case '\n': case '\r': case '\t': if ((mBuffIdx > 0) && (mBuff[mBuffIdx] != ' ')) { bappend(' '); } return; default: break; } break; case 'c': // CDATA normalization switch (ch) { case '\n': case '\r': case '\t': ch = ' '; break; default: break; } break; default: // no normalization break; } mBuffIdx++; if (mBuffIdx < mBuff.length) { mBuff[mBuffIdx] = ch; } else { mBuffIdx--; bappend(ch); } } /** * Appends a character to parser's buffer. * * @param ch The character to append to the buffer. */ private void bappend(char ch) { try { mBuff[++mBuffIdx] = ch; } catch (Exception exp) { // Double the buffer size char buff[] = new char[mBuff.length << 1]; System.arraycopy(mBuff, 0, buff, 0, mBuff.length); mBuff = buff; mBuff[mBuffIdx] = ch; } } /** * Appends (mChIdx - cidx) characters from character buffer (mChars) to * parser's buffer (mBuff). * * @param cidx The character buffer (mChars) start index. * @param bidx The parser buffer (mBuff) start index. 
*/ private void bcopy(int cidx, int bidx) { int length = mChIdx - cidx; if ((bidx + length + 1) >= mBuff.length) { // Expand the buffer char buff[] = new char[mBuff.length + length]; System.arraycopy(mBuff, 0, buff, 0, mBuff.length); mBuff = buff; } System.arraycopy(mChars, cidx, mBuff, bidx, length); mBuffIdx += length; } /** * Recognizes the built-in entities <i>lt, gt, amp, * <i>apos, quot. The initial state is 0x100. Any state belowe * 0x100 is a built-in entity replacement character. * * @param ch the next character of an entity name. */ @SuppressWarnings("fallthrough") private void eappend(char ch) { switch (mESt) { case 0x100: // "l" or "g" or "a" or "q" switch (ch) { case 'l': mESt = 0x101; break; case 'g': mESt = 0x102; break; case 'a': mESt = 0x103; break; case 'q': mESt = 0x107; break; default: mESt = 0x200; break; } break; case 0x101: // "lt" mESt = (ch == 't') ? '<' : (char) 0x200; break; case 0x102: // "gt" mESt = (ch == 't') ? '>' : (char) 0x200; break; case 0x103: // "am" or "ap" switch (ch) { case 'm': mESt = 0x104; break; case 'p': mESt = 0x105; break; default: mESt = 0x200; break; } break; case 0x104: // "amp" mESt = (ch == 'p') ? '&' : (char) 0x200; break; case 0x105: // "apo" mESt = (ch == 'o') ? (char) 0x106 : (char) 0x200; break; case 0x106: // "apos" mESt = (ch == 's') ? '\'' : (char) 0x200; break; case 0x107: // "qu" mESt = (ch == 'u') ? (char) 0x108 : (char) 0x200; break; case 0x108: // "quo" mESt = (ch == 'o') ? (char) 0x109 : (char) 0x200; break; case 0x109: // "quot" mESt = (ch == 't') ? '\"' : (char) 0x200; break; case '<': // "lt" case '>': // "gt" case '&': // "amp" case '\'': // "apos" case '\"': // "quot" mESt = 0x200; default: break; } } /** * Sets up a new input source on the top of the input stack. Note, the first * byte returned by the entity's byte stream has to be the first byte in the * entity. However, the parser does not expect the byte order mask in both * cases when encoding is provided by the input source. 
* * @param is A new input source to set up. * @exception IOException If any IO errors occur. * @exception Exception is parser specific exception form panic method. */ protected void setinp(InputSource is) throws Exception { Reader reader = null; mChIdx = 0; mChLen = 0; mChars = mInp.chars; mInp.src = null; if (mPh < PH_DOC_START) { mIsSAlone = false; // default [#2.9] } mIsSAloneSet = false; if (is.getCharacterStream() != null) { // Ignore encoding in the xml text decl. reader = is.getCharacterStream(); xml(reader); } else if (is.getByteStream() != null) { String expenc; if (is.getEncoding() != null) { // Ignore encoding in the xml text decl. expenc = is.getEncoding().toUpperCase(); if (expenc.equals("UTF-16")) { reader = bom(is.getByteStream(), 'U'); // UTF-16 [#4.3.3] } else { reader = enc(expenc, is.getByteStream()); } xml(reader); } else { // Get encoding from BOM or the xml text decl. reader = bom(is.getByteStream(), ' '); if (reader == null) { // Encoding is defined by the xml text decl. reader = enc("UTF-8", is.getByteStream()); expenc = xml(reader); if (expenc.startsWith("UTF-16")) { panic(FAULT); // UTF-16 must have BOM [#4.3.3] } reader = enc(expenc, is.getByteStream()); } else { // Encoding is defined by the BOM. xml(reader); } } } else { // There is no support for public/system identifiers. panic(FAULT); } mInp.src = reader; mInp.pubid = is.getPublicId(); mInp.sysid = is.getSystemId(); } /** * Determines the entity encoding. * * This method gets encoding from Byte Order Mask [#4.3.3] if any. Note, the * first byte returned by the entity's byte stream has to be the first byte * in the entity. Also, there is no support for UCS-4. * * @param is A byte stream of the entity. * @param hint An encoding hint, character U means UTF-16. * @return a reader constructed from the BOM or UTF-8 by default. * @exception Exception is parser specific exception form panic method. 
* @exception IOException */ private Reader bom(InputStream is, char hint) throws Exception { int val = is.read(); switch (val) { case 0xef: // UTF-8 if (hint == 'U') // must be UTF-16 { panic(FAULT); } if (is.read() != 0xbb) { panic(FAULT); } if (is.read() != 0xbf) { panic(FAULT); } return new ReaderUTF8(is); case 0xfe: // UTF-16, big-endian if (is.read() != 0xff) { panic(FAULT); } return new ReaderUTF16(is, 'b'); case 0xff: // UTF-16, little-endian if (is.read() != 0xfe) { panic(FAULT); } return new ReaderUTF16(is, 'l'); case -1: mChars[mChIdx++] = EOS; return new ReaderUTF8(is); default: if (hint == 'U') // must be UTF-16 { panic(FAULT); } // Read the rest of UTF-8 character switch (val & 0xf0) { case 0xc0: case 0xd0: mChars[mChIdx++] = (char) (((val & 0x1f) << 6) | (is.read() & 0x3f)); break; case 0xe0: mChars[mChIdx++] = (char) (((val & 0x0f) << 12) | ((is.read() & 0x3f) << 6) | (is.read() & 0x3f)); break; case 0xf0: // UCS-4 character throw new UnsupportedEncodingException(); default: mChars[mChIdx++] = (char) val; break; } return null; } } /** * Parses the xml text declaration. * * This method gets encoding from the xml text declaration [#4.3.1] if any. * The method assumes the buffer (mChars) is big enough to accommodate whole * xml text declaration. * * @param reader is entity reader. * @return The xml text declaration encoding or default UTF-8 encoding. * @exception Exception is parser specific exception form panic method. * @exception IOException */ private String xml(Reader reader) throws Exception { String str = null; String enc = "UTF-8"; char ch; int val; short st; // Read the xml text declaration into the buffer if (mChIdx != 0) { // The bom method have read ONE char into the buffer. st = (short) ((mChars[0] == '<') ? 1 : -1); } else { st = 0; } while (st >= 0 && mChIdx < mChars.length) { ch = ((val = reader.read()) >= 0) ? 
(char) val : EOS; mChars[mChIdx++] = ch; switch (st) { case 0: // read '<' of xml declaration switch (ch) { case '<': st = 1; break; case 0xfeff: // the byte order mask ch = ((val = reader.read()) >= 0) ? (char) val : EOS; mChars[mChIdx - 1] = ch; st = (short) ((ch == '<') ? 1 : -1); break; default: st = -1; break; } break; case 1: // read '?' of xml declaration [#4.3.1] st = (short) ((ch == '?') ? 2 : -1); break; case 2: // read 'x' of xml declaration [#4.3.1] st = (short) ((ch == 'x') ? 3 : -1); break; case 3: // read 'm' of xml declaration [#4.3.1] st = (short) ((ch == 'm') ? 4 : -1); break; case 4: // read 'l' of xml declaration [#4.3.1] st = (short) ((ch == 'l') ? 5 : -1); break; case 5: // read white space after 'xml' switch (ch) { case ' ': case '\t': case '\r': case '\n': st = 6; break; default: st = -1; break; } break; case 6: // read content of xml declaration switch (ch) { case '?': st = 7; break; case EOS: st = -2; break; default: break; } break; case 7: // read '>' after '?' of xml declaration switch (ch) { case '>': case EOS: st = -2; break; default: st = 6; break; } break; default: panic(FAULT); break; } } mChLen = mChIdx; mChIdx = 0; // If there is no xml text declaration, the encoding is default. 
if (st == -1) { return enc; } mChIdx = 5; // the first white space after "<?xml" // Parse the xml text declaration for (st = 0; st >= 0;) { ch = getch(); switch (st) { case 0: // skip spaces after the xml declaration name if (chtyp(ch) != ' ') { bkch(); st = 1; } break; case 1: // read xml declaration version case 2: // read xml declaration encoding or standalone case 3: // read xml declaration standalone switch (chtyp(ch)) { case 'a': case 'A': case '_': bkch(); str = name(false).toLowerCase(); if ("version".equals(str) == true) { if (st != 1) { panic(FAULT); } if ("1.0".equals(eqstr('=')) != true) { panic(FAULT); } mInp.xmlver = 0x0100; st = 2; } else if ("encoding".equals(str) == true) { if (st != 2) { panic(FAULT); } mInp.xmlenc = eqstr('=').toUpperCase(); enc = mInp.xmlenc; st = 3; } else if ("standalone".equals(str) == true) { if ((st == 1) || (mPh >= PH_DOC_START)) // [#4.3.1] { panic(FAULT); } str = eqstr('=').toLowerCase(); // Check the 'standalone' value and use it [#5.1] if (str.equals("yes") == true) { mIsSAlone = true; } else if (str.equals("no") == true) { mIsSAlone = false; } else { panic(FAULT); } mIsSAloneSet = true; st = 4; } else { panic(FAULT); } break; case ' ': break; case '?': if (st == 1) { panic(FAULT); } bkch(); st = 4; break; default: panic(FAULT); } break; case 4: // end of xml declaration switch (chtyp(ch)) { case '?': if (getch() != '>') { panic(FAULT); } if (mPh <= PH_DOC_START) { mPh = PH_MISC_DTD; // misc before DTD } st = -1; break; case ' ': break; default: panic(FAULT); } break; default: panic(FAULT); } } return enc; } /** * Sets up the document reader. * * @param name an encoding name. * @param is the document byte input stream. * @return a reader constructed from encoding name and input stream. * @exception UnsupportedEncodingException */ private Reader enc(String name, InputStream is) throws UnsupportedEncodingException { // DO NOT CLOSE current reader if any! 
if (name.equals("UTF-8")) { return new ReaderUTF8(is); } else if (name.equals("UTF-16LE")) { return new ReaderUTF16(is, 'l'); } else if (name.equals("UTF-16BE")) { return new ReaderUTF16(is, 'b'); } else { return new InputStreamReader(is, name); } } /** * Sets up current input on the top of the input stack. * * @param inp A new input to set up. */ protected void push(Input inp) { mInp.chLen = mChLen; mInp.chIdx = mChIdx; inp.next = mInp; mInp = inp; mChars = inp.chars; mChLen = inp.chLen; mChIdx = inp.chIdx; } /** * Restores previous input on the top of the input stack. */ protected void pop() { if (mInp.src != null) { try { mInp.src.close(); } catch (IOException ioe) { } mInp.src = null; } mInp = mInp.next; if (mInp != null) { mChars = mInp.chars; mChLen = mInp.chLen; mChIdx = mInp.chIdx; } else { mChars = null; mChLen = 0; mChIdx = 0; } } /** * Maps a character to it's type. * * Possible character type values are:<br /> - ' ' for any kind of white * space character;<br /> - 'a' for any lower case alphabetical character * value;<br /> - 'A' for any upper case alphabetical character value;
* - 'd' for any decimal digit character value;<br /> - 'z' for any
     * character less than ' ' except '\t', '\n', '\r';<br /> - 'X' for any non
     * ASCII character;<br /> - 'Z' for EOS character.<br />
An ASCII (7 bit) * character which does not fall in any category listed above is mapped to * it self. * * @param ch The character to map. * @return The type of character. */ protected char chtyp(char ch) { if (ch < 0x80) { return (char) asctyp[ch]; } return (ch != EOS) ? 'X' : 'Z'; } /** * Retrives the next character in the document. * * @return The next character in the document. */ protected char getch() throws IOException { if (mChIdx >= mChLen) { if (mInp.src == null) { pop(); // remove internal entity return getch(); } // Read new portion of the document characters int Num = mInp.src.read(mChars, 0, mChars.length); if (Num < 0) { if (mInp != mDoc) { pop(); // restore the previous input return getch(); } else { mChars[0] = EOS; mChLen = 1; } } else { mChLen = Num; } mChIdx = 0; } return mChars[mChIdx++]; } /** * Puts back the last read character. * * This method <strong>MUST NOT be called more then once after each * call of {@link #getch getch} method. */ protected void bkch() throws Exception { if (mChIdx <= 0) { panic(FAULT); } mChIdx--; } /** * Sets the current character. * * @param ch The character to set. */ protected void setch(char ch) { mChars[mChIdx] = ch; } /** * Finds a pair in the pair chain by a qualified name. * * @param chain The first element of the chain of pairs. * @param qname The qualified name. * @return A pair with the specified qualified name or null. */ protected Pair find(Pair chain, char[] qname) { for (Pair pair = chain; pair != null; pair = pair.next) { if (pair.eqname(qname) == true) { return pair; } } return null; } /** * Provedes an instance of a pair. * * @param next The reference to a next pair. * @return An instance of a pair. */ protected Pair pair(Pair next) { Pair pair; if (mDltd != null) { pair = mDltd; mDltd = pair.next; } else { pair = new Pair(); } pair.next = next; return pair; } /** * Deletes an instance of a pair. * * @param pair The pair to delete. * @return A reference to the next pair in a chain. 
*/ protected Pair del(Pair pair) { Pair next = pair.next; pair.name = null; pair.value = null; pair.chars = null; pair.list = null; pair.next = mDltd; mDltd = pair; return next; } }

Other Java examples (source code examples)

Here is a short list of links related to this Java Parser.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.