alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Java example source code file (Parser.java)

This example Java source code file (Parser.java) is included in the alvinalexander.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Learn more about this Java project at its project page.

Java - Java tags/keywords

attributelist, cdata, changedcharsetexception, ioexception, net, network, notation, script_end_tag, simpleattributeset, string, tagelement, tagstack, text, util, vector

The Parser.java Java example source code

/*
 * Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package javax.swing.text.html.parser;

import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.ChangedCharSetException;
import java.io.*;
import java.util.Hashtable;
import java.util.Properties;
import java.util.Vector;
import java.util.Enumeration;
import java.net.URL;

import sun.misc.MessageUtils;

/**
 * A simple DTD-driven HTML parser. The parser reads an
 * HTML file from an InputStream and calls various methods
 * (which should be overridden in a subclass) when tags and
 * data are encountered.
 * <p>
 * Unfortunately there are many badly implemented HTML parsers
 * out there, and as a result there are many badly formatted
 * HTML files. This parser attempts to parse most HTML files.
 * This means that the implementation sometimes deviates from
 * the SGML specification in favor of HTML.
 * <p>
 * The parser treats \r and \r\n as \n. Newlines after starttags
 * and before end tags are ignored just as specified in the SGML/HTML
 * specification.
 * <p>
 * The html spec does not specify how spaces are to be coalesced very well.
 * Specifically, the following scenarios are not discussed (note that a
 * space should be used here, but I am using &amp;nbsp; to force the space to
 * be displayed):
 * <p>
 * '&lt;b&gt;blah&nbsp;&lt;i&gt;&nbsp;&lt;strike&gt;&nbsp;foo' which can be treated as:
 * '&lt;b&gt;blah&nbsp;&lt;i&gt;&lt;strike&gt;foo'
 * <p>as well as:
 * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
 * which appears to be treated as:
 * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
 * <p>
 * If <code>strict</code> is false, when a tag that breaks flow,
 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
 * encountered, all whitespace will be ignored until a non whitespace
 * character is encountered. This appears to give behavior closer to
 * the popular browsers.
 *
 * @see DTD
 * @see TagElement
 * @see SimpleAttributeSet
 * @author Arthur van Hoff
 * @author Sunita Mani
 */
public
class Parser implements DTDConstants {

    // Buffer of pending character data and the number of chars used in it;
    // handleText(TagElement) flushes the buffer and resets textpos to 0.
    private char text[] = new char[1024];
    private int textpos = 0;
    // Tag most recently passed to handleText(TagElement) or startTag.
    private TagElement last;
    // True while a single space is pending output ahead of the next text.
    private boolean space;

    // Secondary character buffer and its fill position.
    // NOTE(review): presumably backs strIndexOf/getChars/resetStrBuffer,
    // which are defined elsewhere in this file -- confirm.
    private char str[] = new char[128];
    private int strpos = 0;

    // DTD supplied to the constructor.
    protected DTD dtd = null;

    // Presumably the current input character; handleEOFInComment forces
    // it to '>' when it restarts parsing after an unterminated comment.
    private int ch;
    // Line number returned by getCurrentLine() and passed to handleError.
    private int ln;
    // Reader the input is taken from; handleEOFInComment may replace it.
    private Reader in;

    // Element of the tag on top of the stack, or null when the stack is empty.
    private Element recent;
    // Stack of currently open tags; startTag pushes, endTag pops.
    private TagStack stack;
    // NOTE(review): skipTag and lastFormSent are not used in the visible
    // part of this file -- confirm their purpose against the rest of it.
    private boolean skipTag = false;
    private TagElement lastFormSent = null;
    // Attribute set exposed by getAttributes() and emptied by flushAttributes().
    private SimpleAttributeSet attributes = new SimpleAttributeSet();

    // State for <html>, <head> and <body>.  Since people like to slap
    // together HTML documents without thinking, occasionally they
    // have multiple instances of these tags.  These booleans track
    // the first sightings of these tags so they can be safely ignored
    // by the parser if repeated.
    private boolean seenHtml = false;
    private boolean seenHead = false;
    private boolean seenBody = false;

    /**
     * The html spec does not specify how spaces are coalesced very well.
     * If <code>strict</code> is false, ignoreSpace is used to try and mimic
     * the behavior of the popular browsers.
     * <p>
     * The problematic scenarios are:
     * '&lt;b&gt;blah&nbsp;&lt;i&gt;&nbsp;&lt;strike&gt;&nbsp;foo' which can be treated as:
     * '&lt;b&gt;blah&nbsp;&lt;i&gt;&lt;strike&gt;foo'
     * as well as:
     * '&lt;p&gt;&lt;a href="xx"&gt;&nbsp;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
     * which appears to be treated as:
     * '&lt;p&gt;&lt;a href="xx"&gt;&lt;em&gt;Using&lt;/em&gt;&lt;/a&gt;&lt;/p&gt;'
     * <p>
     * When a tag that breaks flow, or trailing whitespace, is encountered
     * ignoreSpace is set to true. From then on, all whitespace will be
     * ignored.
     * ignoreSpace will be set back to false the first time a
     * non whitespace character is encountered. This appears to give
     * behavior closer to the popular browsers.
     */
    private boolean ignoreSpace;

    /**
     * This flag determines whether or not the Parser will be strict
     * in enforcing SGML compatibility.  If false, it will be lenient
     * with certain common classes of erroneous HTML constructs.
     * Strict or not, in either case an error will be recorded.
     * The initial value is false (lenient).
     */
    protected boolean strict = false;


    /** Number of \r\n's encountered. */
    private int crlfCount;
    /** Number of \r's encountered. A \r\n will not increment this. */
    private int crCount;
    /** Number of \n's encountered. A \r\n will not increment this. */
    private int lfCount;

    //
    // To correctly identify the start of a tag/comment/text we need two
    // instance variables. Two are needed as handleText isn't invoked until
    // the tag after the text has been parsed, that is the parser parses the
    // text, then a tag, then invokes handleText followed by handleStart.
    //
    /** The start position of the current block. Block is overloaded here,
     * it really means the current start position for the current comment,
     * tag, text. The value is copied into lastBlockStartPos by
     * handleText(TagElement) and startTag. */
    private int currentBlockStartPos;
    /** Start position of the last block. getBlockStartPosition() reports
     * this value minus one, never less than zero. */
    private int lastBlockStartPos;

    /**
     * Array for mapping numeric references in the range 130-159 to
     * displayable Unicode characters. Entry <code>i</code> holds the
     * replacement for numeric reference <code>130 + i</code>; the table
     * has 30 entries, one per value in the range.
     * <p>
     * Entries whose value equals their own reference number (141-144,
     * 157 and 158) are left unchanged by the mapping.
     * NOTE(review): Windows-1252 assigns 142 and 158 to U+017D and U+017E,
     * but this table maps them to themselves -- confirm this is intended.
     */
    private static final char[] cp1252Map = {
        8218,  // &#130; -> U+201A
        402,   // &#131; -> U+0192
        8222,  // &#132; -> U+201E
        8230,  // &#133; -> U+2026
        8224,  // &#134; -> U+2020
        8225,  // &#135; -> U+2021
        710,   // &#136; -> U+02C6
        8240,  // &#137; -> U+2030
        352,   // &#138; -> U+0160
        8249,  // &#139; -> U+2039
        338,   // &#140; -> U+0152
        141,   // &#141; (unchanged)
        142,   // &#142; (unchanged)
        143,   // &#143; (unchanged)
        144,   // &#144; (unchanged)
        8216,  // &#145; -> U+2018
        8217,  // &#146; -> U+2019
        8220,  // &#147; -> U+201C
        8221,  // &#148; -> U+201D
        8226,  // &#149; -> U+2022
        8211,  // &#150; -> U+2013
        8212,  // &#151; -> U+2014
        732,   // &#152; -> U+02DC
        8482,  // &#153; -> U+2122
        353,   // &#154; -> U+0161
        8250,  // &#155; -> U+203A
        339,   // &#156; -> U+0153
        157,   // &#157; (unchanged)
        158,   // &#158; (unchanged)
        376    // &#159; -> U+0178
    };

    /**
     * Creates a parser that uses the given DTD.
     *
     * @param dtd the DTD to store in the <code>dtd</code> field; it is
     *            not validated here
     */
    public Parser(DTD dtd) {
        this.dtd = dtd;
    }


    /**
     * Returns the parser's current line counter, the same value that is
     * passed to handleError when an error is reported.
     *
     * @return the line number of the line currently being parsed
     */
    protected int getCurrentLine() {
        return ln;
    }

    /**
     * Reports where the block being delivered to a handleXXX method
     * started. "Block" is used loosely: it stands for whichever comment,
     * tag or run of text is being reported. Intended for subclasses that
     * need that position while inside one of the handleXXX callbacks.
     *
     * @return the recorded start of the last block minus one, or zero if
     *         that would be negative
     */
    int getBlockStartPosition() {
        final int start = lastBlockStartPos - 1;
        return (start < 0) ? 0 : start;
    }

    /**
     * Factory for the TagElement objects handed to the callbacks.
     * Subclasses may override it to supply their own TagElement type.
     *
     * @param elem      the element the tag refers to
     * @param fictional passed through unchanged to the TagElement constructor
     * @return a newly created TagElement
     */
    protected TagElement makeTag(Element elem, boolean fictional) {
        final TagElement created = new TagElement(elem, fictional);
        return created;
    }

    /**
     * Convenience form of {@link #makeTag(Element, boolean)} that always
     * passes <code>false</code> for the fictional flag.
     *
     * @param elem the element the tag refers to
     * @return the tag produced by the two-argument makeTag
     */
    protected TagElement makeTag(Element elem) {
        final boolean fictional = false;
        return makeTag(elem, fictional);
    }

    /**
     * Returns the parser's attribute set. The internal instance itself is
     * returned, not a copy, so a later flushAttributes() call empties the
     * object the caller received.
     *
     * @return the live attribute set held by this parser
     */
    protected SimpleAttributeSet getAttributes() {
        return attributes;
    }

    /**
     * Empties the parser's attribute set by asking it to remove every
     * attribute it currently contains. The set object itself is kept.
     */
    protected void flushAttributes() {
        final SimpleAttributeSet current = attributes;
        current.removeAttributes(current);
    }

    /**
     * Called when PCDATA is encountered. The default implementation does
     * nothing; subclasses override it to receive the text.
     *
     * @param text the characters of the text run
     */
    protected void handleText(char text[]) {
    }

    /**
     * Called when an HTML title tag is encountered. The default
     * implementation forwards the text to {@link #handleText(char[])}.
     *
     * @param text the characters of the title
     */
    protected void handleTitle(char text[]) {
        // default behavior is to call handleText. Subclasses
        // can override if necessary.
        handleText(text);
    }

    /**
     * Called when an HTML comment is encountered. The default
     * implementation does nothing; subclasses override it to receive
     * the comment.
     *
     * @param text the characters of the comment
     */
    protected void handleComment(char text[]) {
    }

    /**
     * Recovers from reaching end of input inside a comment.
     * <p>
     * If the buffered comment text contains a newline, everything before
     * the first newline is reported as a complete comment and everything
     * after it becomes the new input, to be reparsed as normal HTML
     * content. If there is no newline, an "eof.comment" error is reported
     * and nothing else happens.
     */
    protected void handleEOFInComment() {
        final int firstNewline = strIndexOf('\n');

        // Single-line comment: nothing to salvage, just signal the error.
        if (firstNewline < 0) {
            error("eof.comment");
            return;
        }

        // Pretend the comment ended at the end of its first line.
        handleComment(getChars(0, firstNewline));

        try {
            // Swap the exhausted reader for one over the remaining lines.
            in.close();
            in = new CharArrayReader(getChars(firstNewline + 1));
            ch = '>';
        } catch (IOException e) {
            error("ioexception");
        }

        resetStrBuffer();
    }

    /**
     * Called when an empty tag is encountered. The default implementation
     * does nothing; subclasses override it to receive the tag.
     *
     * @param tag the empty tag
     * @throws ChangedCharSetException declared so that overriding
     *         implementations may throw it; this default never does
     */
    protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
    }

    /**
     * Called when a start tag is encountered. The default implementation
     * does nothing; subclasses override it to receive the tag.
     *
     * @param tag the start tag
     */
    protected void handleStartTag(TagElement tag) {
    }

    /**
     * Called when an end tag is encountered. The default implementation
     * does nothing; subclasses override it to receive the tag.
     *
     * @param tag the tag being closed
     */
    protected void handleEndTag(TagElement tag) {
    }

    /**
     * An error has occurred. The default implementation ignores it;
     * subclasses override this method to report or collect errors.
     *
     * @param ln  the line number supplied by the caller
     * @param msg the error message
     */
    protected void handleError(int ln, String msg) {
        /*
        Debugging aid, intentionally left disabled:
        Thread.dumpStack();
        System.out.println("**** " + stack);
        System.out.println("line " + ln + ": error: " + msg);
        System.out.println();
        */
    }

    /**
     * Output text: flushes the buffered characters to the text callbacks
     * before <code>tag</code> is processed and records <code>tag</code>
     * as the last tag seen.
     * <p>
     * The buffered characters (plus one pending space, where applicable)
     * are copied into a fresh array and passed to handleTitle when the
     * tag's element is named "title", otherwise to handleText(char[]).
     *
     * @param tag the tag about to be handled; it is dereferenced
     *            unconditionally, so it must not be null
     */
    void handleText(TagElement tag) {
        // A flow-breaking tag discards any pending space and, when the
        // parser is not strict, switches on whitespace suppression.
        if (tag.breaksFlow()) {
            space = false;
            if (!strict) {
                ignoreSpace = true;
            }
        }
        // Nothing buffered: unless a lone pending space is to be emitted,
        // just record the tag and the block position and leave.
        // The tests short-circuit left to right, so stack.advance(dtd.pcdata)
        // runs only when a space is pending, a stack exists and the last
        // tag does not break flow.
        // NOTE(review): advance() presumably checks (and consumes) PCDATA in
        // the current content model -- confirm against TagStack.
        if (textpos == 0) {
            if ((!space) || (stack == null) || last.breaksFlow() ||
                !stack.advance(dtd.pcdata)) {
                last = tag;
                space = false;
                lastBlockStartPos = currentBlockStartPos;
                return;
            }
        }
        if (space) {
            if (!ignoreSpace) {
                // enlarge buffer if needed
                if (textpos + 1 > text.length) {
                    char newtext[] = new char[text.length + 200];
                    System.arraycopy(text, 0, newtext, 0, text.length);
                    text = newtext;
                }

                // output pending space
                text[textpos++] = ' ';
                // In lenient mode, a space emitted ahead of a non-empty
                // element suppresses the whitespace that follows it.
                if (!strict && !tag.getElement().isEmpty()) {
                    ignoreSpace = true;
                }
            }
            space = false;
        }
        // Hand the callback its own copy so the shared buffer can be reused.
        char newtext[] = new char[textpos];
        System.arraycopy(text, 0, newtext, 0, textpos);
        // Handles cases of bad html where the title tag
        // was getting lost when we did error recovery.
        if (tag.getElement().getName().equals("title")) {
            handleTitle(newtext);
        } else {
            handleText(newtext);
        }
        // Reset the buffer and remember this tag for the next call.
        lastBlockStartPos = currentBlockStartPos;
        textpos = 0;
        last = tag;
        space = false;
    }

    /**
     * Invoke the error handler. The message passed to handleError is the
     * error key followed by the three arguments, each separated by a
     * single space; the current line number is supplied alongside it.
     *
     * @param err  the error key
     * @param arg1 first argument appended to the message
     * @param arg2 second argument appended to the message
     * @param arg3 third argument appended to the message
     */
    protected void error(String err, String arg1, String arg2,
        String arg3) {
        final StringBuilder message = new StringBuilder();
        message.append(err);
        message.append(" ").append(arg1);
        message.append(" ").append(arg2);
        message.append(" ").append(arg3);
        handleError(ln, message.toString());
    }

    /**
     * Reports an error with two arguments; the missing third argument is
     * filled with "?" before delegating to the four-argument form.
     *
     * @param err  the error key
     * @param arg1 first argument appended to the message
     * @param arg2 second argument appended to the message
     */
    protected void error(String err, String arg1, String arg2) {
        final String unused = "?";
        error(err, arg1, arg2, unused);
    }
    /**
     * Reports an error with one argument; the two missing arguments are
     * filled with "?" before delegating to the four-argument form.
     *
     * @param err  the error key
     * @param arg1 the argument appended to the message
     */
    protected void error(String err, String arg1) {
        final String unused = "?";
        error(err, arg1, unused, unused);
    }
    /**
     * Reports an error that has no arguments; all three argument slots
     * are filled with "?" before delegating to the four-argument form.
     *
     * @param err the error key
     */
    protected void error(String err) {
        final String unused = "?";
        error(err, unused, unused, unused);
    }


    /**
     * Processes a start tag.
     * <p>
     * Pending text is flushed first where necessary, each attribute the
     * element declares as REQUIRED is checked against the current
     * attribute set (a "req.att " error is reported for every one that is
     * missing), and the tag is then dispatched: empty elements go to
     * handleEmptyTag, all others are pushed onto the tag stack and passed
     * to handleStartTag.
     *
     * @param tag the start tag to process
     * @throws ChangedCharSetException if handleEmptyTag throws it
     */
    protected void startTag(TagElement tag) throws ChangedCharSetException {
        final Element elem = tag.getElement();

        // Text has to be flushed unless this is an empty element that
        // directly follows a flow-breaking tag (or nothing at all) with
        // no characters buffered.
        final boolean flushFirst = !elem.isEmpty()
                || ((last != null) && !last.breaksFlow())
                || (textpos != 0);
        if (flushFirst) {
            handleText(tag);
        } else {
            // handleText() normally maintains these two fields; it was
            // skipped, so bring them up to date here.
            // Note that we should really check last.breaksFlow before
            // assuming space should be false.
            last = tag;
            space = false;
        }
        lastBlockStartPos = currentBlockStartPos;

        // Walk the element's attribute list looking for required
        // attributes that were not supplied.
        AttributeList decl = elem.atts;
        while (decl != null) {
            if (decl.modifier == REQUIRED) {
                final boolean absent = attributes.isEmpty()
                        || (!attributes.isDefined(decl.name)
                            && !attributes.isDefined(HTML.getAttributeKey(decl.name)));
                if (absent) {
                    // The key is passed on verbatim, trailing space included.
                    error("req.att ", decl.getName(), elem.getName());
                }
            }
            decl = decl.next;
        }

        // Empty elements are reported and never pushed onto the stack.
        if (elem.isEmpty()) {
            handleEmptyTag(tag);
            return;
        }

        recent = elem;
        stack = new TagStack(tag, stack);
        handleStartTag(tag);
    }

    /**
     * Handle an end tag. The end tag is popped
     * from the tag stack.
     * <p>
     * The tag stack is dereferenced unconditionally, so it must not be
     * empty when this method is called.
     *
     * @param omitted true if the end tag was omitted from the input; an
     *                "end.missing" error is reported when the element
     *                does not allow its end tag to be omitted
     */
    protected void endTag(boolean omitted) {
        // Flush any text that belongs to the element being closed.
        handleText(stack.tag);

        if (omitted && !stack.elem.omitEnd()) {
            error("end.missing", stack.elem.getName());
        } else if (!stack.terminate()) {
            error("end.unexpected", stack.elem.getName());
        }

        // handle the tag
        handleEndTag(stack.tag);
        // Pop the stack and re-derive the most recent open element.
        stack = stack.next;
        recent = (stack != null) ? stack.elem : null;
    }


    boolean ignoreElement(Element elem) {

        String stackElement = stack.elem.getName();
        String elemName = elem.getName();
        /* We ignore all elements that are not valid in the context of
           a table except <td>, 
(these we handle in legalElementContext()) and #pcdata. We also ignore the <font> tag in the context of
    and
      We additonally ignore the <meta> and the