|
The Parser.java Java example source code
/*
* Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package javax.swing.text.html.parser;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.ChangedCharSetException;
import java.io.*;
import java.util.Hashtable;
import java.util.Properties;
import java.util.Vector;
import java.util.Enumeration;
import java.net.URL;
import sun.misc.MessageUtils;
/**
* A simple DTD-driven HTML parser. The parser reads an
* HTML file from an InputStream and calls various methods
* (which should be overridden in a subclass) when tags and
* data are encountered.
* <p>
* Unfortunately there are many badly implemented HTML parsers
* out there, and as a result there are many badly formatted
* HTML files. This parser attempts to parse most HTML files.
* This means that the implementation sometimes deviates from
* the SGML specification in favor of HTML.
* <p>
* The parser treats \r and \r\n as \n. Newlines after starttags
* and before end tags are ignored just as specified in the SGML/HTML
* specification.
* <p>
* The html spec does not specify how spaces are to be coalesced very well.
* Specifically, the following scenarios are not discussed (note that a
* space should be used here, but I am using &amp;nbsp; to force the space to
* be displayed):
* <p>
* '<b>blah <i> <strike> foo' which can be treated as:
* '<b>blah <i><strike>foo'
* <p>as well as:
* '<p><a href="xx"> <em>Using</em></a></p>'
* which appears to be treated as:
* '<p><a href="xx"><em>Using</em></a></p>'
* <p>
* If <code>strict</code> is false, when a tag that breaks flow,
* (<code>TagElement.breaksFlows</code>) or trailing whitespace is
* encountered, all whitespace will be ignored until a non whitespace
* character is encountered. This appears to give behavior closer to
* the popular browsers.
*
* @see DTD
* @see TagElement
* @see SimpleAttributeSet
* @author Arthur van Hoff
* @author Sunita Mani
*/
public
class Parser implements DTDConstants {
// Buffer of text accumulated since the last flush; handleText(TagElement)
// copies text[0..textpos) out and resets textpos to 0.
private char text[] = new char[1024];
// Number of valid characters currently held in text.
private int textpos = 0;
// Tag most recently recorded by handleText(TagElement) or startTag.
private TagElement last;
// True while a single pending space is waiting to be appended to text.
private boolean space;
// Scratch character buffer and its fill position.
// NOTE(review): presumably the storage behind strIndexOf/getChars/
// resetStrBuffer, which are defined outside this section -- confirm.
private char str[] = new char[128];
private int strpos = 0;
// DTD supplied to the constructor; dtd.pcdata is consulted when flushing text.
protected DTD dtd = null;
// NOTE(review): looks like the current input character; handleEOFInComment
// forces it to '>' when it restarts parsing -- confirm against the reader loop.
private int ch;
// Line number reported by getCurrentLine() and passed to handleError().
private int ln;
// Character source being parsed; handleEOFInComment may swap it for a
// CharArrayReader over buffered characters.
private Reader in;
// Element of the most recently pushed start tag, or of the new top of the
// tag stack after endTag (null when the stack becomes empty).
private Element recent;
// Linked stack of currently open tags (see startTag / endTag).
private TagStack stack;
// Not referenced in the visible part of the file.
private boolean skipTag = false;
// Not referenced in the visible part of the file.
private TagElement lastFormSent = null;
// Attributes collected for the current tag; exposed via getAttributes() and
// cleared by flushAttributes().
private SimpleAttributeSet attributes = new SimpleAttributeSet();
// State for <html>, <head> and <body>. Since people like to slap
// together HTML documents without thinking, occasionally they
// have multiple instances of these tags. These booleans track
// the first sightings of these tags so they can be safely ignored
// by the parser if repeated.
private boolean seenHtml = false;
private boolean seenHead = false;
private boolean seenBody = false;
/**
* The html spec does not specify how spaces are coalesced very well.
* If strict == false, ignoreSpace is used to try and mimic the behavior
* of the popular browsers.
* <p>
* The problematic scenarios are:
* '<b>blah <i> <strike> foo' which can be treated as:
* '<b>blah <i><strike>foo'
* as well as:
* '<p><a href="xx"> <em>Using</em></a></p>'
* which appears to be treated as:
* '<p><a href="xx"><em>Using</em></a></p>'
* <p>
* When a tag that breaks flow, or trailing whitespace is encountered
* ignoreSpace is set to true. From then on, all whitespace will be
* ignored.
* ignoreSpace will be set back to false the first time a
* non whitespace character is encountered. This appears to give
* behavior closer to the popular browsers.
*/
private boolean ignoreSpace;
/**
* This flag determines whether or not the Parser will be strict
* in enforcing SGML compatibility. If false, it will be lenient
* with certain common classes of erroneous HTML constructs.
* Strict or not, in either case an error will be recorded.
*
*/
protected boolean strict = false;
/** Number of \r\n's encountered. */
private int crlfCount;
/** Number of \r's encountered. A \r\n will not increment this. */
private int crCount;
/** Number of \n's encountered. A \r\n will not increment this. */
private int lfCount;
//
// To correctly identify the start of a tag/comment/text we need two
// ivars. Two are needed as handleText isn't invoked until the tag
// after the text has been parsed, that is the parser parses the text,
// then a tag, then invokes handleText followed by handleStart.
//
/** The start position of the current block. Block is overloaded here,
* it really means the current start position for the current comment,
* tag, text. Use getBlockStartPosition to access this. */
private int currentBlockStartPos;
/** Start position of the last block. */
private int lastBlockStartPos;
/**
 * Array for mapping numeric character references in the range
 * 130-159 to displayable Unicode characters. The table has 30
 * entries, one per reference in that range, listed in ascending
 * order; the trailing comment on each entry names the reference it
 * stands for. Entries for 141-144, 157 and 158 hold their own
 * value, i.e. no substitute character is provided for them.
 * NOTE(review): presumably indexed with (reference - 130) -- confirm
 * at the use site, which is outside this section.
 */
private static final char[] cp1252Map = {
8218, // &#130;
402, // &#131;
8222, // &#132;
8230, // &#133;
8224, // &#134;
8225, // &#135;
710, // &#136;
8240, // &#137;
352, // &#138;
8249, // &#139;
338, // &#140;
141, // &#141;
142, // &#142;
143, // &#143;
144, // &#144;
8216, // &#145;
8217, // &#146;
8220, // &#147;
8221, // &#148;
8226, // &#149;
8211, // &#150;
8212, // &#151;
732, // &#152;
8482, // &#153;
353, // &#154;
8250, // &#155;
339, // &#156;
157, // &#157;
158, // &#158;
376 // &#159;
};
/**
 * Creates a parser that uses the given DTD.
 *
 * @param dtd the DTD stored in the {@code dtd} field; it is not
 *            validated here, so a null value is stored as-is
 */
public Parser(DTD dtd) {
this.dtd = dtd;
}
/**
 * Reports which input line the parser is currently working on.
 *
 * @return the line number of the line currently being parsed
 */
protected int getCurrentLine() {
    return this.ln;
}
/**
 * Returns the start position of the current block. "Block" is
 * overloaded here: it means whatever is currently being reported,
 * be it a comment, a tag or a run of text. This is provided for
 * subclassers that wish to know the start of the current block when
 * called with one of the handleXXX methods.
 * <p>
 * The value is the recorded start of the last block moved back by
 * one position, and is never negative.
 *
 * @return the block start position, clamped to zero
 */
int getBlockStartPosition() {
    final int shifted = lastBlockStartPos - 1;
    return (shifted < 0) ? 0 : shifted;
}
/**
 * Factory hook that builds the TagElement for an element.
 * Subclasses may override it to supply their own TagElement type.
 *
 * @param elem      the element the tag stands for
 * @param fictional forwarded unchanged to the TagElement constructor
 * @return a newly created TagElement
 */
protected TagElement makeTag(Element elem, boolean fictional) {
    final TagElement created = new TagElement(elem, fictional);
    return created;
}
/**
 * Convenience form of {@link #makeTag(Element, boolean)} that
 * passes {@code false} for the fictional flag.
 *
 * @param elem the element the tag stands for
 * @return the TagElement produced by the two-argument factory
 */
protected TagElement makeTag(Element elem) {
    final boolean fictional = false;
    return makeTag(elem, fictional);
}
/**
 * Gives access to the attribute set the parser is currently holding.
 * The live object is returned, not a copy.
 *
 * @return the parser's attribute set
 */
protected SimpleAttributeSet getAttributes() {
    return this.attributes;
}
/**
 * Empties the parser's attribute set by asking it to remove every
 * attribute it currently contains. The same set instance is kept.
 */
protected void flushAttributes() {
    final SimpleAttributeSet current = attributes;
    current.removeAttributes(current);
}
/**
 * Called when PCDATA is encountered. The default implementation
 * does nothing; subclasses override it to receive text.
 *
 * @param text the characters of the text run; handleText(TagElement)
 *             passes a freshly allocated array sized to the text
 */
protected void handleText(char text[]) {
}
/**
 * Called when an HTML title tag is encountered.
 * <p>
 * Unless overridden, title text is treated like any other text and
 * is simply forwarded to {@link #handleText(char[])}.
 *
 * @param text the characters of the title
 */
protected void handleTitle(char text[]) {
    // No title-specific processing by default.
    this.handleText(text);
}
/**
 * Called when an HTML comment is encountered. The default
 * implementation does nothing; subclasses override it to receive
 * comments.
 *
 * @param text the characters of the comment
 */
protected void handleComment(char text[]) {
}
/**
 * Recovery hook for an end of input reached inside a comment.
 * <p>
 * If the buffered comment spans more than one line, only its first
 * line is reported as a comment (as though it had been an
 * unterminated single-line comment) and everything after that first
 * newline is fed back in to be parsed as normal HTML content.
 * If there is no newline at all, an "eof.comment" error is raised.
 */
protected void handleEOFInComment() {
    final int firstNewline = strIndexOf('\n');
    if (firstNewline < 0) {
        // Single-line comment: nothing to re-parse, just report it.
        error("eof.comment");
        return;
    }

    handleComment(getChars(0, firstNewline));
    try {
        in.close();
        // Continue parsing from the characters that followed the
        // first line of the would-be comment.
        in = new CharArrayReader(getChars(firstNewline + 1));
        ch = '>';
    } catch (IOException e) {
        error("ioexception");
    }
    // Must happen after the remaining characters were copied out above.
    resetStrBuffer();
}
/**
 * Called when an empty tag is encountered. startTag invokes this
 * instead of handleStartTag when the tag's element reports
 * isEmpty(). The default implementation does nothing.
 *
 * @param tag the empty tag
 * @throws ChangedCharSetException declared for overriding
 *         implementations; the default implementation never throws
 */
protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
}
/**
 * Called when a start tag is encountered. startTag invokes this
 * for non-empty elements, after the tag has been pushed onto the
 * tag stack. The default implementation does nothing.
 *
 * @param tag the start tag
 */
protected void handleStartTag(TagElement tag) {
}
/**
 * Called when an end tag is encountered. endTag invokes this with
 * the tag on top of the tag stack, just before that entry is
 * popped. The default implementation does nothing.
 *
 * @param tag the tag being closed
 */
protected void handleEndTag(TagElement tag) {
}
/**
 * An error has occurred. All error(...) overloads end up here.
 * The default implementation does nothing; subclasses override it
 * to report or collect errors.
 *
 * @param ln  the line number the parser was on when the error was raised
 * @param msg the error key followed by its three space-separated
 *            arguments ("?" stands for an argument that was not supplied)
 */
protected void handleError(int ln, String msg) {
// Disabled debugging output, kept for reference.
/*
Thread.dumpStack();
System.out.println("**** " + stack);
System.out.println("line " + ln + ": error: " + msg);
System.out.println();
*/
}
/**
 * Output text. Flushes the text accumulated so far (plus a pending
 * space, where one is due) to handleTitle or handleText(char[]),
 * then records {@code tag} as the last tag seen.
 *
 * @param tag the tag that triggers the flush
 */
void handleText(TagElement tag) {
    // A flow-breaking tag swallows the pending space and, in lenient
    // mode, any whitespace that follows it.
    if (tag.breaksFlow()) {
        space = false;
        if (!strict) {
            ignoreSpace = true;
        }
    }

    if (textpos == 0) {
        // Nothing buffered: there is only something to emit when a
        // pending space is allowed here. The checks run in this exact
        // order and stop at the first one that fails.
        boolean loneSpaceAllowed = space
                && (stack != null)
                && !last.breaksFlow()
                && stack.advance(dtd.pcdata);
        if (!loneSpaceAllowed) {
            last = tag;
            space = false;
            lastBlockStartPos = currentBlockStartPos;
            return;
        }
    }

    if (space && !ignoreSpace) {
        // enlarge buffer if needed
        if (textpos + 1 > text.length) {
            char grown[] = new char[text.length + 200];
            System.arraycopy(text, 0, grown, 0, text.length);
            text = grown;
        }

        // output pending space
        text[textpos++] = ' ';
        if (!strict && !tag.getElement().isEmpty()) {
            ignoreSpace = true;
        }
    }
    space = false;

    // Hand out an exactly-sized copy so the internal buffer can be reused.
    char chunk[] = new char[textpos];
    System.arraycopy(text, 0, chunk, 0, textpos);

    // Handles cases of bad html where the title tag
    // was getting lost when we did error recovery.
    boolean isTitle = tag.getElement().getName().equals("title");
    if (isTitle) {
        handleTitle(chunk);
    } else {
        handleText(chunk);
    }

    lastBlockStartPos = currentBlockStartPos;
    textpos = 0;
    last = tag;
    space = false;
}
/**
 * Invoke the error handler. The message handed to
 * {@link #handleError(int, String)} is the error key followed by the
 * three arguments, each separated by a single space.
 *
 * @param err  the error key
 * @param arg1 first argument
 * @param arg2 second argument
 * @param arg3 third argument
 */
protected void error(String err, String arg1, String arg2,
                     String arg3) {
    StringBuilder msg = new StringBuilder();
    msg.append(err).append(" ")
       .append(arg1).append(" ")
       .append(arg2).append(" ")
       .append(arg3);
    handleError(ln, msg.toString());
}
/**
 * Reports an error with two arguments; the missing third argument
 * is filled in with "?".
 *
 * @param err  the error key
 * @param arg1 first argument
 * @param arg2 second argument
 */
protected void error(String err, String arg1, String arg2) {
    final String noArg = "?";
    error(err, arg1, arg2, noArg);
}
/**
 * Reports an error with one argument; the two missing arguments
 * are filled in with "?". Delegates straight to the four-argument
 * form.
 *
 * @param err  the error key
 * @param arg1 the only argument
 */
protected void error(String err, String arg1) {
    final String noArg = "?";
    error(err, arg1, noArg, noArg);
}
/**
 * Reports an error that has no arguments; all three argument slots
 * are filled in with "?". Delegates straight to the four-argument
 * form.
 *
 * @param err the error key
 */
protected void error(String err) {
    final String noArg = "?";
    error(err, noArg, noArg, noArg);
}
/**
 * Handle a start tag. Pending text is flushed first where needed,
 * the attribute list is checked for required attributes, and then
 * the tag is either reported as an empty tag or pushed onto the tag
 * stack and reported as a start tag.
 *
 * @param tag the tag being opened
 * @throws ChangedCharSetException if thrown by handleEmptyTag
 */
protected void startTag(TagElement tag) throws ChangedCharSetException {
    Element elem = tag.getElement();

    // An empty tag that follows a flow-breaking tag with no buffered
    // text has nothing to flush. In every other case (notably an
    // empty tag with textpos != 0) the text in front of the tag has
    // to be processed before the tag itself.
    boolean mustFlushText = !elem.isEmpty()
            || ((last != null) && !last.breaksFlow())
            || (textpos != 0);
    if (mustFlushText) {
        handleText(tag);
    } else {
        // handleText() normally updates these; since it is skipped
        // here they are updated by hand.
        last = tag;
        // Note that we should really check last.breakFlows before
        // assuming this should be false.
        space = false;
    }
    lastBlockStartPos = currentBlockStartPos;

    // check required attributes
    AttributeList att = elem.atts;
    while (att != null) {
        if (att.modifier == REQUIRED) {
            boolean missing = attributes.isEmpty()
                    || (!attributes.isDefined(att.name)
                        && !attributes.isDefined(HTML.getAttributeKey(att.name)));
            if (missing) {
                error("req.att ", att.getName(), elem.getName());
            }
        }
        att = att.next;
    }

    if (!elem.isEmpty()) {
        recent = elem;
        stack = new TagStack(tag, stack);
        handleStartTag(tag);
    } else {
        handleEmptyTag(tag);
    }
}
/**
 * Handle an end tag. Pending text is flushed, the end tag is
 * reported, and the tag is popped from the tag stack.
 *
 * @param omitted true if the end tag did not appear in the input;
 *                an "end.missing" error is raised when the element
 *                does not permit its end tag to be omitted
 */
protected void endTag(boolean omitted) {
    // Flush whatever text belongs to the element being closed.
    handleText(stack.tag);

    if (omitted && !stack.elem.omitEnd()) {
        error("end.missing", stack.elem.getName());
    } else if (!stack.terminate()) {
        error("end.unexpected", stack.elem.getName());
    }

    // handle the tag
    handleEndTag(stack.tag);

    // Pop, and track the element now on top (if any).
    stack = stack.next;
    if (stack == null) {
        recent = null;
    } else {
        recent = stack.elem;
    }
}
boolean ignoreElement(Element elem) {
String stackElement = stack.elem.getName();
String elemName = elem.getName();
/* We ignore all elements that are not valid in the context of
a table except <td>, (these we handle in
legalElementContext()) and #pcdata. We also ignore the
<font> tag in the context of and We additonally
ignore the <meta> and the | |