alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

What this is

This file is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Other links

The source code

// $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/Parser.java,v 1.2 2004/02/10 13:41:10 woolfel Exp $
/*
 * ====================================================================
 * Copyright 2002-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */

// The developers of JMeter and Apache are grateful to the developers
// of HTMLParser for giving Apache Software Foundation a non-exclusive
// license. The performance benefits of HTMLParser are clear and the
// users of JMeter will benefit from the hard work of the HTMLParser
// team. For detailed information about HTMLParser, the project is
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
//
// HTMLParser was originally created by Somik Raha in 2000. Since then
// a healthy community of users has formed and helped refine the
// design so that it is able to tackle the difficult task of parsing
// dirty HTML. Derrick Oswald is the current lead developer and was kind
// enough to assist JMeter.

package org.htmlparser;
//////////////////
// Java Imports //
//////////////////
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Map;

import org.htmlparser.parserHelper.ParserHelper;
import org.htmlparser.parserHelper.TagParser;
import org.htmlparser.scanners.AppletScanner;
import org.htmlparser.scanners.BodyScanner;
import org.htmlparser.scanners.BulletListScanner;
import org.htmlparser.scanners.DivScanner;
import org.htmlparser.scanners.DoctypeScanner;
import org.htmlparser.scanners.FormScanner;
import org.htmlparser.scanners.FrameSetScanner;
import org.htmlparser.scanners.HeadScanner;
import org.htmlparser.scanners.HtmlScanner;
import org.htmlparser.scanners.JspScanner;
import org.htmlparser.scanners.LinkScanner;
import org.htmlparser.scanners.MetaTagScanner;
import org.htmlparser.scanners.ScriptScanner;
import org.htmlparser.scanners.StyleScanner;
import org.htmlparser.scanners.TableScanner;
import org.htmlparser.scanners.TagScanner;
import org.htmlparser.scanners.TitleScanner;
import org.htmlparser.tags.EndTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.Tag;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.IteratorImpl;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;
import org.htmlparser.visitors.NodeVisitor;

/**
 * This is the class that the user will use, either to get an iterator into 
 * the html page or to directly parse the page and print the results
 * 
* Typical usage of the parser is as follows :
* [1] Create a parser object - passing the URL and a feedback object to the parser
* [2] Register the common scanners. See {@link #registerScanners()}
* You wouldnt do this if you want to configure a custom lightweight parser. In that case, * you would add the scanners of your choice using {@link #addScanner(TagScanner)}
* [3] Enumerate through the elements from the parser object
* It is important to note that the parsing occurs when you enumerate, ON DEMAND. This is a thread-safe way, * and you only get the control back after a particular element is parsed and returned. * *
* Below is some sample code to parse Yahoo.com and print all the tags. *
 * Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback());
 * // In this example, we are registering all the common scanners
 * parser.registerScanners(); 
 * for (NodeIterator i = parser.elements();e.hasMoreNodes();) {
 * 	Node node = i.nextNode();
 *	node.print();
 * }
 * 
Below is some sample code to parse Yahoo.com and print only the text * information. This scanning will run faster, as there are no scanners * registered here. *
 * Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback());
 * // In this example, none of the scanners need to be registered
 * // as a string node is not a tag to be scanned for.
 * for (NodeIterator i = parser.elements();e.hasMoreNodes();) {
 *	Node node = i.nextNode();
 *	if (node instanceof StringNode) {        	
 * 		StringNode stringNode =
 *		(StringNode)node;        
 * 		System.out.println(stringNode.getText());    
 * 	} 
 * }
 * 
* The above snippet will print out only the text contents in the html document.
* Here's another snippet that will only print out the link urls in a document. * This is an example of adding a link scanner. *
 * Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback());
 * parser.addScanner(new LinkScanner("-l"));
 * for (NodeIterator i = parser.elements();e.hasMoreNodes();) {
 * 	Node node = i.nextNode();    
 * 	if (node instanceof LinkTag) {
 * 		LinkTag linkTag = (LinkTag)node;        
 * 		System.out.println(linkTag.getLink());    
 * 	} 
 * }
 * 
* @see Parser#elements() */ public class Parser implements Serializable { // Please don't change the formatting of the version variables below. // This is done so as to facilitate ant script processing. /** * The floating point version number. */ public final static double VERSION_NUMBER = 1.3; /** * The type of version. */ public final static String VERSION_TYPE = "Release Build"; /** * The date of the version. */ public final static String VERSION_DATE = "May 25, 2003"; /** * The display version. */ public final static String VERSION_STRING = "" + VERSION_NUMBER + " (" + VERSION_TYPE + " " + VERSION_DATE + ")"; // End of formatting /** * The default charset. * This should be ISO-8859-1, * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1 * Another alias is "8859_1". */ protected static final String DEFAULT_CHARSET = "ISO-8859-1"; /** * Trigger for charset detection. */ protected static final String CHARSET_STRING = "charset"; /** * Feedback object. */ protected ParserFeedback feedback; /** * The URL or filename to be parsed. */ protected String resourceLocn; /** * The html reader associated with this parser. */ protected transient NodeReader reader; /** * The list of scanners to apply at the top level. */ private Map scanners; /** * The encoding being used to decode the connection input stream. */ protected String character_set; /** * The source for HTML. */ protected transient URLConnection url_conn; /** * The bytes extracted from the source. */ protected transient BufferedInputStream input; /** * A quiet message sink. * Use this for no feedback. */ public static ParserFeedback noFeedback = new DefaultParserFeedback(DefaultParserFeedback.QUIET); /** * A verbose message sink. * Use this for output on System.out. 
*/ public static ParserFeedback stdout = new DefaultParserFeedback(); private ParserHelper parserHelper = new ParserHelper(); // // Static methods // /** * @param lineSeparator New Line separator to be used */ public static void setLineSeparator(String lineSeparator) { Node.setLineSeparator(lineSeparator); } /** * Return the version string of this parser. * @return A string of the form: *
     * "[floating point number] ([build-type] [build-date])"
     * 
*/ public static String getVersion() { return (VERSION_STRING); } /** * Return the version number of this parser. * @return A floating point number, the whole number part is the major * version, and the fractional part is the minor version. */ public static double getVersionNumber() { return (VERSION_NUMBER); } // // Constructors // /** * Zero argument constructor. * The parser is in a safe but useless state. * Set the reader or connection using setReader() or setConnection(). * @see #setReader(NodeReader) * @see #setConnection(URLConnection) */ public Parser() { setFeedback(null); setScanners(null); resourceLocn = null; reader = null; character_set = DEFAULT_CHARSET; url_conn = null; input = null; Tag.setTagParser(new TagParser(getFeedback())); } /** * This constructor enables the construction of test cases, with readers * associated with test string buffers. It can also be used with readers of the user's choice * streaming data into the parser.

* Important: If you are using this constructor, and you would like to use the parser * to parse multiple times (multiple calls to parser.elements()), you must ensure the following:
*

    *
  • Before the first parse, you must mark the reader for a length that you anticipate (the size of the stream).
  • *
  • After the first parse, calls to elements() must be preceded by calls to : *
         * parser.getReader().reset();
         * 
    *
  • *
* @param rd The reader to draw characters from. * @param fb The object to use when information, * warning and error messages are produced. If null no feedback * is provided. */ public Parser(NodeReader rd, ParserFeedback fb) { setFeedback(fb); setScanners(null); resourceLocn = null; reader = null; character_set = DEFAULT_CHARSET; url_conn = null; input = null; setReader(rd); Tag.setTagParser(new TagParser(feedback)); } /** * Constructor for custom HTTP access. * @param connection A fully conditioned connection. The connect() * method will be called so it need not be connected yet. * @param fb The object to use for message communication. */ public Parser(URLConnection connection, ParserFeedback fb) throws ParserException { setFeedback(fb); setScanners(null); resourceLocn = null; reader = null; character_set = DEFAULT_CHARSET; url_conn = null; input = null; Tag.setTagParser(new TagParser(feedback)); setConnection(connection); } /** * Creates a Parser object with the location of the resource (URL or file) * You would typically create a DefaultHTMLParserFeedback object and pass it in. * @param resourceLocn Either the URL or the filename (autodetects). * A standard HTTP GET is performed to read the content of the URL. * @param feedback The HTMLParserFeedback object to use when information, * warning and error messages are produced. If null no feedback * is provided. * @see #Parser(URLConnection,ParserFeedback) */ public Parser(String resourceLocn, ParserFeedback feedback) throws ParserException { this(ParserHelper.openConnection(resourceLocn, feedback), feedback); } /** * Creates a Parser object with the location of the resource (URL or file). * A DefaultHTMLParserFeedback object is used for feedback. * @param resourceLocn Either the URL or the filename (autodetects). */ public Parser(String resourceLocn) throws ParserException { this(resourceLocn, stdout); } /** * This constructor is present to enable users to plugin their own readers. 
* A DefaultHTMLParserFeedback object is used for feedback. It can also be used with readers of the user's choice * streaming data into the parser.

* Important: If you are using this constructor, and you would like to use the parser * to parse multiple times (multiple calls to parser.elements()), you must ensure the following:
*

    *
  • Before the first parse, you must mark the reader for a length that you anticipate (the size of the stream).
  • *
  • After the first parse, calls to elements() must be preceded by calls to : *
         * parser.getReader().reset();
         * 
    *
  • * @param reader The source for HTML to be parsed. */ public Parser(NodeReader reader) { this(reader, stdout); } /** * Constructor for non-standard access. * A DefaultHTMLParserFeedback object is used for feedback. * @param connection A fully conditioned connection. The connect() * method will be called so it need not be connected yet. * @see #Parser(URLConnection,ParserFeedback) */ public Parser(URLConnection connection) throws ParserException { this(connection, stdout); } // // Serialization support // private void writeObject(ObjectOutputStream out) throws IOException { if ((null == getConnection()) || /*redundant*/ (null == getURL())) if (null != getReader()); // commented out by Somik - why are we not allowed to serialize parsers without url // throw new IOException ("can only serialize parsers with a URL"); out.defaultWriteObject(); } private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); try { // reopen the connection and create a reader which are transient fields setURL(getURL()); } catch (ParserException hpe) { throw new IOException(hpe.toString()); } } // // Bean patterns // /** * Set the connection for this parser. * This method sets four of the fields in the parser object; * resourceLocn, url_conn, character_set * and reader. It does not adjust the scanners list * or feedback object. The four fields are set atomicly by * this method, either they are all set or none of them is set. Trying to * set the connection to null is a noop. * @param connection A fully conditioned connection. The connect() * method will be called so it need not be connected yet. * @exception ParserException if the character set specified in the * HTTP header is not supported, or an i/o exception occurs creating the * reader. 
*/ public void setConnection(URLConnection connection) throws ParserException { String res; NodeReader rd; String chs; URLConnection con; if (null != connection) { res = getURL(); rd = getReader(); chs = getEncoding(); con = getConnection(); try { resourceLocn = connection.getURL().toExternalForm(); url_conn = connection; url_conn.connect(); character_set = getCharacterSet(url_conn); createReader(); } catch (IOException ioe) { String msg = "setConnection() : Error in opening a connection to " + connection.getURL().toExternalForm(); ParserException ex = new ParserException(msg, ioe); feedback.error(msg, ex); resourceLocn = res; url_conn = con; character_set = chs; reader = rd; throw ex; } } } /** * Return the current connection. * @return The connection either created by the parser or passed into this * parser via setConnection. * @see #setConnection(URLConnection) */ public URLConnection getConnection() { return (url_conn); } /** * Set the URL for this parser. * This method sets four of the fields in the parser object; * resourceLocn, url_conn, character_set * and reader. It does not adjust the scanners list * or feedback object.Trying to set the url to null or an * empty string is a noop. * @see #setConnection(URLConnection) */ public void setURL(String url) throws ParserException { if ((null != url) && !"".equals(url)) setConnection(ParserHelper.openConnection(url, getFeedback())); } /** * Return the current URL being parsed. * @return The url passed into the constructor or the file name * passed to the constructor modified to be a URL. */ public String getURL() { return (resourceLocn); } /** * Set the encoding for this parser. * If there is no connection (getConnection() returns null) it simply sets * the character set name stored in the parser (Note: the reader object * which must have been set in the constructor or by setReader(), * may or may not be using this character set). 
* Otherwise (getConnection() doesn't return null) it does this by reopening the * input stream of the connection and creating a reader that uses this * character set. In this case, this method sets two of the fields in the * parser object; character_set and reader. * It does not adjust resourceLocn, url_conn, * scanners or feedback. The two fields are set * atomicly by this method, either they are both set or none of them is set. * Trying to set the encoding to null or an empty string is a noop. * @exception ParserException If the opening of the reader */ public void setEncoding(String encoding) throws ParserException { String chs; NodeReader rd; BufferedInputStream in; if ((null != encoding) && !"".equals(encoding)) if (null == getConnection()) character_set = encoding; else { rd = getReader(); chs = getEncoding(); in = input; try { character_set = encoding; recreateReader(); } catch (IOException ioe) { String msg = "setEncoding() : Error in opening a connection to " + getConnection().getURL().toExternalForm(); ParserException ex = new ParserException(msg, ioe); feedback.error(msg, ex); character_set = chs; reader = rd; input = in; throw ex; } } } /** * The current encoding. * This item is et from the HTTP header but may be overridden by meta * tags in the head, so this may change after the head has been parsed. */ public String getEncoding() { return (character_set); } /** * Set the reader for this parser. * This method sets four of the fields in the parser object; * resourceLocn, url_conn, character_set * and reader. It does not adjust the scanners list * or feedback object. The url_conn is set to * null since this cannot be determined from the reader. The * character_set is set to the default character set since * this cannot be determined from the reader. * Trying to set the reader to null is a noop. * @param rd The reader object to use. This reader will be bound to this * parser after this call. 
*/ public void setReader(NodeReader rd) { if (null != rd) { resourceLocn = rd.getURL(); reader = rd; character_set = DEFAULT_CHARSET; url_conn = null; reader.setParser(this); } } /** * Returns the reader associated with the parser * @return NodeReader */ public NodeReader getReader() { return reader; } /** * Get the number of scanners registered currently in the scanner. * @return int number of scanners registered */ public int getNumScanners() { return scanners.size(); } /** * This method is to be used to change the set of scanners in the current parser. * @param newScanners Vector holding scanner objects to be used during the parsing process. */ public void setScanners(Map newScanners) { scanners = (null == newScanners) ? new HashMap() : newScanners; } /** * Get an enumeration of scanners registered currently in the parser * @return Enumeration of scanners currently registered in the parser */ public Map getScanners() { return scanners; } /** * Sets the feedback object used in scanning. * @param fb The new feedback object to use. */ public void setFeedback(ParserFeedback fb) { feedback = (null == fb) ? noFeedback : fb; } /** * Returns the feedback. * @return HTMLParserFeedback */ public ParserFeedback getFeedback() { return feedback; } // // Internal methods // /** * Open a stream reader on the InputStream. * Revise the character set to it's default value if an * UnsupportedEncodingException is thrown. * @exception UnsupportedEncodingException in the unlikely event that * the default character set is not supported on this platform. 
*/ protected InputStreamReader createInputStreamReader() throws UnsupportedEncodingException { InputStreamReader ret; try { ret = new InputStreamReader(input, character_set); } catch (UnsupportedEncodingException uee) { StringBuffer msg; String message; msg = new StringBuffer(1024); msg.append(url_conn.getURL().toExternalForm()); msg.append(" has an encoding ("); msg.append(character_set); msg.append(") which is not supported, using "); msg.append(DEFAULT_CHARSET); message = msg.toString(); feedback.warning(message); character_set = DEFAULT_CHARSET; ret = new InputStreamReader(input, character_set); } return (ret); } /** * Create a new reader for the URLConnection object. * The current character set is used to transform the input stream * into a character reader. * @exception IOException if there is a problem constructing the reader. * @see #createInputStreamReader() * @see #getEncoding() */ protected void createReader() throws IOException { InputStream stream; InputStreamReader in; stream = url_conn.getInputStream(); input = new BufferedInputStream(stream); input.mark(Integer.MAX_VALUE); in = createInputStreamReader(); reader = new NodeReader(in, resourceLocn); reader.setParser(this); } /** * Create a new reader for the URLConnection object but reuse the input stream. * The current character set is used to transform the input stream * into a character reader. Defaults to createReader() if * there is no existing input stream. * @exception IOException if there is a problem constructing the reader. * @see #createReader() * @see #createInputStreamReader() * @see #getEncoding() */ protected void recreateReader() throws IOException { InputStreamReader in; if (null == input) createReader(); else { input.reset(); input.mark(Integer.MAX_VALUE); in = createInputStreamReader(); reader = new NodeReader(in, resourceLocn); reader.setParser(this); } } /** * Try and extract the character set from the HTTP header. * @param connection The connection with the charset info. 
* @return The character set name to use for this HTML page. */ protected String getCharacterSet(URLConnection connection) { final String field = "Content-Type"; String string; String ret; ret = DEFAULT_CHARSET; string = connection.getHeaderField(field); if (null != string) ret = getCharset(string); return (ret); } /** * Get a CharacterSet name corresponding to a charset parameter. * @param content A text line of the form: *
         * text/html; charset=Shift_JIS
         * 
    * which is applicable both to the HTTP header field Content-Type and * the meta tag http-equiv="Content-Type". * Note this method also handles non-compliant quoted charset directives such as: *
         * text/html; charset="UTF-8"
         * 
    * and *
         * text/html; charset='UTF-8'
         * 
    * @return The character set name to use when reading the input stream. * For JDKs that have the Charset class this is qualified by passing * the name to findCharset() to render it into canonical form. * If the charset parameter is not found in the given string, the default * character set is returned. * @see ParserHelper#findCharset * @see #DEFAULT_CHARSET */ protected String getCharset(String content) { int index; String ret; ret = DEFAULT_CHARSET; if (null != content) { index = content.indexOf(CHARSET_STRING); if (index != -1) { content = content.substring(index + CHARSET_STRING.length()).trim(); if (content.startsWith("=")) { content = content.substring(1).trim(); index = content.indexOf(";"); if (index != -1) content = content.substring(0, index); //remove any double quotes from around charset string if (content.startsWith("\"") && content.endsWith("\"") && (1 < content.length())) content = content.substring(1, content.length() - 1); //remove any single quote from around charset string if (content.startsWith("'") && content.endsWith("'") && (1 < content.length())) content = content.substring(1, content.length() - 1); ret = ParserHelper.findCharset(content, ret); // Charset names are not case-sensitive; // that is, case is always ignored when comparing charset names. if (!ret.equalsIgnoreCase(content)) { feedback.info( "detected charset \"" + content + "\", using \"" + ret + "\""); } } } } return (ret); } // // Public methods // /** * Add a new Tag Scanner. * In typical situations where you require a no-frills parser, use the registerScanners() method to add the most * common parsers. But when you wish to either compose a parser with only certain scanners registered, use this method. * It is advantageous to register only the scanners you want, in order to achieve faster parsing speed. This method * would also be of use when you have developed custom scanners, and need to register them into the parser. 
* @param scanner TagScanner object (or derivative) to be added to the list of registered scanners */ public void addScanner(TagScanner scanner) { String ids[] = scanner.getID(); for (int i = 0; i < ids.length; i++) { scanners.put(ids[i], scanner); } scanner.setFeedback(feedback); } /** * Returns an iterator (enumeration) to the html nodes. Each node can be a tag/endtag/ * string/link/image
    * This is perhaps the most important method of this class. In typical situations, you will need to use * the parser like this : *
         * Parser parser = new Parser("http://www.yahoo.com");
         * parser.registerScanners();
         * for (NodeIterator i = parser.elements();i.hasMoreElements();) {
         *    Node node = i.nextHTMLNode();
         *    if (node instanceof StringNode) {
         *      // Downcasting to StringNode
         *      StringNode stringNode = (StringNode)node;
         *      // Do whatever processing you want with the string node
         *      System.out.println(stringNode.getText());
         *    }
         *    // Check for the node or tag that you want
         *    if (node instanceof ...) {
         *      // Downcast, and process
         *    }
         * }
         * 
    */ public NodeIterator elements() throws ParserException { boolean remove_scanner; Node node; MetaTag meta; String httpEquiv; String charset; boolean restart; EndTag end; IteratorImpl ret; remove_scanner = false; restart = false; ret = new IteratorImpl(reader, resourceLocn, feedback); ret = createIteratorImpl(remove_scanner, ret); return (ret); } public IteratorImpl createIteratorImpl( boolean remove_scanner, IteratorImpl ret) throws ParserException { Node node; MetaTag meta; String httpEquiv; String charset; EndTag end; if (null != url_conn) try { if (null == scanners.get("-m")) { addScanner(new MetaTagScanner("-m")); remove_scanner = true; } /* pre-read up to looking for charset directive */ while (null != (node = ret.peek())) { if (node instanceof MetaTag) { // check for charset on Content-Type meta = (MetaTag) node; httpEquiv = meta.getAttribute("HTTP-EQUIV"); if ("Content-Type".equalsIgnoreCase(httpEquiv)) { charset = getCharset(meta.getAttribute("CONTENT")); if (!charset.equalsIgnoreCase(character_set)) { // oops, different character set, restart character_set = charset; recreateReader(); ret = new IteratorImpl( reader, resourceLocn, feedback); } // once we see the Content-Type meta tag we're finished the pre-read break; } } else if (node instanceof EndTag) { end = (EndTag) node; if (end.getTagName().equalsIgnoreCase("HEAD")) // or, once we see the tag we're finished the pre-read break; } } } catch (UnsupportedEncodingException uee) { String msg = "elements() : The content of " + url_conn.getURL().toExternalForm() + " has an encoding which is not supported"; ParserException ex = new ParserException(msg, uee); feedback.error(msg, ex); throw ex; } catch (IOException ioe) { String msg = "elements() : Error in opening a connection to " + url_conn.getURL().toExternalForm(); ParserException ex = new ParserException(msg, ioe); feedback.error(msg, ex); throw ex; } finally { if (remove_scanner) scanners.remove("-m"); } return ret; } /** * Flush the current 
scanners registered. The registered scanners list becomes empty with this call. */ public void flushScanners() { scanners = new Hashtable(); } /** * Return the scanner registered in the parser having the * given id * @param id The id of the requested scanner * @return TagScanner The Tag Scanner */ public TagScanner getScanner(String id) { return (TagScanner) scanners.get(id); } /** * Parse the given resource, using the filter provided */ public void parse(String filter) throws Exception { Node node; for (NodeIterator e = elements(); e.hasMoreNodes();) { node = e.nextNode(); if (node != null) { if (filter == null) System.out.println(node.toString()); else { // There is a filter. Find if the associated filter of this node // matches the specified filter if (!(node instanceof Tag)) continue; Tag tag = (Tag) node; TagScanner scanner = tag.getThisScanner(); if (scanner == null) continue; String tagFilter = scanner.getFilter(); if (tagFilter == null) continue; if (tagFilter.equals(filter)) System.out.println(node.toString()); } } else System.out.println("Node is null"); } } /** * This method should be invoked in order to register some common scanners. The scanners that get added are :
    * LinkScanner (filter key "-l")
    * HTMLImageScanner (filter key "-i")
    * HTMLScriptScanner (filter key "-s")
    * HTMLStyleScanner (filter key "-t")
    * HTMLJspScanner (filter key "-j")
    * HTMLAppletScanner (filter key "-a")
    * HTMLMetaTagScanner (filter key "-m")
    * HTMLTitleScanner (filter key "-t")
    * HTMLDoctypeScanner (filter key "-d")
    * HTMLFormScanner (filter key "-f")
    * HTMLFrameSetScanner(filter key "-r")
    * HTMLBaseHREFScanner(filter key "-b")
    *
    * Call this method after creating the Parser object. e.g.
    *
         * Parser parser = new Parser("http://www.yahoo.com");
         * parser.registerScanners();
         * 
    */ public void registerScanners() { if (scanners.size() > 0) { System.err.println( "registerScanners() should be called first, when no other scanner has been registered."); System.err.println( "Other scanners already exist, hence this method call wont have any effect"); return; } LinkScanner linkScanner = new LinkScanner(LinkTag.LINK_TAG_FILTER); // Note - The BaseHREF and Image scanners share the same // link processor - internally linked up with the factory // method in the link scanner class addScanner(linkScanner); addScanner(linkScanner.createImageScanner(ImageTag.IMAGE_TAG_FILTER)); addScanner(new ScriptScanner("-s")); addScanner(new StyleScanner("-t")); addScanner(new JspScanner("-j")); addScanner(new AppletScanner("-a")); addScanner(new MetaTagScanner("-m")); addScanner(new TitleScanner("-T")); addScanner(new DoctypeScanner("-d")); addScanner(new FormScanner("-f", this)); addScanner(new FrameSetScanner("-r")); addScanner(linkScanner.createBaseHREFScanner("-b")); addScanner(new BulletListScanner("-bulletList", this)); // addScanner(new SpanScanner("-p")); addScanner(new DivScanner("-div")); addScanner(new TableScanner(this)); } /** * Make a call to registerDomScanners(), instead of registerScanners(), * when you are interested in retrieving a Dom representation of the html * page. Upon parsing, you will receive an Html object - which will contain * children, one of which would be the body. This is still evolving, and in * future releases, you might see consolidation of Html - to provide you * with methods to access the body and the head. */ public void registerDomScanners() { registerScanners(); addScanner(new HtmlScanner()); addScanner(new BodyScanner()); addScanner(new HeadScanner()); } /** * Removes a specified scanner object. You can create * an anonymous object as a parameter. This method * will use the scanner's key and remove it from the * registry of scanners. * e.g. *
         * removeScanner(new FormScanner(""));
         * 
    * @param scanner TagScanner object to be removed from the list of registered scanners */ public void removeScanner(TagScanner scanner) { scanners.remove(scanner.getID()[0]); } /** * The main program, which can be executed from the command line */ public static void main(String[] args) { System.out.println("HTMLParser v" + VERSION_STRING); if (args.length < 1 || args[0].equals("-help")) { System.out.println(); System.out.println( "Syntax : java -jar htmlparser.jar -l"); System.out.println( " the name of the file to be parsed (with complete path if not in current directory)"); System.out.println( " -l Show only the link tags extracted from the document"); System.out.println( " -i Show only the image tags extracted from the document"); System.out.println( " -s Show only the Javascript code extracted from the document"); System.out.println( " -t Show only the Style code extracted from the document"); System.out.println( " -a Show only the Applet tag extracted from the document"); System.out.println(" -j Parse JSP tags"); System.out.println(" -m Parse Meta tags"); System.out.println(" -T Extract the Title"); System.out.println(" -f Extract forms"); System.out.println(" -r Extract frameset"); System.out.println(" -help This screen"); System.out.println(); System.out.println( "HTML Parser home page : http://htmlparser.sourceforge.net"); System.out.println(); System.out.println( "Example : java -jar htmlparser.jar http://www.yahoo.com"); System.out.println(); System.out.println( "If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. 
"); System.exit(-1); } try { if (args[0].indexOf("http") < 0) { File input = new File(args[0]); try { args[0] = input.toURL().toString(); System.out.println("file converted to URL: " + args[0]); } catch (MalformedURLException e) { e.printStackTrace(); } } Parser parser = new Parser(args[0]); System.out.println("Parsing " + parser.getURL()); parser.registerScanners(); try { long start = System.currentTimeMillis(); if (args.length == 2) { parser.parse(args[1]); } else parser.parse(null); System.out.println( "Elapsed Time ms: " + (System.currentTimeMillis() - start)); } catch (Exception e) { e.printStackTrace(); } } catch (ParserException e) { e.printStackTrace(); } } public void visitAllNodesWith(NodeVisitor visitor) throws ParserException { Node node; for (NodeIterator e = elements(); e.hasMoreNodes();) { node = e.nextNode(); node.accept(visitor); } visitor.finishedParsing(); } /** Initializes the parser with the given input HTML String. * @param inputHTML the input HTML that is to be parsed. */ public void setInputHTML(String inputHTML) { if ("".equals(inputHTML)) { reader = new NodeReader(new StringReader(inputHTML), ""); } } public Node[] extractAllNodesThatAre(Class nodeType) throws ParserException { NodeList nodeList = new NodeList(); for (NodeIterator e = elements(); e.hasMoreNodes();) { e.nextNode().collectInto(nodeList, nodeType); } return nodeList.toNodeArray(); } /** * Creates the parser on an input string. * @param inputHTML * @return Parser */ public static Parser createParser(String inputHTML) { NodeReader reader = new NodeReader(new StringReader(inputHTML), ""); return new Parser(reader); } public static Parser createLinkRecognizingParser(String inputHTML) { Parser parser = createParser(inputHTML); parser.addScanner(new LinkScanner(LinkTag.LINK_TAG_FILTER)); return parser; } }
... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.