alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

What this is

This file is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Other links

The source code

// $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/tests/ParserTest.java,v 1.2 2004/02/10 13:41:08 woolfel Exp $
/*
 * ====================================================================
 * Copyright 2002-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */

// The developers of JMeter and Apache are grateful to the developers
// of HTMLParser for giving Apache Software Foundation a non-exclusive
// license. The performance benefits of HTMLParser are clear and the
// users of JMeter will benefit from the hard work of the HTMLParser
// team. For detailed information about HTMLParser, the project is
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
//
// HTMLParser was originally created by Somik Raha in 2000. Since then
// a healthy community of users has formed and helped refine the
// design so that it is able to tackle the difficult task of parsing
// dirty HTML. Derrick Oswald is the current lead developer and was kind
// enough to assist JMeter.

package org.htmlparser.tests;

import java.io.File;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Map;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.StringNode;
import org.htmlparser.scanners.FormScanner;
import org.htmlparser.scanners.TagScanner;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.Tag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

public class ParserTest extends ParserTestCase
{

    /**
     * Creates the test case.
     *
     * @param name the test name, handed unchanged to the superclass constructor
     */
    public ParserTest(String name)
    {
        super(name);
    }
    /**
     * Feeds the parser a single run of 5001 'a' characters and checks
     * that iterating over it yields exactly one node.
     */
    public void testElements() throws Exception
    {
        final int length = 5001;
        StringBuffer text = new StringBuffer(length);
        int remaining = length;
        while (remaining-- > 0)
        {
            text.append('a');
        }
        createParser(text.toString());

        int count = 0;
        NodeIterator nodes = parser.elements();
        while (nodes.hasMoreNodes())
        {
            node[count] = nodes.nextNode();
            count++;
        }
        assertEquals("There should be 1 node identified", 1, count);
        // A second pass (resetting the reader, its line count and its
        // position in the line, then iterating parser.elements() again and
        // expecting one node) used to follow here; it is disabled.
    }

    /**
     * Iterates over the nodes of http://www.google.com twice -- once from
     * the start and once after resetting the reader to a mark set before
     * the first pass -- and checks that both passes yield the same number
     * of nodes.
     * This testcase needs you to be online.
     *
     * @throws ParserException if the parser cannot be created for the URL
     */
    public void testElementsFromWeb() throws Exception
    {
        Parser parser;
        try
        {
            parser = new Parser("http://www.google.com");
        }
        catch (Exception e)
        {
            throw new ParserException(
                "You must be offline! This test needs you to be connected to the internet.",
                e);
        }
        // NOTE(review): 5000 is the read-ahead limit given to mark(); if the
        // first pass reads more than that, reset() may fail -- confirm
        // against the reader implementation and the current page size.
        parser.getReader().mark(5000);

        // Only the number of nodes matters, so the nodes are counted rather
        // than stored; a fixed-size buffer would overflow on a large page.
        int cnt = 0;
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
        {
            e.nextNode();
            cnt++;
        }

        parser.getReader().reset();
        // Now try getting the elements again
        int i = 0;
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
        {
            e.nextNode();
            i++;
        }
        assertEquals(
            "There should be "
                + cnt
                + " nodes identified (second call to parser.elements())",
            cnt,
            i);
    }

    /**
     * Test the Parser(URLConnection) constructor.
     * This testcase needs you to be online.
     * Based on the form at Canada Post http://www.canadapost.ca/tools/pcl/bin/default-e.asp:
     * 
     * <form NAME="SearchQuick" method="POST" action="cp_search_response-e.asp"
     * 	onSubmit="return runSubmit();">
     * 
     * <!-- begin test hidden field code -->
     *   <input TYPE="Hidden" NAME="app_language" value="english">
     * 
     *   <input TYPE="Hidden" NAME="app_response_start_row_number" value="1">
     *   <input TYPE="Hidden" NAME="app_response_rows_max" value="9">
     * 
     *   <input TYPE="Hidden" NAME="app_source" value="quick">
     *   <input TYPE="Hidden" NAME="query_source" value="q">
     * 
     *   <input TYPE="Hidden" NAME="name" value>
     *   <input TYPE="Hidden" NAME="postal_code" value>
     *   <input TYPE="Hidden" NAME="directory_area_name" value>
     * 
     *   <input TYPE="Hidden" NAME="delivery_mode" value>
     *   <input TYPE="Hidden" NAME="Suffix" value>
     * 
     *   <input TYPE="Hidden" NAME="street_direction" value>
     *   <input TYPE="Hidden" NAME="installation_type" value>
     *   <input TYPE="Hidden" NAME="delivery_number" value>
     *   <input TYPE="Hidden" NAME="installation_name" value>
     *   <input TYPE="Hidden" NAME="unit_number" value>
     * 
     *   <input TYPE="Hidden" NAME="app_state" value="production">
     * <!-- end test hidden field code -->
     * 
     * <p>
     *   <table border="0" cellpadding="0" width="90%" cellspacing="0">
     * 
     *     <tr>
     *       <td  class="tbltitle"> Street Number: </td>
     *       <td class="tbltitle"> Street Name: </td>
     *       <td class="tbltitle"> Street Type:</td>
     *     </tr>
     *     <tr>
     * 
     *       <td>
     *         <input type="text" name="street_number" size="10" maxlength="10">
     *       </td>
     *       <td>
     *         <input type="text" name="street_name" size="30" maxlength="40">
     *         <input type="hidden" name="street_type" size="30">
     *       </td>
     *       <td><input type="text" name="test" size="10" maxlength="30"></td>
     *     </tr>
     * 
     *   </table>
     * <p>
     *   <table border="0" cellpadding="0" width="90%" cellspacing="0">
     *     <tr>
     *       <td class="tbltitle">
     *         Municipality (City, Town, etc.):
     *       </td>
     *       <td class="tbltitle">
     *         Province:
     *       </td>
     * 
     *     </tr>
     *     <tr>
     *       <td>
     *         <input type="text" name="city" size="30" maxlength="30">
     *       </td>
     *       <td>
     *         <select size="1" name="prov">
     *           <option selected value="NULL">Select</option><option value="AB">AB - Alberta</option><option value="BC">BC - British Columbia</option><option value="MB">MB - Manitoba</option><option value="NB">NB - New Brunswick</option><option value="NL">NL - Newfoundland and Labrador</option><option value="NS">NS - Nova Scotia</option><option value="NT">NT - Northwest Territories</option><option value="NU">NU - Nunavut</option><option value="ON">ON - Ontario</option><option value="PE">PE - Prince Edward Island</option><option value="QC">QC - Quebec</option><option value="SK">SK - Saskatchewan</option><option value="YT">YT - Yukon</option>
     * 
     *         </select>
     *       </td>
     *     </tr>
     *     <tr>
     *       <td height="10">&nbsp;</td>
     *       <td>&nbsp;</td>
     *     </tr>
     *     <tr>
     *       <td colspan="2" align="right" nowrap>
     * 	   <input type="image" src="images/bb_submit-e.gif" name="Search" border="0" WIDTH="88" HEIGHT="23">
     *         &nbsp; <a href="#" onclick="javascript:fClearAllFields();"><img src="images/bb_clear_form-e.gif" name="Clear" border="0" WIDTH="88" HEIGHT="23"></a>
     * 	  </td>
     *     </tr>
     *   </table>
     * <p>
     * </form>
     * 
* Sumbits the POST and verifies the returned HTML contains an expected value. */ public void testPOST() throws Exception { // the form data: final String number = "2708"; final String street = "Kelly"; final String type = "Avenue"; final String city = "Ottawa"; final String province = "ON"; // the correct answer final String postal_code = "K2B 7V4"; Parser parser; URL url; HttpURLConnection connection; StringBuffer buffer; PrintWriter out; boolean pass; NodeIterator enumeration; Node node; StringNode string; try { url = new URL("http://www.canadapost.ca/tools/pcl/bin/cp_search_response-e.asp"); connection = (HttpURLConnection) url.openConnection(); connection.setRequestMethod("POST"); connection.setRequestProperty( "Referer", "http://www.canadapost.ca/tools/pcl/bin/default-e.asp"); connection.setDoOutput(true); connection.setDoInput(true); connection.setUseCaches(false); buffer = new StringBuffer(1024); buffer.append("app_language="); buffer.append("english"); buffer.append("&"); buffer.append("app_response_start_row_number="); buffer.append("1"); buffer.append("&"); buffer.append("app_response_rows_max="); buffer.append("9"); buffer.append("&"); buffer.append("app_source="); buffer.append("quick"); buffer.append("&"); buffer.append("query_source="); buffer.append("q"); buffer.append("&"); buffer.append("name="); buffer.append("&"); buffer.append("postal_code="); buffer.append("&"); buffer.append("directory_area_name="); buffer.append("&"); buffer.append("delivery_mode="); buffer.append("&"); buffer.append("Suffix="); buffer.append("&"); buffer.append("street_direction="); buffer.append("&"); buffer.append("installation_type="); buffer.append("&"); buffer.append("delivery_number="); buffer.append("&"); buffer.append("installation_name="); buffer.append("&"); buffer.append("unit_numbere="); buffer.append("&"); buffer.append("app_state="); buffer.append("production"); buffer.append("&"); buffer.append("street_number="); buffer.append(number); buffer.append("&"); 
buffer.append("street_name="); buffer.append(street); buffer.append("&"); buffer.append("street_type="); buffer.append(type); buffer.append("&"); buffer.append("test="); buffer.append("&"); buffer.append("city="); buffer.append(city); buffer.append("&"); buffer.append("prov="); buffer.append(province); buffer.append("&"); buffer.append("Search="); out = new PrintWriter(connection.getOutputStream()); out.print(buffer); out.close(); parser = new Parser(connection); } catch (Exception e) { throw new ParserException( "You must be offline! This test needs you to be connected to the internet.", e); } pass = false; for (enumeration = parser.elements(); enumeration.hasMoreNodes();) { node = enumeration.nextNode(); if (node instanceof StringNode) { string = (StringNode) node; if (-1 != string.getText().indexOf(postal_code)) pass = true; } } assertTrue("POST operation failed.", pass); } /** * Tests the 'from file' Parser constructor. */ public void testFile() { String path; File file; PrintWriter out; Parser parser; Node nodes[]; int i; NodeIterator enumeration; path = System.getProperty("user.dir"); if (!path.endsWith(File.separator)) path += File.separator; file = new File(path + "delete_me.html"); try { out = new PrintWriter(new FileWriter(file)); out.println( ""); out.println(""); out.println(""); out.println("test"); out.println( ""); out.println(""); out.println(""); out.println("This is a test page "); out.println(""); out.println(""); out.close(); parser = new Parser(file.getAbsolutePath()); nodes = new Node[30]; i = 0; for (enumeration = parser.elements(); enumeration.hasMoreNodes();) { nodes[i] = enumeration.nextNode(); i++; } assertEquals("Expected nodes", 12, i); } catch (Exception e) { fail(e.toString()); } finally { file.delete(); } } /** * Test with a HTTP header with a valid charset parameter. * Here, ibm.co.jp is an example of a HTTP server that correctly sets the * charset in the header to match the content encoding. 
*/
    public void testHTTPCharset()
    {
        try
        {
            Parser parser = new Parser("http://www.ibm.com/jp/", Parser.noFeedback);
            assertTrue(
                "Character set should be Shift_JIS",
                parser.getEncoding().equalsIgnoreCase("Shift_JIS"));
        }
        catch (ParserException e)
        {
            // keep the underlying reason in the failure message
            fail("could not open http://www.ibm.com/jp/: " + e);
        }
    }

    /**
     * Test with a HTML header with a charset parameter not matching the HTTP header.
     * Here, www.sony.co.jp is an example of a HTTP server that does not set the
     * charset in the header to match the content encoding. We check that after
     * the enumeration is created, the charset has changed to the correct value.
     */
    public void testHTMLCharset()
    {
        try
        {
            Parser parser = new Parser("http://www.sony.co.jp", Parser.noFeedback);
            assertEquals(
                "Character set by default is ISO-8859-1",
                "ISO-8859-1",
                parser.getEncoding());
            // The returned iterator is not needed; elements() is called only
            // because the encoding is re-checked right after it.
            parser.elements();
            assertTrue(
                "Character set should be Shift_JIS",
                parser.getEncoding().equalsIgnoreCase("Shift_JIS"));
        }
        catch (ParserException e)
        {
            // keep the underlying reason in the failure message
            fail("could not open http://www.sony.co.jp: " + e);
        }
    }

    /**
     * Test the case of a charset directive different than the HTTP header.
     * See bug #707447 META TAG - CHARSET
     * and bug #699886 can't parse website other than iso-8859-1
     */
    public void testSwitchCharset() throws ParserException
    {
        String url = "http://htmlparser.sourceforge.net/test/gb2312Charset.html";
        Parser parser = new Parser(url);
        // only the node count is checked, so nothing is stored
        int i = 0;
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
        {
            e.nextNode();
            i++;
        }
        assertEquals("Expected nodes", 14, i);
    }

    /**
     * Test the case of a double quoted charset directive.
     * See bug #694477.
     * Technically, this format does not meet the HTTP/1.1
     * specification in RFC 2068. In this case I believe
     * that the quotes are being improperly generated in the
     * header by a server-side web application.
     * Nonetheless, it would be nice to handle this case.
*/
    public void testDoubleQuotedCharset() throws ParserException
    {
        String url = "http://htmlparser.sourceforge.net/test/DoublequotedCharset.html";
        Parser parser = new Parser(url);
        // walk every node before looking at the encoding
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
            e.nextNode();
        // assertEquals reports the actual encoding when the check fails
        assertEquals("Wrong encoding", "UTF-8", parser.getEncoding());
    }

    /**
     * Test the case of a single quoted charset directive.
     * See bug #694477.
     * Technically, this format does not meet the HTTP/1.1
     * specification in RFC 2068. In this case I believe
     * that the quotes are being improperly generated in the
     * header by a server-side web application.
     * Nonetheless, it would be nice to handle this case.
     */
    public void testSingleQuotedCharset() throws ParserException
    {
        String url = "http://htmlparser.sourceforge.net/test/SinglequotedCharset.html";
        Parser parser = new Parser(url);
        // walk every node before looking at the encoding
        for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
            e.nextNode();
        // assertEquals reports the actual encoding when the check fails
        assertEquals("Wrong encoding", "UTF-8", parser.getEncoding());
    }

    /**
     * Test a bogus comma delimited charset specification in the HTTP header.
     * See bug #722941.
     * A comma delimited charset in the HTTP header does not meet the HTTP/1.1
     * specification in RFC 2068. In this case I believe that the HTTP
     * server has been misconfigured, but since it's AOL it would be nice
     * to handle this case.
*/ public void testCommaListCharset() throws ParserException { URL url; URLConnection connection; Parser parser; String idiots = "http://users.aol.com/geinster/rej.htm"; try { url = new URL(idiots); connection = url.openConnection(); // this little subclass just gets around normal JDK 1.4 processing // that filters out bogus character sets parser = new Parser() { protected String getCharset(String content) { int index; String ret; ret = DEFAULT_CHARSET; if (null != content) { index = content.indexOf(CHARSET_STRING); if (index != -1) { content = content .substring(index + CHARSET_STRING.length()) .trim(); if (content.startsWith("=")) { content = content.substring(1).trim(); index = content.indexOf(";"); if (index != -1) content = content.substring(0, index); //remove any double quotes from around charset string if (content.startsWith("\"") && content.endsWith("\"") && (1 < content.length())) content = content.substring( 1, content.length() - 1); //remove any single quote from around charset string if (content.startsWith("'") && content.endsWith("'") && (1 < content.length())) content = content.substring( 1, content.length() - 1); ret = content; // short circuit findCharset() processing } } } return (ret); } }; parser.setConnection(connection); // must be the default assertTrue( "Wrong encoding", parser.getEncoding().equals("ISO-8859-1")); for (NodeIterator e = parser.elements(); e.hasMoreNodes();) e.nextNode(); assertTrue( "Wrong encoding", parser.getEncoding().equals("windows-1252")); } catch (Exception e) { fail(e.getMessage()); } } public void testNullUrl() { Parser parser; try { parser = new Parser("http://someoneexisting.com", Parser.noFeedback); assertTrue("Should have thrown an exception!", false); } catch (ParserException e) { } } public void testURLWithSpaces() throws ParserException { Parser parser; String url = "http://htmlparser.sourceforge.net/test/This is a Test Page.html"; parser = new Parser(url); Node node[] = new Node[30]; int i = 0; for 
(NodeIterator e = parser.elements(); e.hasMoreNodes();) { node[i] = e.nextNode(); i++; } assertEquals("Expected nodes", 12, i); } public void testLinkCollection() throws ParserException { createParser( "Google\n" + "\n" + "
\"Google\"

\n" + "" + "" + "" + "" + "" + "" + "" + "
 " + "Web" + " Images Groups Directory News-New! 
\"\"" + "
" + "
" + "
" + "" + "" + "" + "" + "" + "" + "
 " + "" + "" + "" + "
" + "" + "" + "
" + " • Advanced Search" + "
 • Preferences" + "
 • Language Tools" + "
" + "
" + "

\n" + "
Advertise with Us - Search Solutions - Services & Tools - Jobs, Press, & Help\n" + "\n" + "

©2002 Google - Searching 3,083,324,652 web pages

\n"); parser.registerScanners(); int i = 0; NodeList collectionList = new NodeList(); for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { Node node = e.nextNode(); node.collectInto(collectionList, LinkTag.class); } assertEquals( "Size of collection vector should be 11", 11, collectionList.size()); // All items in collection vector should be links for (SimpleNodeIterator e = collectionList.elements(); e.hasMoreNodes(); ) { Node node = e.nextNode(); assertTrue( "Only links should have been parsed", node instanceof LinkTag); } } public void testImageCollection() throws ParserException { createParser( "\n" + "\n" + "\n" + "\n" + "\n" + "
\n" + "\n" + "\n" + "
NISHI-HONGWAN-JI
\n" + "
\n" + "\n" + "\n" + "
The Nihi Hongwanj-ji temple is very traditional, very old, and very beautiful. This is the place that we stayed on our first night in Kyoto. We then attended the morning prayer ceremony, at 6:30 am. Staying here costed us 7,500 yen, which was inclusive of dinner and breakfast, and usage of the o-furo (public bath). Felt more like a luxury hotel than a temple.
\n" + "
\n" + "\n" + "\n" + "
\n" + "
\n" + "\n" + "\n" + "
\n" + "
\n" + "\n" + "\n" + "
\n" + "
\n" + "\n" + "\n" + "
Click on the pictures to see the full-sized versions. The picture at the top right corner is taken in Higashi-Hongwanji. Nishi means west, and Higashi means east. These two temples are adjacent to each other and represent two different Buddhist sects.
\n" + "
\n" + "\n" + "\n" + "
\n" + "
\n" + "\n" + "\n" + "
\n" + "\n" + ""); parser.registerScanners(); int i = 0; NodeList collectionList = new NodeList(); for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { Node node = e.nextNode(); node.collectInto(collectionList, ImageTag.IMAGE_TAG_FILTER); } assertEquals( "Size of collection vector should be 5", 5, collectionList.size()); // All items in collection vector should be links for (SimpleNodeIterator e = collectionList.elements(); e.hasMoreNodes(); ) { Node node = e.nextNode(); assertTrue( "Only images should have been parsed", node instanceof ImageTag); } } public void testRemoveScanner() throws Exception { createParser(""); parser.registerScanners(); parser.removeScanner(new FormScanner("", parser)); Map scanners = parser.getScanners(); TagScanner scanner = (TagScanner) scanners.get("FORM"); assertNull("shouldnt have found scanner", scanner); } /** * See bug #728241 OutOfMemory error/ Infinite loop */ public void testOutOfMemory() throws Exception { createParser( "\n" + "\n" + "\n" + "\n" + " \n" + "\n" + "
\"f'sblah\n" + "
\n" + "\n"); for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { Node node = e.nextNode(); } } /** * See bug #729368 Embedded quote and split tag */ public void testEmbeddedQuoteSplit() throws Exception { createParser( "\n" + "\n" + "\n" + "\n" + "
\"f'sblah
\n" + ""); int i = 0; for (NodeIterator e = parser.elements(); e.hasMoreNodes();) { Node node = e.nextNode(); if (7 == i) { assertTrue("not a tag", node instanceof Tag); assertTrue( "ALT attribute incorrect", ((Tag) node).getAttribute("ALT").equals("f's b")); } i++; } assertEquals("Expected nodes", 16, i); } }
... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.