|
What this is
Other links
The source code
// $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/scanners/CompositeTagScanner.java,v 1.2 2004/02/10 13:41:09 woolfel Exp $
/*
 * ====================================================================
 * Copyright 2002-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// The developers of JMeter and Apache are grateful to the developers
// of HTMLParser for giving Apache Software Foundation a non-exclusive
// license. The performance benefits of HTMLParser are clear and the
// users of JMeter will benefit from the hard work of the HTMLParser
// team. For detailed information about HTMLParser, the project is
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
//
// HTMLParser was originally created by Somik Raha in 2000. Since then
// a healthy community of users has formed and helped refine the
// design so that it is able to tackle the difficult task of parsing
// dirty HTML. Derrick Oswald is the current lead developer and was kind
// enough to assist JMeter.
package org.htmlparser.scanners;

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeReader;
import org.htmlparser.parserHelper.CompositeTagScannerHelper;
import org.htmlparser.tags.EndTag;
import org.htmlparser.tags.Tag;
import org.htmlparser.tags.data.CompositeTagData;
import org.htmlparser.tags.data.TagData;
import org.htmlparser.util.ParserException;

/**
 * To create your own scanner that can hold children, create a subclass of this class.
 * The composite tag scanner can be configured with:
 * <ul>
 * <li><b>Tags which will trigger a match</b><br>
 * If we wish to recognize &lt;mytag&gt;,
 * <pre>
 * MyScanner extends CompositeTagScanner {
 *   private static final String [] MATCH_IDS = { "MYTAG" };
 *   MyScanner() {
 *     super(MATCH_IDS);
 *   }
 *   ...
 * }
 * </pre>
 * </li>
 * <li><b>Tags which force correction</b><br>
 * If we wish to insert end tags if we get a &lt;/BODY&gt; or &lt;/HTML&gt;
 * without receiving &lt;/mytag&gt;
 * <pre>
 * MyScanner extends CompositeTagScanner {
 *   private static final String [] MATCH_IDS = { "MYTAG" };
 *   private static final String [] ENDERS = {};
 *   private static final String [] END_TAG_ENDERS = { "BODY", "HTML" };
 *   MyScanner() {
 *     super("", MATCH_IDS, ENDERS, END_TAG_ENDERS, true);
 *   }
 *   ...
 * }
 * </pre>
 * </li>
 * <li><b>Preventing children of same type</b><br>
 * This is useful when you know that a certain tag can never hold children of its own type.
 * e.g. &lt;FORM&gt; can never have more form tags within it. If it does, it is an error and
 * should be corrected. The default behavior is to allow nesting.
 * <pre>
 * MyScanner extends CompositeTagScanner {
 *   private static final String [] MATCH_IDS = { "FORM" };
 *   private static final String [] ENDERS = {};
 *   private static final String [] END_TAG_ENDERS = { "BODY", "HTML" };
 *   MyScanner() {
 *     super("", MATCH_IDS, ENDERS, END_TAG_ENDERS, false);
 *   }
 *   ...
 * }
 * </pre>
 * </li>
 * </ul>
 * Inside the scanner, use createTag() to specify what tag needs to be created.
 */
public abstract class CompositeTagScanner extends TagScanner
{
    /** Shared zero-length array used when a constructor is given no ender names. */
    private static final String[] NO_TAGS = new String[0];

    /** The tag names recognized by this scanner. */
    protected String[] nameOfTagToMatch;

    /** Whether a tag of the same name may appear inside this tag. */
    private final boolean allowSelfChildren;

    /** Names of non-end tags that signal a missing closing tag. */
    private final Set tagEnderSet;

    /** Names of end tags (without the leading '/') that signal a missing closing tag. */
    private final Set endTagEnderSet;

    /** Passed through to the scanner helper; see the full constructor. */
    private final boolean balance_quotes;

    /**
     * Creates a scanner with an empty filter and no tag enders.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     */
    public CompositeTagScanner(String[] nameOfTagToMatch)
    {
        this(nameOfTagToMatch, NO_TAGS);
    }

    /**
     * Creates a scanner with an empty filter.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     * @param tagEnders The non-endtag tag names which signal a missing end tag.
     */
    public CompositeTagScanner(String[] nameOfTagToMatch, String[] tagEnders)
    {
        this("", nameOfTagToMatch, tagEnders);
    }

    /**
     * Creates a scanner with an empty filter.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     * @param tagEnders The non-endtag tag names which signal a missing end tag.
     * @param allowSelfChildren If true a tag of the same name is allowed within this tag.
     */
    public CompositeTagScanner(
        String[] nameOfTagToMatch,
        String[] tagEnders,
        boolean allowSelfChildren)
    {
        this("", nameOfTagToMatch, tagEnders, allowSelfChildren);
    }

    /**
     * Creates a scanner with no tag enders that allows children of its own type.
     * @param filter The filter string handed to the superclass.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     */
    public CompositeTagScanner(String filter, String[] nameOfTagToMatch)
    {
        this(filter, nameOfTagToMatch, NO_TAGS, true);
    }

    /**
     * Creates a scanner that allows children of its own type.
     * @param filter The filter string handed to the superclass.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     * @param tagEnders The non-endtag tag names which signal a missing end tag.
     */
    public CompositeTagScanner(
        String filter,
        String[] nameOfTagToMatch,
        String[] tagEnders)
    {
        this(filter, nameOfTagToMatch, tagEnders, true);
    }

    /**
     * Creates a scanner with no end-tag enders.
     * @param filter The filter string handed to the superclass.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     * @param tagEnders The non-endtag tag names which signal a missing end tag.
     * @param allowSelfChildren If true a tag of the same name is allowed within this tag.
     */
    public CompositeTagScanner(
        String filter,
        String[] nameOfTagToMatch,
        String[] tagEnders,
        boolean allowSelfChildren)
    {
        this(filter, nameOfTagToMatch, tagEnders, NO_TAGS, allowSelfChildren);
    }

    /**
     * Creates a scanner that does not balance quotes.
     * @param filter The filter string handed to the superclass.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     * @param tagEnders The non-endtag tag names which signal a missing end tag.
     * @param endTagEnders The endtag names which signal a missing end tag.
     * @param allowSelfChildren If true a tag of the same name is allowed within this tag.
     */
    public CompositeTagScanner(
        String filter,
        String[] nameOfTagToMatch,
        String[] tagEnders,
        String[] endTagEnders,
        boolean allowSelfChildren)
    {
        this(
            filter,
            nameOfTagToMatch,
            tagEnders,
            endTagEnders,
            allowSelfChildren,
            false);
    }

    /**
     * Constructor specifying all member fields.
     * @param filter A string that is used to match which tags are to be allowed
     * to pass through. This can be useful when one wishes to dynamically filter
     * out all tags except one type which may be programmed later than the parser.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     * @param tagEnders The non-endtag tag names which signal that no closing
     * end tag was found. For example, encountering &lt;FORM&gt; while
     * scanning a &lt;A&gt; link tag would mean that no &lt;/A&gt; was found
     * and needs to be corrected. A null array is treated as empty.
     * @param endTagEnders The endtag names which signal that no closing end
     * tag was found. For example, encountering &lt;/HTML&gt; while
     * scanning a &lt;BODY&gt; tag would mean that no &lt;/BODY&gt; was found
     * and needs to be corrected. These items are not prefixed by a '/'.
     * A null array is treated as empty.
     * @param allowSelfChildren If true a tag of the same name is
     * allowed within this tag. Used to determine when an endtag is missing.
     * @param balance_quotes <code>true</code> if scanning string nodes needs to
     * honour quotes. For example, ScriptScanner defines this <code>true</code>
     * so that text within &lt;SCRIPT&gt;&lt;/SCRIPT&gt; ignores tag-like text
     * within quotes.
     */
    public CompositeTagScanner(
        String filter,
        String[] nameOfTagToMatch,
        String[] tagEnders,
        String[] endTagEnders,
        boolean allowSelfChildren,
        boolean balance_quotes)
    {
        super(filter);
        this.nameOfTagToMatch = nameOfTagToMatch;
        this.allowSelfChildren = allowSelfChildren;
        this.balance_quotes = balance_quotes;
        this.tagEnderSet = toSet(tagEnders);
        this.endTagEnderSet = toSet(endTagEnders);
    }

    /**
     * Copies an array of tag names into a fresh set.
     * Raw collection types are used on purpose: this file predates generics.
     * @param names The names to copy; null is treated as an empty array.
     * @return A new mutable set holding every element of <code>names</code>.
     */
    private static Set toSet(String[] names)
    {
        Set result = new HashSet();
        if (names != null)
        {
            for (int i = 0; i < names.length; i++)
            {
                result.add(names[i]);
            }
        }
        return result;
    }

    /**
     * Scans the given tag by handing all arguments, together with this scanner
     * and its quote-balancing flag, to a new {@link CompositeTagScannerHelper}.
     * @param tag The tag that triggered this scanner.
     * @param url Passed unchanged to the helper.
     * @param reader Passed unchanged to the helper.
     * @param currLine Passed unchanged to the helper.
     * @return Whatever the helper's <code>scan()</code> returns.
     * @throws ParserException Propagated from the helper.
     */
    public Tag scan(Tag tag, String url, NodeReader reader, String currLine)
        throws ParserException
    {
        CompositeTagScannerHelper helper =
            new CompositeTagScannerHelper(
                this,
                tag,
                url,
                reader,
                currLine,
                balance_quotes);
        return helper.scan();
    }

    /**
     * Override this method if you wish to create any data structures or do anything
     * before the start of the scan. This is just after a tag has triggered the scanner
     * but before the scanner begins its processing. The default implementation does nothing.
     */
    public void beforeScanningStarts()
    {
    }

    /**
     * This method is called every time a child of the composite is found. It is useful
     * when we need to store special children separately, though all children are collected
     * into a node list anyway. The default implementation does nothing.
     * @param node The child node that was found.
     */
    public void childNodeEncountered(Node node)
    {
    }

    /**
     * You must override this method to create the tag of your choice upon successful
     * parsing. Data required for construction of your tag can be found within tagData
     * and compositeTagData.
     * @param tagData Data describing the tag itself.
     * @param compositeTagData Data describing the composite (the tag and its children).
     * @return The tag created by the subclass.
     * @throws ParserException If the subclass cannot create the tag.
     */
    public abstract Tag createTag(
        TagData tagData,
        CompositeTagData compositeTagData)
        throws ParserException;

    /**
     * Tells whether the given tag is one of the configured enders, i.e. a tag whose
     * appearance signals that the closing tag of the tag being scanned is missing.
     * End tags are looked up among the end-tag enders, all other tags among the tag enders.
     * @param tag The tag to test.
     * @return <code>true</code> if the tag's name is in the ender set matching its kind.
     */
    public final boolean isTagToBeEndedFor(Tag tag)
    {
        Set enders = (tag instanceof EndTag) ? endTagEnderSet : tagEnderSet;
        return enders.contains(tag.getTagName());
    }

    /**
     * Tells whether a tag of the same name is allowed within this tag.
     * @return The value of the <code>allowSelfChildren</code> constructor argument.
     */
    public final boolean isAllowSelfChildren()
    {
        return allowSelfChildren;
    }

    /**
     * Override this method to implement scanner logic that determines if the current
     * scanner is to be allowed. This is useful when there are rules which don't allow
     * recursive tags of the same type. The default implementation returns
     * <code>false</code>.
     * @see BulletScanner
     * @return boolean true/false
     */
    public boolean shouldCreateEndTagAndExit()
    {
        return false;
    }
}
|
... this post is sponsored by my books ... | |
#1 New Release! |
FP Best Seller |
Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.
A percentage of advertising revenue from
pages under the /java/jwarehouse
URI on this website is
paid back to open source projects.