alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

What this is

This file is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Other links

The source code

// $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/scanners/CompositeTagScanner.java,v 1.2 2004/02/10 13:41:09 woolfel Exp $
/*
 * ====================================================================
 * Copyright 2002-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */

// The developers of JMeter and Apache are grateful to the developers
// of HTMLParser for giving Apache Software Foundation a non-exclusive
// license. The performance benefits of HTMLParser are clear and the
// users of JMeter will benefit from the hard work of the HTMLParser
// team. For detailed information about HTMLParser, the project is
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
//
// HTMLParser was originally created by Somik Raha in 2000. Since then
// a healthy community of users has formed and helped refine the
// design so that it is able to tackle the difficult task of parsing
// dirty HTML. Derrick Oswald is the current lead developer and was kind
// enough to assist JMeter.

package org.htmlparser.scanners;

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeReader;
import org.htmlparser.parserHelper.CompositeTagScannerHelper;
import org.htmlparser.tags.EndTag;
import org.htmlparser.tags.Tag;
import org.htmlparser.tags.data.CompositeTagData;
import org.htmlparser.tags.data.TagData;
import org.htmlparser.util.ParserException;

/**
 * To create your own scanner that can hold children, create a subclass of this class.
 * The composite tag scanner can be configured with:
 * <ul>
 * <li>Tags which will trigger a match</li>
 * <li>Tags which, when encountered before a legal end tag, should force a correction</li>
 * <li>Preventing more tags of its own type from appearing as children</li>
 * </ul>
 * Here are examples of each:<BR>
 * <B>Tags which will trigger a match</B><BR>
 * If we wish to recognize &lt;mytag&gt;,
 * <pre>
 * MyScanner extends CompositeTagScanner {
 *   private static final String [] MATCH_IDS = { "MYTAG" };
 *   MyScanner() {
 *     super(MATCH_IDS);
 *   }
 *   ...
 * }
 * </pre>
 * <B>Tags which force correction</B><BR>
 * If we wish to insert end tags if we get a &lt;/BODY&gt; or &lt;/HTML&gt;
 * without receiving &lt;/mytag&gt;,
 * <pre>
 * MyScanner extends CompositeTagScanner {
 *   private static final String [] MATCH_IDS = { "MYTAG" };
 *   private static final String [] ENDERS = {};
 *   private static final String [] END_TAG_ENDERS = { "BODY", "HTML" };
 *   MyScanner() {
 *     // the constructors that accept end tag enders also take a filter string
 *     super("", MATCH_IDS, ENDERS, END_TAG_ENDERS, true);
 *   }
 *   ...
 * }
 * </pre>
 * <B>Preventing children of same type</B><BR>
 * This is useful when you know that a certain tag can never hold children of its own type.
 * e.g. &lt;FORM&gt; can never have more form tags within it. If it does, it is an error
 * and should be corrected. The default behavior is to allow nesting.
 * <pre>
 * MyScanner extends CompositeTagScanner {
 *   private static final String [] MATCH_IDS = { "FORM" };
 *   private static final String [] ENDERS = {};
 *   private static final String [] END_TAG_ENDERS = { "BODY", "HTML" };
 *   MyScanner() {
 *     super("", MATCH_IDS, ENDERS, END_TAG_ENDERS, false);
 *   }
 *   ...
 * }
 * </pre>
 * Inside the scanner, use createTag() to specify what tag needs to be created.
 */
public abstract class CompositeTagScanner extends TagScanner {

    /** The tag names recognized by this scanner. */
    protected String[] nameOfTagToMatch;

    /** Whether a tag of the same name may appear as a child of this tag. */
    private final boolean allowSelfChildren;

    /** Names of non-end tags that signal a missing closing end tag. */
    private final Set tagEnderSet;

    /** Names of end tags (without the leading '/') that signal a missing closing end tag. */
    private final Set endTagEnderSet;

    /** Whether the scan helper is asked to honour quotes; passed through unchanged. */
    private final boolean balance_quotes;

    /**
     * Creates a scanner with an empty filter, no tag enders, no end tag
     * enders, and self children allowed.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     */
    public CompositeTagScanner(String[] nameOfTagToMatch) {
        this(nameOfTagToMatch, new String[] {});
    }

    /**
     * Creates a scanner with an empty filter, no end tag enders, and self
     * children allowed.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     * @param tagEnders The non-endtag tag names which signal that no closing
     * end tag was found.
     */
    public CompositeTagScanner(String[] nameOfTagToMatch, String[] tagEnders) {
        this("", nameOfTagToMatch, tagEnders);
    }

    /**
     * Creates a scanner with an empty filter and no end tag enders.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     * @param tagEnders The non-endtag tag names which signal that no closing
     * end tag was found.
     * @param allowSelfChildren If true a tag of the same name is
     * allowed within this tag.
     */
    public CompositeTagScanner(
        String[] nameOfTagToMatch,
        String[] tagEnders,
        boolean allowSelfChildren) {
        this("", nameOfTagToMatch, tagEnders, allowSelfChildren);
    }

    /**
     * Creates a scanner with no tag enders, no end tag enders, and self
     * children allowed.
     * @param filter The filter string handed to the superclass constructor.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     */
    public CompositeTagScanner(String filter, String[] nameOfTagToMatch) {
        this(filter, nameOfTagToMatch, new String[] {}, true);
    }

    /**
     * Creates a scanner with no end tag enders and self children allowed.
     * @param filter The filter string handed to the superclass constructor.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     * @param tagEnders The non-endtag tag names which signal that no closing
     * end tag was found.
     */
    public CompositeTagScanner(
        String filter,
        String[] nameOfTagToMatch,
        String[] tagEnders) {
        this(filter, nameOfTagToMatch, tagEnders, true);
    }

    /**
     * Creates a scanner with no end tag enders.
     * @param filter The filter string handed to the superclass constructor.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     * @param tagEnders The non-endtag tag names which signal that no closing
     * end tag was found.
     * @param allowSelfChildren If true a tag of the same name is
     * allowed within this tag.
     */
    public CompositeTagScanner(
        String filter,
        String[] nameOfTagToMatch,
        String[] tagEnders,
        boolean allowSelfChildren) {
        this(
            filter,
            nameOfTagToMatch,
            tagEnders,
            new String[] {},
            allowSelfChildren);
    }

    /**
     * Creates a scanner that does not balance quotes.
     * @param filter The filter string handed to the superclass constructor.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     * @param tagEnders The non-endtag tag names which signal that no closing
     * end tag was found.
     * @param endTagEnders The endtag names which signal that no closing end
     * tag was found. These items are not prefixed by a '/'.
     * @param allowSelfChildren If true a tag of the same name is
     * allowed within this tag.
     */
    public CompositeTagScanner(
        String filter,
        String[] nameOfTagToMatch,
        String[] tagEnders,
        String[] endTagEnders,
        boolean allowSelfChildren) {
        this(
            filter,
            nameOfTagToMatch,
            tagEnders,
            endTagEnders,
            allowSelfChildren,
            false);
    }

    /**
     * Constructor specifying all member fields.
     * @param filter A string that is used to match which tags are to be allowed
     * to pass through. This can be useful when one wishes to dynamically filter
     * out all tags except one type which may be programmed later than the parser.
     * @param nameOfTagToMatch The tag names recognized by this scanner.
     * @param tagEnders The non-endtag tag names which signal that no closing
     * end tag was found. For example, encountering &lt;FORM&gt; while
     * scanning a &lt;A&gt; link tag would mean that no &lt;/A&gt; was found
     * and needs to be corrected. A <code>null</code> array is treated as empty.
     * @param endTagEnders The endtag names which signal that no closing end
     * tag was found. For example, encountering &lt;/HTML&gt; while
     * scanning a &lt;BODY&gt; tag would mean that no &lt;/BODY&gt; was found
     * and needs to be corrected. These items are not prefixed by a '/'.
     * A <code>null</code> array is treated as empty.
     * @param allowSelfChildren If true a tag of the same name is
     * allowed within this tag. Used to determine when an endtag is missing.
     * @param balance_quotes <code>true</code> if scanning string nodes needs to
     * honour quotes. For example, ScriptScanner defines this <code>true</code>
     * so that text within &lt;SCRIPT&gt;&lt;/SCRIPT&gt; ignores tag-like text
     * within quotes.
     */
    public CompositeTagScanner(
        String filter,
        String[] nameOfTagToMatch,
        String[] tagEnders,
        String[] endTagEnders,
        boolean allowSelfChildren,
        boolean balance_quotes) {
        super(filter);
        this.nameOfTagToMatch = nameOfTagToMatch;
        this.allowSelfChildren = allowSelfChildren;
        this.balance_quotes = balance_quotes;
        this.tagEnderSet = toSet(tagEnders);
        this.endTagEnderSet = toSet(endTagEnders);
    }

    /**
     * Copies the given names into a new set; a null array yields an empty set.
     */
    private static Set toSet(String[] names) {
        Set result = new HashSet();
        if (names != null) {
            for (int i = 0; i < names.length; i++) {
                result.add(names[i]);
            }
        }
        return result;
    }

    /**
     * Scans the composite tag by handing this scanner and all arguments,
     * together with the balance_quotes setting, to a new
     * CompositeTagScannerHelper and returning the result of its scan.
     * @param tag The tag passed on to the helper.
     * @param url The url passed on to the helper.
     * @param reader The reader passed on to the helper.
     * @param currLine The current line passed on to the helper.
     * @return The tag returned by the helper's scan.
     * @throws ParserException as declared by this method; presumably raised
     * by the helper.
     */
    public Tag scan(Tag tag, String url, NodeReader reader, String currLine)
        throws ParserException {
        CompositeTagScannerHelper helper =
            new CompositeTagScannerHelper(
                this,
                tag,
                url,
                reader,
                currLine,
                balance_quotes);
        return helper.scan();
    }

    /**
     * Override this method if you wish to create any data structures or do anything
     * before the start of the scan. This is just after a tag has triggered the scanner
     * but before the scanner begins its processing.
     */
    public void beforeScanningStarts() {
    }

    /**
     * This method is called every time a child to the composite is found. It is useful
     * when we need to store special children separately. Though, all children are
     * collected anyway into a node list.
     * @param node The child node that was found.
     */
    public void childNodeEncountered(Node node) {
    }

    /**
     * You must override this method to create the tag of your choice upon successful
     * parsing. Data required for construction of your tag can be found within tagData
     * and compositeTagData.
     * @param tagData Data for the tag being created.
     * @param compositeTagData Composite-specific data for the tag being created.
     * @return The created tag.
     * @throws ParserException if the tag cannot be created.
     */
    public abstract Tag createTag(
        TagData tagData,
        CompositeTagData compositeTagData)
        throws ParserException;

    /**
     * Tells whether encountering the given tag should end the composite tag.
     * An EndTag is looked up among the end tag enders, any other tag among
     * the tag enders.
     * @param tag The tag that was encountered.
     * @return <code>true</code> if the tag's name is in the relevant ender set.
     */
    public final boolean isTagToBeEndedFor(Tag tag) {
        boolean isEndTag = tag instanceof EndTag;
        String tagName = tag.getTagName();
        Set enders = isEndTag ? endTagEnderSet : tagEnderSet;
        return enders.contains(tagName);
    }

    /**
     * Returns whether a tag of the same name is allowed within this tag.
     * @return The allowSelfChildren value given at construction.
     */
    public final boolean isAllowSelfChildren() {
        return allowSelfChildren;
    }

    /**
     * Override this method to implement scanner logic that determines if the current
     * scanner is to be allowed. This is useful when there are rules which don't allow
     * recursive tags of the same type.
     * @see BulletScanner
     * @return boolean true/false; this default implementation always returns false.
     */
    public boolean shouldCreateEndTagAndExit() {
        return false;
    }
}
... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.