Java example - MyTokenizer.java

What this is

This file is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.
The source code

// $Id: MyTokenizer.java,v 1.11 2004/08/25 19:01:44 mvw Exp $
// Copyright (c) 1996-2002 The Regents of the University of California. All
// Rights Reserved. Permission to use, copy, modify, and distribute this
// software and its documentation without fee, and without a written
// agreement is hereby granted, provided that the above copyright notice
// and this paragraph appear in all copies.  This software program and
// documentation are copyrighted by The Regents of the University of
// California. The software program and documentation are supplied "AS
// IS", without any accompanying services from The Regents. The Regents
// does not warrant that the operation of the program will be
// uninterrupted or error-free. The end-user understands that the program
// was developed for research purposes and is advised not to rely
// exclusively on the program for any reason.  IN NO EVENT SHALL THE
// UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
// SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS,
// ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
// THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE
// PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF
// CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT,
// UPDATES, ENHANCEMENTS, OR MODIFICATIONS.

package org.argouml.util;

import java.util.*;

/**
 * Internal class for managing the delimiters in MyTokenizer. It's rather
 * similar to CustomSeparator, but faster for short constant strings.
 *
 * <p>Matching is bit-parallel: after each call to {@link #addChar(char)},
 * bit <code>i</code> of the internal pattern is set exactly when the last
 * <code>i + 1</code> characters fed since the latest {@link #reset()} equal
 * the first <code>i + 1</code> characters of the delimiter. The pattern is
 * kept in an <code>int</code>, which is why a delimiter may be at most 32
 * characters long.
 *
 * <p>Instances can be chained into a singly linked list through
 * {@link #setNext(TokenSep)} and {@link #getNext()}.
 */
class TokenSep {
    /** The next separator in the linked list, or null at the end. */
    private TokenSep next = null;

    /** The delimiter text this separator recognizes. */
    private final String theString;

    /** Cached length of the delimiter text. */
    private final int length;

    /** Bit set of the delimiter prefixes matched by the latest input. */
    private int pattern;

    /**
     * Constructs a TokenSep that will match the String given in str.
     *
     * @param str The delimiter string.
     * @throws IllegalArgumentException if str is more than 32 characters
     *         long.
     */
    public TokenSep(String str) {
        theString = str;
        length = str.length();
        if (length > 32) {
            throw new IllegalArgumentException("TokenSep " + str
                        + " is " + length + " (> 32) chars long");
        }
        pattern = 0;
    }

    /**
     * Called by MyTokenizer when a new character is processed in the
     * sequence. Returns true if we have found the delimiter.
     *
     * <p>An empty delimiter never matches.
     *
     * @param c The character that was just read.
     * @return true if the characters fed so far end with the delimiter.
     */
    public boolean addChar(char c) {
        if (length == 0) {
            // Without this guard the final test below would look at
            // 1 << -1, which Java evaluates as 1 << 31, and an empty
            // delimiter would falsely "match" after 32 characters.
            return false;
        }

        // Every prefix matched so far may grow by one character, and a
        // new one-character prefix may start at this character.
        pattern <<= 1;
        pattern |= 1;

        // Drop every candidate prefix whose newest character is not c.
        for (int i = 0; i < length; i++) {
            if (theString.charAt(i) != c) {
                pattern &= ~(1 << i);
            }
        }

        // The highest relevant bit tells whether the whole delimiter matched.
        return (pattern & (1 << (length - 1))) != 0;
    }

    /**
     * Called by MyTokenizer before starting scanning for a new token.
     * Forgets all partial matches.
     */
    public void reset() {
        pattern = 0;
    }

    /**
     * Gets the length of this token.
     *
     * @return The number of characters in the delimiter string.
     */
    public int length() {
        return length;
    }

    /**
     * Gets this token.
     *
     * @return The delimiter string given to the constructor.
     */
    public String getString() {
        return theString;
    }

    /**
     * @param n The next to set.
     */
    public void setNext(TokenSep n) {
        this.next = n;
    }

    /**
     * @return Returns the next.
     */
    public TokenSep getNext() {
        return next;
    }
}

/**
 * A descendent of CustomSeparator that recognizes tokens on one of two forms:
 * <ul>
 * <li>'chr'.....'esc' 'chr'.....'chr'</li>
 * <li>'lchr'...'lchr'...'rchr'...'esc' 'rchr'....'rchr'</li>
 * </ul>
 *
 * The first form is suited for quoted strings, like
 * <code>"...\"...."</code> or <code>'...\'...'</code>.
 *
 * <p>The second form is suited for expressions, like
 * <code>(a+(b*c)-15*eq(a, b))</code>.
 *
 * <p>This is in fact the class currently used for the public separators in
 * MyTokenizer, except PAREN_EXPR_STRING_SEPARATOR and LINE_SEPARATOR.
 */
class QuotedStringSeparator extends CustomSeparator {
    /** The escape character; 0 means that nothing is treated as escape. */
    private final char escChr;

    /** The nesting (left) delimiter; 0 means that nesting is not tracked. */
    private final char startChr;

    /** The character that closes one nesting level. */
    private final char stopChr;

    /** True when the previous character was an unescaped escape character. */
    private boolean esced;

    /** Number of characters consumed by endChar for the current token. */
    private int tokLen;

    /** Number of closing delimiters still needed to end the token. */
    private int level;

    /**
     * Creates a separator of the first form (see above) where
     * 'chr' = q and 'esc' = esc.
     *
     * @param q The delimiter character.
     * @param esc The escape character.
     */
    public QuotedStringSeparator(char q, char esc) {
        super(q);

        esced = false;
        escChr = esc;
        startChr = 0;
        stopChr = q;
        tokLen = 0;
        level = 1;
    }

    /**
     * Creates a separator of the second form (see above) where
     * 'lchr' = sq, 'rchr' = eq and 'esc' = esc.
     *
     * @param sq The left delimiter character.
     * @param eq The right delimiter character.
     * @param esc The escape character.
     */
    public QuotedStringSeparator(char sq, char eq, char esc) {
        super(sq);

        esced = false;
        escChr = esc;
        startChr = sq;
        stopChr = eq;
        tokLen = 0;
        level = 1;
    }

    /**
     * Prepares this separator for scanning a new token by clearing all
     * per-token state.
     */
    public void reset() {
        super.reset();
        // A token cut short by the end of the input right after an escape
        // character leaves esced set. Clear it so the pending escape does
        // not swallow the first character of the next token.
        esced = false;
        tokLen = 0;
        level = 1;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to return the entire length of the token.
     */
    public int tokenLength() {
        return super.tokenLength() + tokLen;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to return true.
     *
     * @return true
     */
    public boolean hasFreePart() {
        return true;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to find the end of the token.
     *
     * @param c The next character following the start of the token.
     * @return true when c closes the outermost level of the token.
     */
    public boolean endChar(char c) {
        tokLen++;

        // The character right after an escape is part of the token and
        // has no special meaning.
        if (esced) {
            esced = false;
            return false;
        }
        if (escChr != 0 && c == escChr) {
            esced = true;
            return false;
        }
        // Both tests are evaluated for every character: an opening
        // delimiter deepens the nesting, a closing one reduces it.
        if (startChr != 0 && c == startChr) {
            level++;
        }
        if (c == stopChr) {
            level--;
        }
        return level <= 0;
    }
}

/**
 * A descendent of CustomSeparator that recognizes tokens on the form:
 *
 * <pre>( " \" ) " ' \' ) ' )</pre>
 *
 * <p>That is, an expression inside parentheses with proper consideration
 * for quoted strings inside the expression.
 */
class ExprSeparatorWithStrings extends CustomSeparator {
    /** True while scanning inside a single quoted string. */
    private boolean isSQuot;

    /** True while scanning inside a double quoted string. */
    private boolean isDQuot;

    /** True when the previous character inside a string was a backslash. */
    private boolean isEsc;

    /** Number of right parentheses still needed to end the token. */
    private int tokLevel;

    /** Number of characters consumed by endChar for the current token. */
    private int tokLen;

    /**
     * The constructor. No choices available.
     */
    public ExprSeparatorWithStrings() {
        super('(');

        tokLen = 0;
        tokLevel = 1;
        isDQuot = false;
        isSQuot = false;
        isEsc = false;
    }

    /**
     * Clears all per-token state before a new token is scanned.
     */
    public void reset() {
        super.reset();

        tokLen = 0;
        tokLevel = 1;
        isDQuot = false;
        isSQuot = false;
        isEsc = false;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to return the entire length of the token.
     */
    public int tokenLength() {
        final int startLength = super.tokenLength();
        return startLength + tokLen;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to return true.
     *
     * @return true
     */
    public boolean hasFreePart() {
        return true;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to find the end of the token.
     */
    public boolean endChar(char c) {
        tokLen++;

        // Inside a quoted string nothing can end the token; only the
        // escape state and the matching closing quote are tracked.
        if (isSQuot || isDQuot) {
            if (isEsc) {
                isEsc = false;
            } else if (c == '\\') {
                isEsc = true;
            } else if (isSQuot) {
                if (c == '\'') {
                    isSQuot = false;
                }
            } else if (c == '\"') {
                isDQuot = false;
            }
            return false;
        }

        // Outside strings: quotes open a string, parentheses change depth.
        switch (c) {
        case '\'':
            isSQuot = true;
            break;
        case '\"':
            isDQuot = true;
            break;
        case '(':
            tokLevel++;
            break;
        case ')':
            tokLevel--;
            break;
        default:
            break;
        }
        return tokLevel <= 0;
    }
}

/**
 * A descendent of CustomSeparator that recognizes "the three line ends":
 * <ul>
 * <li>UNIX: &lt;lf&gt;</li>
 * <li>DOS: &lt;cr&gt; &lt;lf&gt;</li>
 * <li>MAC: &lt;cr&gt;</li>
 * </ul>
 *
 * This is in fact the class currently used for LINE_SEPARATOR in
 * MyTokenizer.
 */
class LineSeparator extends CustomSeparator {
    /** True once a carriage return has started the current line end. */
    private boolean hasCr;

    /** True once a line feed has been seen in the current line end. */
    private boolean hasLf;

    /** True when endChar was handed a character other than a line feed. */
    private boolean hasPeeked;

    /**
     * Creates a LineSeparator.
     */
    public LineSeparator() {
        super();
        hasPeeked = false;
        hasLf = false;
        hasCr = false;
    }

    /**
     * Clears all per-token state before a new token is scanned.
     */
    public void reset() {
        super.reset();
        hasPeeked = false;
        hasLf = false;
        hasCr = false;
    }

    /**
     * {@inheritDoc}
     *
     * @return 2 when both a carriage return and a line feed were seen,
     *         otherwise 1.
     */
    public int tokenLength() {
        if (hasCr && hasLf) {
            return 2;
        }
        return 1;
    }

    /**
     * {@inheritDoc}
     *
     * @return 1 when endChar saw a character that was not a line feed,
     *         otherwise 0.
     */
    public int getPeekCount() {
        if (hasPeeked) {
            return 1;
        }
        return 0;
    }

    /**
     * {@inheritDoc}
     *
     * @return true as long as no line feed has been seen.
     */
    public boolean hasFreePart() {
        return !hasLf;
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to find the start of a line-end.
     */
    public boolean addChar(char c) {
        switch (c) {
        case '\n':
            hasLf = true;
            return true;
        case '\r':
            hasCr = true;
            return true;
        default:
            return false;
        }
    }

    /**
     * {@inheritDoc}
     *
     * Overridden to find the end of a line-end.
     */
    public boolean endChar(char c) {
        if (c != '\n') {
            // Not part of the line end; record that one character too
            // many was looked at.
            hasPeeked = true;
            return true;
        }

        hasLf = true;
        return true;
    }
}

/**
 * Class for dividing a String into any number of parts. Each part will be a
 * substring of the original String. The first part will at least contain the
 * first character in the string. All following parts will at least contain
 * the first character in the String not covered by any previous part.
 *
 * <p>The delim parameter to the constructors is a comma separated list of
 * tokens that should be recognized by the tokenizer. These tokens will be
 * returned by the tokenizer as tokens, and any arbitrary text between them
 * will also be returned as tokens. Since the comma has special meaning in
 * this string, it can be escaped with \ to only mean itself (like in "\\,").
 * For technical reasons it is not possible for any token in this list to be
 * more than 32 characters long. Empty entries in the list, including the
 * one produced by a lone \ at the very end, are ignored.
 *
 * <p>In addition to the delim parameter it is also possible to use custom
 * separators that allow any string that can be generated by the limited
 * version of a Turing machine that your computer is, to be used as a
 * delimiter.
 *
 * <p>There are some custom separators provided that you can use to get
 * things like strings in one token. These cannot be used simultaneously by
 * several tokenizers, ie they are not thread safe.
 *
 * <p>The tokenizer works in a kind of greedy way. When the first separator
 * token from delim is matched or any CustomSeparator returns true from
 * addChar, then it is satisfied it has found a token and does NOT check if
 * it could have found a longer token. Eg: if you have this delim string
 * "&lt;,&lt;&lt;", then "&lt;&lt;" will never be found.
 *
 * <p>Example:<pre>
 * MyTokenizer tzer = new MyTokenizer("Hello, how are you?", " ,\\,");
 * while (tzer.hasMoreTokens())
 *   _cat.info("\"" + tzer.nextToken() + "\"");
 * </pre>
 *
 * <p>Which would yield the following output:<pre>
 *   "Hello"
 *   ","
 *   " "
 *   "how"
 *   " "
 *   "are"
 *   " "
 *   "you?"
 * </pre>
 *
 * @author Michael Stockman
 * @since 0.11.2
 * @see CustomSeparator
 */
public class MyTokenizer implements Enumeration
{
    /** A custom separator for quoted strings enclosed in single quotes
     *  and using \ as escape character. There may not be an end quote
     *  if the tokenizer reaches the end of the String. */
    public static final CustomSeparator SINGLE_QUOTED_SEPARATOR =
        new QuotedStringSeparator('\'', '\\');

    /** A custom separator for quoted strings enclosed in double quotes
     *  and using \ as escape character. There may not be an end quote
     *  if the tokenizer reaches the end of the String. */
    public static final CustomSeparator DOUBLE_QUOTED_SEPARATOR =
        new QuotedStringSeparator('\"', '\\');

    /** A custom separator for expressions enclosed in parentheses and
     *  matching lparams with rparams. There may not be proper matching
     *  if the tokenizer reaches the end of the String. Do not use this
     *  together with PAREN_EXPR_STRING_SEPARATOR. */
    public static final CustomSeparator PAREN_EXPR_SEPARATOR =
        new QuotedStringSeparator('(', ')', '\0');

    /** A custom separator for expressions enclosed in parentheses and
     *  matching lparams with rparams. There may not be proper matching
     *  if the tokenizer reaches the end of the String. It also takes
     *  quoted strings (either single or double quotes) in the expression
     *  into consideration, unlike PAREN_EXPR_SEPARATOR. Do not use this
     *  together with PAREN_EXPR_SEPARATOR. */
    public static final CustomSeparator PAREN_EXPR_STRING_SEPARATOR =
        new ExprSeparatorWithStrings();

    /** A custom separator for texts. Singles out the line ends,
     *  and consequently the lines, if they are in either dos, mac
     *  or unix format. */
    public static final CustomSeparator LINE_SEPARATOR =
        new LineSeparator();

    /** Index in source where scanning for the next token starts. */
    private int sIdx;

    /** Index just past the last character of source. */
    private final int eIdx;

    /** Index in source of the token most recently returned. */
    private int tokIdx;

    /** The String being tokenized. */
    private final String source;

    /** Head of the linked list of constant delimiters, or null if none. */
    private final TokenSep delims;

    /** A separator already found but not yet returned, or null. */
    private String savedToken;

    /** Index in source of savedToken. */
    private int savedIdx;

    /** The custom separators in use, or null if there are none. */
    private Vector customSeps;

    /** A token pushed back with putToken, or null. */
    private String putToken;

    /**
     * Constructs a new instance. See above for a description of the
     * delimiter string.
     *
     * @param string    The String to be tokenized.
     * @param delim     The String of delimiters.
     * @throws IllegalArgumentException if a delimiter in delim is more
     *         than 32 characters long.
     */
    public MyTokenizer(String string, String delim) {
        source = string;
        delims = parseDelimString(delim);
        sIdx = 0;
        tokIdx = 0;
        eIdx = string.length();
        savedToken = null;
        customSeps = null;
        putToken = null;
    }

    /**
     * Constructs a new instance. See above for a description of the
     * delimiter string and custom separators.
     *
     * @param string    The String to be tokenized.
     * @param delim     The String of delimiters.
     * @param sep       A custom separator to use.
     * @throws IllegalArgumentException if a delimiter in delim is more
     *         than 32 characters long.
     */
    public MyTokenizer(String string, String delim, CustomSeparator sep) {
        this(string, delim);
        customSeps = new Vector();
        customSeps.add(sep);
    }

    /**
     * Constructs a new instance. See above for a description of the
     * delimiter string and custom separators.
     *
     * @param string    The String to be tokenized.
     * @param delim     The String of delimiters.
     * @param seps      Some container with custom separators to use.
     * @throws IllegalArgumentException if a delimiter in delim is more
     *         than 32 characters long.
     */
    public MyTokenizer(String string, String delim, Collection seps) {
        this(string, delim);
        customSeps = new Vector(seps);
    }

    /**
     * Returns true if there are more tokens left.
     *
     * @return true if another token can be fetched with nextToken.
     */
    public boolean hasMoreTokens() {
        return sIdx < eIdx || savedToken != null
            || putToken != null;
    }

    /**
     * Retrieves the next token.
     *
     * @return The next token.
     * @throws NoSuchElementException if there are no more tokens.
     */
    public String nextToken() {
        CustomSeparator csep;
        TokenSep sep;
        String s = null;
        int i, j;

        // A token pushed back with putToken is returned first. tokIdx is
        // left alone so getTokenIndex keeps reporting the last real token.
        if (putToken != null) {
            s = putToken;
            putToken = null;
            return s;
        }

        // A separator found by the previous call, after the free text
        // that call returned.
        if (savedToken != null) {
            s = savedToken;
            tokIdx = savedIdx;
            savedToken = null;
            return s;
        }

        if (sIdx >= eIdx) {
            throw new NoSuchElementException("No more tokens available");
        }

        // Forget partial matches left over from the previous token.
        for (sep = delims; sep != null; sep = sep.getNext()) {
            sep.reset();
        }

        if (customSeps != null) {
            for (i = 0; i < customSeps.size(); i++) {
                ((CustomSeparator) customSeps.get(i)).reset();
            }
        }

        for (i = sIdx; i < eIdx; i++) {
            char c = source.charAt(i);

            // Custom separators are tried before the constant delimiters.
            for (j = 0; customSeps != null
                    && j < customSeps.size(); j++) {
                csep = (CustomSeparator) customSeps.get(j);

                if (csep.addChar(c)) {
                    break;
                }
            }
            if (customSeps != null && j < customSeps.size()) {
                csep = (CustomSeparator) customSeps.get(j);

                // Feed the separator the following characters until it
                // reports the end of its token or the input runs out.
                while (csep.hasFreePart() && i + 1 < eIdx) {
                    if (csep.endChar(source.charAt(++i))) {
                        break;
                    }
                }
                // Step back over characters the separator looked at but
                // did not make part of its token.
                i -= Math.min(csep.getPeekCount(), i);

                int clen = Math.min(i + 1, source.length());
                int sepLen = csep.tokenLength();

                if (i - sIdx + 1 > sepLen) {
                    // Free text precedes the separator token: return the
                    // text now and keep the separator for the next call.
                    s = source.substring(sIdx, i - sepLen + 1);

                    savedIdx = i - sepLen + 1;
                    savedToken = source.substring(savedIdx, clen);
                } else {
                    s = source.substring(sIdx, clen);
                }

                tokIdx = sIdx;
                sIdx = i + 1;
                break;
            }

            for (sep = delims; sep != null; sep = sep.getNext()) {
                if (sep.addChar(c)) {
                    break;
                }
            }
            if (sep != null) {
                if (i - sIdx + 1 > sep.length()) {
                    // Free text precedes the delimiter: return the text
                    // now and keep the delimiter for the next call.
                    s = source.substring(sIdx,
                                         i - sep.length() + 1);
                    savedIdx = i - sep.length() + 1;
                    savedToken = sep.getString();
                } else {
                    s = sep.getString();
                }
                tokIdx = sIdx;
                sIdx = i + 1;
                break;
            }
        }

        // No separator found: the rest of the source is the last token.
        if (s == null) {
            s = source.substring(sIdx);
            tokIdx = sIdx;
            sIdx = eIdx;
        }

        return s;
    }

    /**
     * This class implements the Enumeration interface. This call maps
     * to nextToken.
     *
     * @return nextToken();
     * @see     #nextToken() nextToken
     */
    public Object nextElement() {
        return nextToken();
    }

    /**
     * This class implements the Enumeration interface. This call maps
     * to hasMoreTokens.
     *
     * @return hasMoreTokens();
     * @see     #hasMoreTokens() hasMoreTokens
     */
    public boolean hasMoreElements() {
        return hasMoreTokens();
    }

    /**
     * Returns the index in the string of the last token returned by
     * nextToken, or zero if no token has been retrieved.
     *
     * @return The index of the last token.
     */
    public int getTokenIndex() {
        return tokIdx;
    }

    /**
     * Put a token on the input stream. This will be the next token read
     * from the tokenizer. If this function is called again before the
     * last token has been read, then it will be lost.
     *
     * The index returned from getTokenIndex will be the same for the
     * token put as that of the last token that wasn't put.
     *
     * @param s The token to put.
     * @throws NullPointerException if s is null.
     */
    public void putToken(String s) {
        if (s == null) {
            throw new NullPointerException("Cannot put a null token");
        }

        putToken = s;
    }

    /**
     * Creates a linked list of TokenSeps from the comma separated string
     * str. A backslash makes the following character, typically a comma
     * or another backslash, part of the current delimiter. Each new
     * delimiter is put at the front of the list, so the list is in the
     * reverse order of str.
     *
     * @param str The string specifying delimiter strings.
     * @return A list of TokenSeps, or null if str contains no delimiter.
     * @throws IllegalArgumentException if a delimiter is more than 32
     *         characters long.
     */
    private static TokenSep parseDelimString(String str) {
        TokenSep first = null;
        StringBuffer val = new StringBuffer();
        int length = str.length();
        int idx = 0;

        while (idx < length) {
            val.setLength(0);

            // Collect one delimiter, up to the next unescaped comma.
            for (; idx < length; idx++) {
                char c = str.charAt(idx);
                if (c == '\\') {
                    idx++;
                    if (idx < length) {
                        val.append(str.charAt(idx));
                    }
                } else if (c == ',') {
                    break;
                } else {
                    val.append(c);
                }
            }

            // Only create a delimiter when some text was collected. A
            // lone backslash at the end of str consumes input without
            // producing any text and must not become an empty delimiter.
            if (val.length() > 0) {
                TokenSep p = new TokenSep(val.toString());
                p.setNext(first);
                first = p;
            }

            // Skip the comma that ended this delimiter.
            idx++;
        }

        return first;
    }
}
Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.
What this is

Other links

The source code

new blog posts

... this post is sponsored by my books ...
#1 New Release!	FP Best Seller