alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

What this is

This file is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Other links

The source code

/*
 *                 Sun Public License Notice
 *
 * The contents of this file are subject to the Sun Public License
 * Version 1.0 (the "License"). You may not use this file except in
 * compliance with the License. A copy of the License is available at
 * http://www.sun.com/
 *
 * The Original Code is NetBeans. The Initial Developer of the Original
 * Code is Sun Microsystems, Inc. Portions Copyright 1997-2004 Sun
 * Microsystems, Inc. All Rights Reserved.
 */


package org.netbeans.modules.i18n.regexp;

/**
 * Parser of regular expressions.
 *
 * @author  Marian Petras
 */
public class Parser {

    /** regular expression being parsed by this parser */
    private String regexp;

    /**
     * names of string tokens {token-name} to be recognized
     * by this parser.
     * The array contains just the names (without
     * '{' and '}').
     */
    private String[] tokenNames;

    /**
     * length of the longest string token name
     *
     * @see  #tokenNames
     */
    private int maxTokenLength;

    /**
     * Parses the given regular expression.
     *
     * @param  regexp  regular expression to parse
     * @return  root of a syntax tree of the regular expression
     * @exception  java.lang.IllegalArgumentException
     *             if the regular expression is null
     * @exception  ParseException
     *             if the given expression contained a syntax error
     */
    public static TreeNodeRoot parse(String regexp)
            throws IllegalArgumentException, ParseException {
        return parse(regexp, null);
    }

    /**
     * Parses the given regular expression with tokens enclosed
     * between { and }.
     *
     * @param  regexp  regular expression to parse
     * @param  tokenNames  names of a tokens to be recognized;
     *                     or null if no tokens should be
     *                     recognized
     * @return  root of a syntax tree of the regular expression
     * @exception  java.lang.IllegalArgumentException
     *             if the regular expression is null
     * @exception  ParseException
     *             if the given expression contained a syntax error
     */
    public static TreeNodeRoot parse(String regexp, String tokenNames[])
            throws IllegalArgumentException, ParseException {
        Parser parser = new Parser(regexp);
        if (tokenNames != null && tokenNames.length != 0) {
            parser.setTokenNames(tokenNames);
        }
        return parser.parse();
    }

    /**
     * Constructs a parser for parsing a given regular expression.
     *
     * @param  regexp  regular expression to parse
     * @exception  java.lang.IllegalArgumentException
     *             if the argument is null
     */
    Parser(String regexp) {
        if (regexp == null) {
            throw new IllegalArgumentException();
        }
        this.regexp = regexp;
    }

    /**
     */
    private void setTokenNames(String[] tokenNames) {
        if (tokenNames != null && tokenNames.length != 0) {
            this.tokenNames = tokenNames;
            maxTokenLength = tokenNames[0].length();
            for (int i = 1; i < tokenNames.length; i++) {
                if (tokenNames[i].length() > maxTokenLength) {
                    maxTokenLength = tokenNames[i].length();
                }
            }
        } else {
            this.tokenNames = null;
            maxTokenLength = 0;
        }
    }

    /**
     * Performs parsing of the regular expression.
     *
     * @return  root of a syntax tree of the regular expression
     * @exception  ParseException
     *             if the expression contained a syntax error
     */
    TreeNodeRoot parse() throws ParseException {

        TreeNodeRoot result;
        TreeNode multiRegexpNode = null;

        int begin = 0;
        int end = regexp.length();
        boolean initialPart = false;
        boolean finalPart = false;

        if (begin == end) {
            return null;
        }

        /* Handle regular expressions "^", "$" and "^$": */
        if (regexp.charAt(0) == '^') {
            initialPart = true;
            begin++;
        }
        if ((end == begin + 1) && (regexp.charAt(begin) == '$')) {
            finalPart = true;
            end--;
        }

        /*
         * The following special regular expressions are now handled:
         *
         *    ...  returned 
         *   ^   .......  begin=1, end=1, initialpart=true,  finalPart=false
         *   ^$  .......  begin=1, end=1, initialpart=true,  finalPart=true
         *   $   .......  begin=0, end=0, initialPart=false, finalPart=true
         *
         * In all the cases except for the empty regular expression,
         * it is true that (begin == end). So we know that if (begin != end),
         * there must be something between the optional characters '^' and '$'.
         * If nothing is found, it singals a syntax error.
         */

        if (begin != end) {
            multiRegexpNode = parseMultiRegexp(begin, end);

            /*
             * If nothing was found between the (optional) initial '^'
             * and the (optional) final '$', it is a syntax error:
             */
            if (multiRegexpNode == null) {
                throwParseException(begin);
            }

            /*
             * If there is a single character pending after the recognized
             * regular expression and it is '$', it is the (optional) final '$':
             */
            if ((multiRegexpNode.end == end - 1)
                    && (regexp.charAt(end - 1) == '$')) {
                finalPart = true;
                end--;
            }

            /*
             * If some characters between the (optional) initial '^'
             * and the (optional) final '$' have been left unrecognized,
             * it is a syntax error:
             */
            if (multiRegexpNode.end != end) {
                throwParseException(begin);
            }
        }

        String attribs = null;
        if (initialPart || finalPart) {
            StringBuffer buf = new StringBuffer(2);
            if (initialPart) {
                buf.append('^');
            }
            if (finalPart) {
                buf.append('$');
            }
            attribs = buf.toString();
        }

        result = new TreeNodeRoot(regexp, attribs);
        if (multiRegexpNode != null) {
            result.add(multiRegexpNode);
        }
        return result;
    }


    /** */
    private void throwParseException(int position) throws ParseException {
        throw new ParseException(regexp, position);
    }


    /** */
    private TreeNode parseMultiRegexp(int start, int end)
            throws ParseException {
        if (start == end) {
            return null;
        }

        TreeNode regexpSequenceNode = parseRegexpSequence(start, end);
        if (regexpSequenceNode == null) {
            return null;
        }

        java.util.List alternatives = new java.util.ArrayList(4);
        alternatives.add(regexpSequenceNode);

        while (regexpSequenceNode.end != end
                && regexp.charAt(regexpSequenceNode.end) == '|') {
            int from = regexpSequenceNode.end + 1;
            regexpSequenceNode = parseRegexpSequence(from, end);

            if (regexpSequenceNode == null) {

                /* expected: regexp sequence */
                throwParseException(from);
            }

            alternatives.add(regexpSequenceNode);
        };

        TreeNode result = new TreeNode(TreeNode.MULTI_REGEXP,
                                       start,
                                       regexpSequenceNode.end);
        java.util.Iterator i;
        for (i = alternatives.iterator(); i.hasNext(); ) {
            result.add((TreeNode) i.next());
        }
        return result;
    }

    
    /** */
    private TreeNode parseRegexpSequence(int start, int end)
            throws ParseException {
        if (start == end) {
            return null;
        }

        TreeNode result;
        java.util.List sequence = null;
        TreeNode lastChildNode = null;

        int from = start;
        while (true) {
            TreeNode qRegexpNode = parseQRegexp(from, end);

            if (qRegexpNode == null) {
                break;
            }

            if (sequence == null) {
                sequence = new java.util.ArrayList(4);
            }
            sequence.add(qRegexpNode);

            /* remember the last added regexp: */
            lastChildNode = qRegexpNode;

            /* test - is parsing finished? */
            if (qRegexpNode.end == end) {
                break;
            }

            from = qRegexpNode.end;
        }

        if (sequence == null) {
            return null;
        }

        result = new TreeNode(TreeNode.SIMPLE_REGEXP, start, lastChildNode.end);
        java.util.Iterator i;
        for (i = sequence.iterator(); i.hasNext(); ) {
            result.add((TreeNode) i.next());
        }
        return result;
    }


    /** */
    private TreeNode parseQRegexp(int start, int end) throws ParseException {
        if (start == end) {
            return null;
        }

        TreeNode result;

        TreeNode singleRegexpNode = parseSingleRegexp(start, end);
        if (singleRegexpNode == null) {
            return null;
        }

        /* test - is parsing finished? */
        if (singleRegexpNode.end == end) {
            result = new TreeNode(TreeNode.Q_REGEXP,
                                  start,
                                  singleRegexpNode.end);
            result.add(singleRegexpNode);
            return result;
        }

        TreeNode quantifierNode = parseQuantifier(singleRegexpNode.end, end);
        if (quantifierNode == null) {
            result = new TreeNode(TreeNode.Q_REGEXP,
                                  start,
                                  singleRegexpNode.end);
            result.add(singleRegexpNode);
        } else {
            result = new TreeNode(TreeNode.Q_REGEXP,
                                  start,
                                  quantifierNode.end);
            result.add(singleRegexpNode);
            result.add(quantifierNode);
        }
        return result;
    }


    /** */
    private TreeNode parseSingleRegexp(int start, int end)
            throws ParseException {
        if (start == end) {
            return null;
        }

        TreeNode result;
        char ch = regexp.charAt(start);
        switch (ch) {
            case '.':
                result = new TreeNode(TreeNode.METACHAR,
                                      start,
                                      start + 1,
                                      new Character(ch));
                break;

            case '[':
                TreeNode setNode = parseSet(start, end);
                assert setNode != null;
                return setNode;

            case '(':
                TreeNode subexprNode = parseSubexpr(start, end);
                assert subexprNode != null;
                return subexprNode;

            case '\\':
                if (end == start + 1) {
                    
                    /* unexpected end of regexp */
                    throwParseException(end);
                }
                char ch2 = regexp.charAt(start + 1);
                switch (ch2) {
                    case 'b':
                    case 'B':
                        result = new TreeNode(TreeNode.METACHAR,
                                              start,
                                              start + 2,
                                              new Character(ch2));
                        break;

                    case 'u':
                        Integer unicode = parseUnicode(start + 2, end);
                        if (unicode == null) {

                            /* expected: 4-digit hexadecimal number */
                            throwParseException(start + 2);
                        }
                        result = new TreeNode(TreeNode.UNICODE_CHAR,
                                              start,
                                              start + 6,
                                              unicode);
                        break;

                    default:
                        char parsedChar;
                        switch (ch2) {
                            case 't':
                                parsedChar = '\t';
                                break;

                            case 'n':
                                parsedChar = '\n';
                                break;

                            case 'r':
                                parsedChar = '\r';
                                break;

                            case 'f':
                                parsedChar = '\f';
                                break;

                            default:
                                parsedChar = ch2;
                                break;
                        }
                        result = new TreeNode(TreeNode.CHAR,
                                              start,
                                              start + 2,
                                              new Character(parsedChar));
                        break;
                }
                break;

            case '{':
                String tokenName = getTokenName(start, end);
                if (tokenName != null) {
                    result = new TreeNode(TreeNode.TOKEN,
                                          start,
                                          start + tokenName.length() + 2,
                                          tokenName);
                    break;
                }
                /* falls through */

            default:
                if ("^$|*+?)]{}".indexOf(ch) != -1) {                //NOI18N
                    return null;
                }
                result = new TreeNode(TreeNode.CHAR,
                                      start,
                                      start + 1,
                                      new Character(ch));
                break;
        }
        return result;
    }


    /** */
    private TreeNode parseQuantifier(int start, int end)
            throws ParseException {
        if (start == end) {
            return null;
        }

        TreeNode result = null;
        char ch = regexp.charAt(start);
        switch (ch) {
            case '*':
            case '+':
            case '?':
                result = new TreeNode(TreeNode.QUANTIFIER,
                                      start,
                                      start + 1,
                                      new Character(ch));
                return result;
            case '{':
                break;
            default:
                return null;
        }

        if (end - start == 1) {
            
            /* expected: number or token */
            throwParseException(start + 1);
        }

        TreeNode numberNode1 = parseNumber(start + 1, end);
        if (numberNode1 == null) {

            /* it is not a number - maybe it is a token: */
            if (getTokenName(start, end) != null) {

                /* if it is a token, it is not a quantifier: */
                return null;
            }

            /* expected: number */
            throwParseException(start + 1);
        }
        if (numberNode1.end == end) {

            /* expected: '}', ',' */
            throwParseException(numberNode1.end);
        }

        switch (regexp.charAt(numberNode1.end)) {
            case '}':
                result = new TreeNode(TreeNode.QUANTIFIER,
                                      start,
                                      numberNode1.end + 1,
                                      "{n}");                           //NOI18N
                result.add(numberNode1);
                return result;
            case ',':
                break;
            default:

                /* expected: '}' or ',' */
                throwParseException(numberNode1.end);
        }

        if (numberNode1.end + 1 == end) {
            
            /* expected: number or '}' */
            throwParseException(numberNode1.end + 1);
        }

        if (regexp.charAt(numberNode1.end + 1) == '}') {
                result = new TreeNode(TreeNode.QUANTIFIER,
                                      start,
                                      numberNode1.end + 2,
                                      "{n,}");                          //NOI18N
                result.add(numberNode1);
                return result;
        }

        TreeNode numberNode2 = parseNumber(numberNode1.end + 1, end);
        if (numberNode2 == null) {

            /* expected: number */
            throwParseException(numberNode1.end + 1);
        }
        if (numberNode2.end == end
                || regexp.charAt(numberNode2.end) != '}') {

            /* expected: '}' */
            throwParseException(numberNode2.end);
        }

        int num1 = ((Integer) numberNode1.getAttribs()).intValue();
        int num2 = ((Integer) numberNode2.getAttribs()).intValue();
        if (num2 < num1) {
            throwParseException(numberNode2.start);
        }

        result = new TreeNode(TreeNode.QUANTIFIER,
                              start,
                              numberNode2.end + 1,
                              "{n,n}");                                 //NOI18N
        result.add(numberNode1);
        result.add(numberNode2);
        return result;
    }
    

    /** */
    private TreeNode parseNumber(int start, int end) throws ParseException {
        if (start == end) {
            return null;
        }

        char[] chars = regexp.substring(start, end).toCharArray();
        int endIndex = chars.length;
        for (int i = 0; i < chars.length; i++) {
            if (chars[i] < '0' || chars[i] > '9') {
                endIndex = i;
                break;
            }
        }

        if (endIndex == 0) {
            return null;
        } else if (endIndex > 3) {

            /* max 3 digits */
            throwParseException(start);
        }

        int number;
        if (endIndex == 1) {
            number = chars[0] - '0';
        } else {
            try {
                number = Integer.parseInt(regexp.substring(start,
                                                           start + endIndex));
            } catch (NumberFormatException ex) {
                throw new AssertionError();                  //should not happen
            }
        }

        TreeNode result = new TreeNode(TreeNode.NUMBER,
                                       start,
                                       start + endIndex,
                                       new Integer(number));
        return result;
    }


    /** */
    private String getTokenName(int start, int end) {
        if (tokenNames == null) {
            return null;
        }

        int checkAreaLength = Math.min(end - start, maxTokenLength + 2);
        String substring = regexp.substring(start, start + checkAreaLength);
        if (substring.charAt(0) != '{') {
            return null;
        }
        int rightBoundaryIndex = substring.indexOf('}', 1);
        if (rightBoundaryIndex == -1) {
            return null;
        }
        String tokenName = substring.substring(1, rightBoundaryIndex);
        for (int i = 0; i < tokenNames.length; i++) {
            if (tokenName.equals(tokenNames[i])) {
                return tokenName;
            }
        }
        return null;
    }


    /** */
    private Integer parseUnicode(int start, int end) throws ParseException {
        if (start == end) {
            return null;
        }

        if (end - start < 4) {

            /* expected: 4-digit hexadecimal number */
            throwParseException(start);
        }

        char[] chars = regexp.substring(start, start + 4).toCharArray();
        for (int i = 0; i < 4; i++) {
            char ch = chars[i];
            if ("01234567890abcdefABCDEF".indexOf(ch) == -1) {          //NOI18N
                if (i == 0) {
                    return null;
                } else {
                    throwParseException(start);
                }
            }
        }

        Integer integer;
        try {
            integer = Integer.valueOf(regexp.substring(start, start + 4), 16);
        } catch (NumberFormatException ex) {
            throw new AssertionError();         //should not happen
        }
        return integer;
    }


    /** */
    private TreeNode parseSubexpr(int start, int end) throws ParseException {
        if (start == end) {
            return null;
        }

        if (regexp.charAt(start) != '(') {
            return null;
        }
        if (end == start + 1) {
            throwParseException(start + 1);
        }

        TreeNode result;
        TreeNode multiRegexpNode = parseMultiRegexp(start + 1, end);
        if (multiRegexpNode == null) {

            /* expected: regular subexpression */
            throwParseException(start + 1);
        }
        if (multiRegexpNode.end == end
                || regexp.charAt(multiRegexpNode.end) != ')') {
            throwParseException(multiRegexpNode.end);
        }
        result = new TreeNode(TreeNode.SUBEXPR, start, multiRegexpNode.end + 1);
        result.add(multiRegexpNode);
        return result;
    }


    /** */
    private TreeNode parseSet(int start, int end) throws ParseException {
        if (start == end) {
            return null;
        }

        if (regexp.charAt(start) != '[') {
            return null;
        }
        if (end == start + 1) {

            /* unexpected end of regexp: */
            throwParseException(start + 1);
        }

        /*
         * Test which of the three chars that may occur only in the beginning
         * of the set ('^', ']', '-') are present:
         */
        String setString = regexp.substring(start, end);
        String specials = getSpecials(setString);

        /* Find indices of the bounding square brackets: */
        int endIndex = setString.indexOf(']', 1 + specials.length());
        if (endIndex == -1) {

            /* matching bracket (']') not found: */
            throwParseException(start);
        } else {
            endIndex++;                 //index of the first character after ']'
        }
        endIndex += start;              //from the beginning of 'regexp'

        setString = regexp.substring(start, endIndex);
        int setLength = setString.length();

        TreeNode result;

        /* Test whether the set is a named character set: */
        if (setLength >= 5
                && setString.charAt(1) == ':'
                && setString.charAt(setLength - 2) == ':') {
            String charClassName = setString.substring(2, setLength - 2);
            if (isPosixCharClass(charClassName)) {
                result = new TreeNode(TreeNode.POSIX_SET,
                                               start,
                                               endIndex,
                                               charClassName);
                return result;
            } else {
                throwParseException(start + 2);
            }
        }

        result = new TreeNode(TreeNode.SET,
                              start,
                              endIndex,
                              specials);

        int from = start + 1 + specials.length();
        int to = endIndex - 1;

        while (from != to) {
            TreeNode rangeNode = parseRangeOrChar(from, to);
            if (rangeNode == null) {

                /* expected: character or range of characters */
                throwParseException(from);
            }
            result.add(rangeNode);
            from = rangeNode.end;
        }

        return result;
    }


    /** */
    private TreeNode parseRangeOrChar(int start, int end)
            throws ParseException {
        if (start == end) {
            return null;
        }

        TreeNode rangeCharNode1 = parseRangeChar(start, end);
        if (rangeCharNode1 == null) {
            return null;
        }

        if (rangeCharNode1.end == end
                || regexp.charAt(rangeCharNode1.end) != '-') {
            return rangeCharNode1;
        }

        TreeNode rangeCharNode2 = parseRangeChar(rangeCharNode1.end + 1, end);
        if (rangeCharNode2 == null) {

            /* expected: range character */
            throwParseException(rangeCharNode1.end + 1);
        }

        Object charObject;

        charObject = rangeCharNode1.getAttribs();
        int char1 = charObject instanceof Character
                     ? Character.getNumericValue(
                            ((Character) charObject).charValue())
                     : ((Integer) charObject).intValue();
        charObject = rangeCharNode2.getAttribs();
        int char2 = charObject instanceof Character
                     ? Character.getNumericValue(
                            ((Character) charObject).charValue())
                     : ((Integer) charObject).intValue();

        if (!(char1 < char2)) {

            /* expected: range character */
            throwParseException(rangeCharNode1.end + 1);
        }

        TreeNode result = new TreeNode(TreeNode.RANGE,
                                       start,
                                       rangeCharNode2.end);
        result.add(rangeCharNode1);
        result.add(rangeCharNode2);
        return result;
    }

    
    /** */
    private TreeNode parseRangeChar(int start, int end) throws ParseException {
        if (start == end) {
            return null;
        }

        TreeNode result;

        char ch = regexp.charAt(start);
        switch (ch) {
            case ']':
            case '-':
                return null;

            case '\\':
                if (end == start + 1) {

                    /* expected: any character except ']', '-' */
                    throwParseException(start + 1);
                }
                char ch2 = regexp.charAt(start + 1);
                char parsedChar;
                switch (ch2) {
                    case 'u':
                        Integer unicode = parseUnicode(start + 2, end);
                        if (unicode == null) {
                            
                            /* expected: 4-digit hexadecimal number */
                            throwParseException(start + 2);
                        }
                        int codeValue = unicode.intValue();
                        assert codeValue >= 0;
                        if (codeValue <= 0x007f) {

                            /* expected: unicode value >= 0080h */
                            throwParseException(start + 2);
                        }
                        return new TreeNode(TreeNode.UNICODE_CHAR,
                                              start,
                                              start + 6,
                                              unicode);

                    case ']':
                    case '-':

                        /*
                         * characters ']' and '-' must be in the beginning
                         * of the definition of the set:
                         */
                        throwParseException(start + 2);

                    case 't':
                        parsedChar = '\t';
                        break;

                    case 'n':
                        parsedChar = '\n';
                        break;

                    case 'r':
                        parsedChar = '\r';
                        break;

                    case 'f':
                        parsedChar = '\f';
                        break;

                    default:
                        parsedChar = ch2;
                        break;
                }
                result = new TreeNode(TreeNode.CHAR,
                                      start,
                                      start + 2,
                                      new Character(parsedChar));
                break;
            default:
                result = new TreeNode(TreeNode.CHAR,
                                      start,
                                      start + 1,
                                      new Character(ch));
                break;
        }
        return result;
    }


    /** */
    private String getSpecials(String setRegexp) {
        int index = 1;
        int maxIndex = 3;
        if (setRegexp.length() < 5) {
            maxIndex = setRegexp.length() - 2;
        }
        StringBuffer buf = new StringBuffer(maxIndex - index + 1);
        char ch = setRegexp.charAt(index);
        if (ch == '^') {
            buf.append(ch);
            if (index == maxIndex) {
                return buf.toString();
            }
            ch = setRegexp.charAt(++index);
        }
        if (ch == ']') {
            buf.append(ch);
            if (index == maxIndex) {
                return buf.toString();
            }
            ch = setRegexp.charAt(++index);
        }
        if (ch == '-') {
            buf.append(ch);
        }
        return buf.toString();
    }


    /** */
    private boolean isPosixCharClass(String name) {
        /* "xdigit" is the only class name not having exactly 5 characters: */
        if (name.equals("xdigit")) {                                    //NOI18N
            return true;
        }
        if (name.length() != 5) {
            return false;
        }

        String classNames = "alnum alpha blank cntrl digit graph "      //NOI18N
                            + "lower print punct space upper";          //NOI18N
        java.util.StringTokenizer tokenizer
                = new java.util.StringTokenizer(classNames, " ");       //NOI18N
        while (tokenizer.hasMoreTokens()) {
            if (name.equals(tokenizer.nextToken())) {
                return true;
            }
        }
        return false;
    }

}
... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.