|
Java example source code file (Pattern.java)
The Pattern.java Java example source code/* * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package java.util.regex; import java.text.Normalizer; import java.util.Locale; import java.util.Iterator; import java.util.Map; import java.util.ArrayList; import java.util.HashMap; import java.util.Arrays; import java.util.NoSuchElementException; import java.util.Spliterator; import java.util.Spliterators; import java.util.function.Predicate; import java.util.stream.Stream; import java.util.stream.StreamSupport; /** * A compiled representation of a regular expression. * * <p> A regular expression, specified as a string, must first be compiled into * an instance of this class. The resulting pattern can then be used to create * a {@link Matcher} object that can match arbitrary {@linkplain * java.lang.CharSequence character sequences} against the regular * expression. All of the state involved in performing a match resides in the * matcher, so many matchers can share the same pattern. * * <p> A typical invocation sequence is thus * * <blockquote>* Pattern p = Pattern.{@link #compile compile}("a*b"); * Matcher m = p.{@link #matcher matcher}("aaaaab"); * boolean b = m.{@link Matcher#matches matches}();</pre> * * <p> A {@link #matches matches} method is defined by this class as a * convenience for when a regular expression is used just once. This method * compiles an expression and matches an input sequence against it in a single * invocation. The statement * * <blockquote>* boolean b = Pattern.matches("a*b", "aaaaab");</pre> * * is equivalent to the three statements above, though for repeated matches it * is less efficient since it does not allow the compiled pattern to be reused. * * <p> Instances of this class are immutable and are safe for use by multiple * concurrent threads. Instances of the {@link Matcher} class are not safe for * such use. * * * <h3>Summary of regular-expression constructs * * <table border="0" cellpadding="1" cellspacing="0" * summary="Regular expression constructs, and what they match"> * * <tr align="left"> * <th align="left" id="construct">Construct * <th align="left" id="matches">Matches * </tr> * * <tr> | ||
Characters | ||
x | ||
\\ | ||
\0n | ||
\0nn | ||
\0mnn | ||
\xhh | ||
\uhhhh | ||
\x{h...h} | ||
\t | ||
\n | ||
\r | ||
\f | ||
\a | ||
\e | ||
\cx | ||
Character classes | ||
{@code [abc]} | ||
{@code [^abc]} | ||
{@code [a-zA-Z]} | ||
{@code [a-d[m-p]]} | ||
{@code [a-z&&[def]]} | ||
{@code [a-z&&[^bc]]} | ||
{@code [a-z&&[^m-p]]} | ||
Predefined character classes | ||
. | ||
\d | ||
\D | ||
\h | ||
\H | ||
\s | ||
\S | ||
\v | ||
\V | ||
\w | ||
\W | ||
POSIX character classes (US-ASCII only) | ||
{@code \p{Lower}} | ||
{@code \p{Upper}} | ||
{@code \p{ASCII}} | ||
{@code \p{Alpha}} | ||
{@code \p{Digit}} | ||
{@code \p{Alnum}} | ||
{@code \p{Punct}} | ||
{@code \p{Graph}} | ||
{@code \p{Print}} | ||
{@code \p{Blank}} | ||
{@code \p{Cntrl}} | ||
{@code \p{XDigit}} | ||
{@code \p{Space}} | ||
java.lang.Character classes (simple java character type) | ||
\p{javaLowerCase} | ||
\p{javaUpperCase} | ||
\p{javaWhitespace} | ||
\p{javaMirrored} | ||
Classes for Unicode scripts, blocks, categories and binary properties | ||
{@code \p{IsLatin}} | ||
{@code \p{InGreek}} | ||
{@code \p{Lu}} | ||
{@code \p{IsAlphabetic}} | ||
{@code \p{Sc}} | ||
{@code \P{InGreek}} | ||
{@code [\p{L}&&[^\p{Lu}]]} | ||
Boundary matchers | ||
^ | ||
$ | ||
\b | ||
\B | ||
\A | ||
\G | ||
\Z | ||
\z | ||
Linebreak matcher | ||
\R | ||
Greedy quantifiers | ||
X? | ||
X* | ||
X+ | ||
X{n} | ||
X{n,} | ||
X{n,m} | ||
Reluctant quantifiers | ||
X?? | ||
X*? | ||
X+? | ||
X{n}? | ||
X{n,}? | ||
X{n,m}? | ||
Possessive quantifiers | ||
X?+ | ||
X*+ | ||
X++ | ||
X{n}+ | ||
X{n,}+ | ||
X{n,m}+ | ||
Logical operators | ||
XY | ||
X|Y | ||
(X) | ||
Back references | ||
\n | ||
\k<name> | ||
Quotation | ||
\ | ||
\Q | ||
\E | ||
Special constructs (named-capturing and non-capturing) | ||
(?<name>X) | ||
(?:X) | ||
(?idmsuxU-idmsuxU) | ||
(?idmsux-idmsux:X) | ||
(?=X) | ||
(?!X) | ||
(?<=X) | ||
(?<!X) | ||
(?>X) |
2 |
---|
3 |
4 |
5 |
Pattern
engine performs traditional NFA-based matching
* with ordered alternation as occurs in Perl 5.
*
* <p> Perl constructs not supported by this class:
*
* <ul>
* <li>Predefined character classes (Unicode character) * <p>\X Match Unicode * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters"> * <i>extended grapheme cluster * </p> * * <li>
The backreference constructs, \g{n} for * the <i>nthcapturing group and * <tt>\g{name} for * <a href="#groupname">named-capturing group. * </p> * * <li>
The named character construct, \N{name} * for a Unicode character by its name. * </p> * * <li>
The conditional constructs * <tt>(?(condition)X) and * <tt>(?(condition)X|Y), * </p> * * <li>
The embedded code constructs (?{code}) * and <tt>(??{code}),
* * <li>The embedded comment syntax (?#comment), and
* * <li>The preprocessing operations \l \u, * <tt>\L, and \U.
* * </ul> * * <p> Constructs supported by this class but not by Perl: * * <ul> * * <li>Character-class union and intersection as described * <a href="#cc">above.
* * </ul> * * <p> Notable differences from Perl: * * <ul> * * <li>In Perl, \1 through \9 are always interpreted * as back references; a backslash-escaped number greater than <tt>9 is * treated as a back reference if at least that many subexpressions exist, * otherwise it is interpreted, if possible, as an octal escape. In this * class octal escapes must always begin with a zero. In this class, * <tt>\1 through \9 are always interpreted as back * references, and a larger number is accepted as a back reference if at * least that many subexpressions exist at that point in the regular * expression, otherwise the parser will drop digits until the number is * smaller or equal to the existing number of groups or it is one digit. * </p> * * <li>
Perl uses the g flag to request a match that resumes * where the last match left off. This functionality is provided implicitly * by the {@link Matcher} class: Repeated invocations of the {@link * Matcher#find find} method will resume where the last match left off, * unless the matcher is reset. </p> * * <li>
In Perl, embedded flags at the top level of an expression affect * the whole expression. In this class, embedded flags always take effect * at the point at which they appear, whether they are at the top level or * within a group; in the latter case, flags are restored at the end of the * group just as in Perl. </p> * * </ul> * * * <p> For a more precise description of the behavior of regular expression * constructs, please see <a href="http://www.oreilly.com/catalog/regex3/"> * <i>Mastering Regular Expressions, 3nd Edition, Jeffrey E. F. Friedl, * O'Reilly and Associates, 2006.</a> * </p> * * @see java.lang.String#split(String, int) * @see java.lang.String#split(String) * * @author Mike McCloskey * @author Mark Reinhold * @author JSR-51 Expert Group * @since 1.4 * @spec JSR-51 */ public final class Pattern implements java.io.Serializable { /** * Regular expression modifier values. Instead of being passed as * arguments, they can also be passed as inline modifiers. * For example, the following statements have the same effect. * <pre> * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M); * RegExp r2 = RegExp.compile("(?im)abc", 0); * </pre> * * The flags are duplicated so that the familiar Perl match flag * names are available. */ /** * Enables Unix lines mode. * * <p> In this mode, only the '\n' line terminator is recognized * in the behavior of <tt>., ^, and $. * * <p> Unix lines mode can also be enabled via the embedded flag * expression <tt>(?d). */ public static final int UNIX_LINES = 0x01; /** * Enables case-insensitive matching. * * <p> By default, case-insensitive matching assumes that only characters * in the US-ASCII charset are being matched. Unicode-aware * case-insensitive matching can be enabled by specifying the {@link * #UNICODE_CASE} flag in conjunction with this flag. * * <p> Case-insensitive matching can also be enabled via the embedded flag * expression <tt>(?i). * * <p> Specifying this flag may impose a slight performance penalty.
*/ public static final int CASE_INSENSITIVE = 0x02; /** * Permits whitespace and comments in pattern. * * <p> In this mode, whitespace is ignored, and embedded comments starting * with <tt># are ignored until the end of a line. * * <p> Comments mode can also be enabled via the embedded flag * expression <tt>(?x). */ public static final int COMMENTS = 0x04; /** * Enables multiline mode. * * <p> In multiline mode the expressions ^ and $ match * just after or just before, respectively, a line terminator or the end of * the input sequence. By default these expressions only match at the * beginning and the end of the entire input sequence. * * <p> Multiline mode can also be enabled via the embedded flag * expression <tt>(?m). */ public static final int MULTILINE = 0x08; /** * Enables literal parsing of the pattern. * * <p> When this flag is specified then the input string that specifies * the pattern is treated as a sequence of literal characters. * Metacharacters or escape sequences in the input sequence will be * given no special meaning. * * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on * matching when used in conjunction with this flag. The other flags * become superfluous. * * <p> There is no embedded flag character for enabling literal parsing. * @since 1.5 */ public static final int LITERAL = 0x10; /** * Enables dotall mode. * * <p> In dotall mode, the expression . matches any character, * including a line terminator. By default this expression does not match * line terminators. * * <p> Dotall mode can also be enabled via the embedded flag * expression <tt>(?s). (The s is a mnemonic for * "single-line" mode, which is what this is called in Perl.) </p> */ public static final int DOTALL = 0x20; /** * Enables Unicode-aware case folding. * * <p> When this flag is specified then case-insensitive matching, when * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner * consistent with the Unicode Standard. By default, case-insensitive * matching assumes that only characters in the US-ASCII charset are being * matched. * * <p> Unicode-aware case folding can also be enabled via the embedded flag * expression <tt>(?u). * * <p> Specifying this flag may impose a performance penalty. */ public static final int UNICODE_CASE = 0x40; /** * Enables canonical equivalence. * * <p> When this flag is specified then two characters will be considered * to match if, and only if, their full canonical decompositions match. * The expression <tt>"a\u030A", for example, will match the * string <tt>"\u00E5" when this flag is specified. By default, * matching does not take canonical equivalence into account. * * <p> There is no embedded flag character for enabling canonical * equivalence. * * <p> Specifying this flag may impose a performance penalty. */ public static final int CANON_EQ = 0x80; /** * Enables the Unicode version of <i>Predefined character classes and * <i>POSIX character classes. * * <p> When this flag is specified then the (US-ASCII only) * <i>Predefined character classes and POSIX character classes * are in conformance with * <a href="http://www.unicode.org/reports/tr18/">Unicode Technical * Standard #18: Unicode Regular Expression</i> * <i>Annex C: Compatibility Properties. * <p> * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded * flag expression <tt>(?U). * <p> * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case * folding. * <p> * Specifying this flag may impose a performance penalty. </p> * @since 1.7 */ public static final int UNICODE_CHARACTER_CLASS = 0x100; /* Pattern has only two serialized components: The pattern string * and the flags, which are all that is needed to recompile the pattern * when it is deserialized. */ /** use serialVersionUID from Merlin b59 for interoperability */ private static final long serialVersionUID = 5073258162644648461L; /** * The original regular-expression pattern string. * * @serial */ private String pattern; /** * The original pattern flags. * * @serial */ private int flags; /** * Boolean indicating this Pattern is compiled; this is necessary in order * to lazily compile deserialized Patterns. */ private transient volatile boolean compiled = false; /** * The normalized pattern string. */ private transient String normalizedPattern; /** * The starting point of state machine for the find operation. This allows * a match to start anywhere in the input. */ transient Node root; /** * The root of object tree for a match operation. The pattern is matched * at the beginning. This may include a find that uses BnM or a First * node. */ transient Node matchRoot; /** * Temporary storage used by parsing pattern slice. */ transient int[] buffer; /** * Map the "name" of the "named capturing group" to its group id * node. */ transient volatile Map<String, Integer> namedGroups; /** * Temporary storage used while parsing group references. */ transient GroupHead[] groupNodes; /** * Temporary null terminated code point array used by pattern compiling. */ private transient int[] temp; /** * The number of capturing groups in this Pattern. Used by matchers to * allocate storage needed to perform a match. */ transient int capturingGroupCount; /** * The local variable count used by parsing tree. Used by matchers to * allocate storage needed to perform a match. */ transient int localCount; /** * Index into the pattern string that keeps track of how much has been * parsed. */ private transient int cursor; /** * Holds the length of the pattern string. */ private transient int patternLength; /** * If the Start node might possibly match supplementary characters. * It is set to true during compiling if * (1) There is supplementary char in pattern, or * (2) There is complement node of Category or Block */ private transient boolean hasSupplementary; /** * Compiles the given regular expression into a pattern. * * @param regex * The expression to be compiled * @return the given regular expression compiled into a pattern * @throws PatternSyntaxException * If the expression's syntax is invalid */ public static Pattern compile(String regex) { return new Pattern(regex, 0); } /** * Compiles the given regular expression into a pattern with the given * flags. * * @param regex * The expression to be compiled * * @param flags * Match flags, a bit mask that may include * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS} * and {@link #COMMENTS} * * @return the given regular expression compiled into a pattern with the given flags * @throws IllegalArgumentException * If bit values other than those corresponding to the defined * match flags are set in <tt>flags * * @throws PatternSyntaxException * If the expression's syntax is invalid */ public static Pattern compile(String regex, int flags) { return new Pattern(regex, flags); } /** * Returns the regular expression from which this pattern was compiled. * * @return The source of this pattern */ public String pattern() { return pattern; } /** * <p>Returns the string representation of this pattern. This * is the regular expression from which this pattern was * compiled.</p> * * @return The string representation of this pattern * @since 1.5 */ public String toString() { return pattern; } /** * Creates a matcher that will match the given input against this pattern. * * @param input * The character sequence to be matched * * @return A new matcher for this pattern */ public Matcher matcher(CharSequence input) { if (!compiled) { synchronized(this) { if (!compiled) compile(); } } Matcher m = new Matcher(this, input); return m; } /** * Returns this pattern's match flags. * * @return The match flags specified when this pattern was compiled */ public int flags() { return flags; } /** * Compiles the given regular expression and attempts to match the given * input against it. * * <p> An invocation of this convenience method of the form * * <blockquote>* Pattern.matches(regex, input);</pre> * * behaves in exactly the same way as the expression * * <blockquote>* Pattern.compile(regex).matcher(input).matches()</pre> * * <p> If a pattern is to be used multiple times, compiling it once and reusing * it will be more efficient than invoking this method each time. </p> * * @param regex * The expression to be compiled * * @param input * The character sequence to be matched * @return whether or not the regular expression matches on the input * @throws PatternSyntaxException * If the expression's syntax is invalid */ public static boolean matches(String regex, CharSequence input) { Pattern p = Pattern.compile(regex); Matcher m = p.matcher(input); return m.matches(); } /** * Splits the given input sequence around matches of this pattern. * * <p> The array returned by this method contains each substring of the * input sequence that is terminated by another subsequence that matches * this pattern or is terminated by the end of the input sequence. The * substrings in the array are in the order in which they occur in the * input. If this pattern does not match any subsequence of the input then * the resulting array has just one element, namely the input sequence in * string form. * * <p> When there is a positive-width match at the beginning of the input * sequence then an empty leading substring is included at the beginning * of the resulting array. A zero-width match at the beginning however * never produces such empty leading substring. * * <p> The limit parameter controls the number of times the * pattern is applied and therefore affects the length of the resulting * array. If the limit <i>n is greater than zero then the pattern * will be applied at most <i>n - 1 times, the array's * length will be no greater than <i>n, and the array's last entry * will contain all input beyond the last matched delimiter. If <i>n * is non-positive then the pattern will be applied as many times as * possible and the array can have any length. If <i>n is zero then * the pattern will be applied as many times as possible, the array can * have any length, and trailing empty strings will be discarded. * * <p> The input "boo:and:foo", for example, yields the following * results with these parameters: * * <blockquote>
1 |
---|
2 |
3 |
4 |
\p{Lower} |
\p{Upper} |
\p{ASCII} |
\p{Alpha} |
\p{Digit} |
\p{Alnum} |
\p{Punct} |
\p{Graph} |
\p{Print} |
\p{Blank} |
\p{Cntrl} |
\p{XDigit} |
\p{Space} |
\d |
\D |
\s |
\S |
\w |
\W |
: |
: |
: |
o |
o |
o |
String
that can be used to
* create a <code>Pattern that would match the string
* <code>s as if it were a literal pattern. Metacharacters
* or escape sequences in the input sequence will be given no special
* meaning.
*
* @param s The string to be literalized
* @return A literal string replacement
* @since 1.5
*/
public static String quote(String s) {
int slashEIndex = s.indexOf("\\E");
if (slashEIndex == -1)
return "\\Q" + s + "\\E";
StringBuilder sb = new StringBuilder(s.length() * 2);
sb.append("\\Q");
slashEIndex = 0;
int current = 0;
while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
sb.append(s.substring(current, slashEIndex));
current = slashEIndex + 2;
sb.append("\\E\\\\E\\Q");
}
sb.append(s.substring(current, s.length()));
sb.append("\\E");
return sb.toString();
}
/**
* Recompile the Pattern instance from a stream. The original pattern
* string is read in and the object tree is recompiled from it.
*/
private void readObject(java.io.ObjectInputStream s)
throws java.io.IOException, ClassNotFoundException {
// Read in all fields
s.defaultReadObject();
// Initialize counts
capturingGroupCount = 1;
localCount = 0;
// if length > 0, the Pattern is lazily compiled
compiled = false;
if (pattern.length() == 0) {
root = new Start(lastAccept);
matchRoot = lastAccept;
compiled = true;
}
}
/**
* This private constructor is used to create all Patterns. The pattern
* string and match flags are all that is needed to completely describe
* a Pattern. An empty pattern string results in an object tree with
* only a Start node and a LastNode node.
*/
private Pattern(String p, int f) {
pattern = p;
flags = f;
// to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
if ((flags & UNICODE_CHARACTER_CLASS) != 0)
flags |= UNICODE_CASE;
// Reset group index count
capturingGroupCount = 1;
localCount = 0;
if (pattern.length() > 0) {
compile();
} else {
root = new Start(lastAccept);
matchRoot = lastAccept;
}
}
/**
* The pattern is converted to normalizedD form and then a pure group
* is constructed to match canonical equivalences of the characters.
*/
private void normalize() {
boolean inCharClass = false;
int lastCodePoint = -1;
// Convert pattern into normalizedD form
normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
patternLength = normalizedPattern.length();
// Modify pattern to match canonical equivalences
StringBuilder newPattern = new StringBuilder(patternLength);
for(int i=0; i<patternLength; ) {
int c = normalizedPattern.codePointAt(i);
StringBuilder sequenceBuffer;
if ((Character.getType(c) == Character.NON_SPACING_MARK)
&& (lastCodePoint != -1)) {
sequenceBuffer = new StringBuilder();
sequenceBuffer.appendCodePoint(lastCodePoint);
sequenceBuffer.appendCodePoint(c);
while(Character.getType(c) == Character.NON_SPACING_MARK) {
i += Character.charCount(c);
if (i >= patternLength)
break;
c = normalizedPattern.codePointAt(i);
sequenceBuffer.appendCodePoint(c);
}
String ea = produceEquivalentAlternation(
sequenceBuffer.toString());
newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
newPattern.append("(?:").append(ea).append(")");
} else if (c == '[' && lastCodePoint != '\\') {
i = normalizeCharClass(newPattern, i);
} else {
newPattern.appendCodePoint(c);
}
lastCodePoint = c;
i += Character.charCount(c);
}
normalizedPattern = newPattern.toString();
}
/**
* Complete the character class being parsed and add a set
* of alternations to it that will match the canonical equivalences
* of the characters within the class.
*/
private int normalizeCharClass(StringBuilder newPattern, int i) {
StringBuilder charClass = new StringBuilder();
StringBuilder eq = null;
int lastCodePoint = -1;
String result;
i++;
charClass.append("[");
while(true) {
int c = normalizedPattern.codePointAt(i);
StringBuilder sequenceBuffer;
if (c == ']' && lastCodePoint != '\\') {
charClass.append((char)c);
break;
} else if (Character.getType(c) == Character.NON_SPACING_MARK) {
sequenceBuffer = new StringBuilder();
sequenceBuffer.appendCodePoint(lastCodePoint);
while(Character.getType(c) == Character.NON_SPACING_MARK) {
sequenceBuffer.appendCodePoint(c);
i += Character.charCount(c);
if (i >= normalizedPattern.length())
break;
c = normalizedPattern.codePointAt(i);
}
String ea = produceEquivalentAlternation(
sequenceBuffer.toString());
charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
if (eq == null)
eq = new StringBuilder();
eq.append('|');
eq.append(ea);
} else {
charClass.appendCodePoint(c);
i++;
}
if (i == normalizedPattern.length())
throw error("Unclosed character class");
lastCodePoint = c;
}
if (eq != null) {
result = "(?:"+charClass.toString()+eq.toString()+")";
} else {
result = charClass.toString();
}
newPattern.append(result);
return i;
}
/**
* Given a specific sequence composed of a regular character and
* combining marks that follow it, produce the alternation that will
* match all canonical equivalences of that sequence.
*/
private String produceEquivalentAlternation(String source) {
int len = countChars(source, 0, 1);
if (source.length() == len)
// source has one character.
return source;
String base = source.substring(0,len);
String combiningMarks = source.substring(len);
String[] perms = producePermutations(combiningMarks);
StringBuilder result = new StringBuilder(source);
// Add combined permutations
for(int x=0; x<perms.length; x++) {
String next = base + perms[x];
if (x>0)
result.append("|"+next);
next = composeOneStep(next);
if (next != null)
result.append("|"+produceEquivalentAlternation(next));
}
return result.toString();
}
/**
* Returns an array of strings that have all the possible
* permutations of the characters in the input string.
* This is used to get a list of all possible orderings
* of a set of combining marks. Note that some of the permutations
* are invalid because of combining class collisions, and these
* possibilities must be removed because they are not canonically
* equivalent.
*/
private String[] producePermutations(String input) {
if (input.length() == countChars(input, 0, 1))
return new String[] {input};
if (input.length() == countChars(input, 0, 2)) {
int c0 = Character.codePointAt(input, 0);
int c1 = Character.codePointAt(input, Character.charCount(c0));
if (getClass(c1) == getClass(c0)) {
return new String[] {input};
}
String[] result = new String[2];
result[0] = input;
StringBuilder sb = new StringBuilder(2);
sb.appendCodePoint(c1);
sb.appendCodePoint(c0);
result[1] = sb.toString();
return result;
}
int length = 1;
int nCodePoints = countCodePoints(input);
for(int x=1; x<nCodePoints; x++)
length = length * (x+1);
String[] temp = new String[length];
int combClass[] = new int[nCodePoints];
for(int x=0, i=0; x<nCodePoints; x++) {
int c = Character.codePointAt(input, i);
combClass[x] = getClass(c);
i += Character.charCount(c);
}
// For each char, take it out and add the permutations
// of the remaining chars
int index = 0;
int len;
// offset maintains the index in code units.
loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
len = countChars(input, offset, 1);
boolean skip = false;
for(int y=x-1; y>=0; y--) {
if (combClass[y] == combClass[x]) {
continue loop;
}
}
StringBuilder sb = new StringBuilder(input);
String otherChars = sb.delete(offset, offset+len).toString();
String[] subResult = producePermutations(otherChars);
String prefix = input.substring(offset, offset+len);
for(int y=0; y<subResult.length; y++)
temp[index++] = prefix + subResult[y];
}
String[] result = new String[index];
for (int x=0; x<index; x++)
result[x] = temp[x];
return result;
}
private int getClass(int c) {
return sun.text.Normalizer.getCombiningClass(c);
}
/**
* Attempts to compose input by combining the first character
* with the first combining mark following it. Returns a String
* that is the composition of the leading character with its first
* combining mark followed by the remaining combining marks. Returns
* null if the first two characters cannot be further composed.
*/
private String composeOneStep(String input) {
int len = countChars(input, 0, 2);
String firstTwoCharacters = input.substring(0, len);
String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
if (result.equals(firstTwoCharacters))
return null;
else {
String remainder = input.substring(len);
return result + remainder;
}
}
/**
* Preprocess any \Q...\E sequences in `temp', meta-quoting them.
* See the description of `quotemeta' in perlfunc(1).
*/
private void RemoveQEQuoting() {
final int pLen = patternLength;
int i = 0;
while (i < pLen-1) {
if (temp[i] != '\\')
i += 1;
else if (temp[i + 1] != 'Q')
i += 2;
else
break;
}
if (i >= pLen - 1) // No \Q sequence found
return;
int j = i;
i += 2;
int[] newtemp = new int[j + 3*(pLen-i) + 2];
System.arraycopy(temp, 0, newtemp, 0, j);
boolean inQuote = true;
boolean beginQuote = true;
while (i < pLen) {
int c = temp[i++];
if (!ASCII.isAscii(c) || ASCII.isAlpha(c)) {
newtemp[j++] = c;
} else if (ASCII.isDigit(c)) {
if (beginQuote) {
/*
* A unicode escape \[0xu] could be before this quote,
* and we don't want this numeric char to processed as
* part of the escape.
*/
newtemp[j++] = '\\';
newtemp[j++] = 'x';
newtemp[j++] = '3';
}
newtemp[j++] = c;
} else if (c != '\\') {
if (inQuote) newtemp[j++] = '\\';
newtemp[j++] = c;
} else if (inQuote) {
if (temp[i] == 'E') {
i++;
inQuote = false;
} else {
newtemp[j++] = '\\';
newtemp[j++] = '\\';
}
} else {
if (temp[i] == 'Q') {
i++;
inQuote = true;
beginQuote = true;
continue;
} else {
newtemp[j++] = c;
if (i != pLen)
newtemp[j++] = temp[i++];
}
}
beginQuote = false;
}
patternLength = j;
temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
}
/**
* Copies regular expression to an int array and invokes the parsing
* of the expression which will create the object tree.
*/
private void compile() {
// Handle canonical equivalences
if (has(CANON_EQ) && !has(LITERAL)) {
normalize();
} else {
normalizedPattern = pattern;
}
patternLength = normalizedPattern.length();
// Copy pattern to int array for convenience
// Use double zero to terminate pattern
temp = new int[patternLength + 2];
hasSupplementary = false;
int c, count = 0;
// Convert all chars into code points
for (int x = 0; x < patternLength; x += Character.charCount(c)) {
c = normalizedPattern.codePointAt(x);
if (isSupplementary(c)) {
hasSupplementary = true;
}
temp[count++] = c;
}
patternLength = count; // patternLength now in code points
if (! has(LITERAL))
RemoveQEQuoting();
// Allocate all temporary objects here.
buffer = new int[32];
groupNodes = new GroupHead[10];
namedGroups = null;
if (has(LITERAL)) {
// Literal pattern handling
matchRoot = newSlice(temp, patternLength, hasSupplementary);
matchRoot.next = lastAccept;
} else {
// Start recursive descent parsing
matchRoot = expr(lastAccept);
// Check extra pattern characters
if (patternLength != cursor) {
if (peek() == ')') {
throw error("Unmatched closing ')'");
} else {
throw error("Unexpected internal error");
}
}
}
// Peephole optimization
if (matchRoot instanceof Slice) {
root = BnM.optimize(matchRoot);
if (root == matchRoot) {
root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
}
} else if (matchRoot instanceof Begin || matchRoot instanceof First) {
root = matchRoot;
} else {
root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
}
// Release temporary storage
temp = null;
buffer = null;
groupNodes = null;
patternLength = 0;
compiled = true;
}
Map<String, Integer> namedGroups() {
if (namedGroups == null)
namedGroups = new HashMap<>(2);
return namedGroups;
}
/**
* Used to print out a subtree of the Pattern to help with debugging.
*/
private static void printObjectTree(Node node) {
while(node != null) {
if (node instanceof Prolog) {
System.out.println(node);
printObjectTree(((Prolog)node).loop);
System.out.println("**** end contents prolog loop");
} else if (node instanceof Loop) {
System.out.println(node);
printObjectTree(((Loop)node).body);
System.out.println("**** end contents Loop body");
} else if (node instanceof Curly) {
System.out.println(node);
printObjectTree(((Curly)node).atom);
System.out.println("**** end contents Curly body");
} else if (node instanceof GroupCurly) {
System.out.println(node);
printObjectTree(((GroupCurly)node).atom);
System.out.println("**** end contents GroupCurly body");
} else if (node instanceof GroupTail) {
System.out.println(node);
System.out.println("Tail next is "+node.next);
return;
} else {
System.out.println(node);
}
node = node.next;
if (node != null)
System.out.println("->next:");
if (node == Pattern.accept) {
System.out.println("Accept Node");
node = null;
}
}
}
/**
* Used to accumulate information about a subtree of the object graph
* so that optimizations can be applied to the subtree.
*/
static final class TreeInfo {
int minLength;
int maxLength;
boolean maxValid;
boolean deterministic;
TreeInfo() {
reset();
}
void reset() {
minLength = 0;
maxLength = 0;
maxValid = true;
deterministic = true;
}
}
/*
* The following private methods are mainly used to improve the
* readability of the code. In order to let the Java compiler easily
* inline them, we should not put many assertions or error checks in them.
*/
/**
* Indicates whether a particular flag is set or not.
*/
private boolean has(int f) {
return (flags & f) != 0;
}
/**
* Match next character, signal error if failed.
*/
private void accept(int ch, String s) {
int testChar = temp[cursor++];
if (has(COMMENTS))
testChar = parsePastWhitespace(testChar);
if (ch != testChar) {
throw error(s);
}
}
/**
* Mark the end of pattern with a specific character.
*/
private void mark(int c) {
temp[patternLength] = c;
}
/**
* Peek the next character, and do not advance the cursor.
*/
private int peek() {
int ch = temp[cursor];
if (has(COMMENTS))
ch = peekPastWhitespace(ch);
return ch;
}
/**
* Read the next character, and advance the cursor by one.
*/
private int read() {
int ch = temp[cursor++];
if (has(COMMENTS))
ch = parsePastWhitespace(ch);
return ch;
}
/**
* Read the next character, and advance the cursor by one,
* ignoring the COMMENTS setting
*/
private int readEscaped() {
int ch = temp[cursor++];
return ch;
}
/**
* Advance the cursor by one, and peek the next character.
*/
private int next() {
int ch = temp[++cursor];
if (has(COMMENTS))
ch = peekPastWhitespace(ch);
return ch;
}
/**
* Advance the cursor by one, and peek the next character,
* ignoring the COMMENTS setting
*/
private int nextEscaped() {
int ch = temp[++cursor];
return ch;
}
/**
* If in xmode peek past whitespace and comments.
*/
private int peekPastWhitespace(int ch) {
while (ASCII.isSpace(ch) || ch == '#') {
while (ASCII.isSpace(ch))
ch = temp[++cursor];
if (ch == '#') {
ch = peekPastLine();
}
}
return ch;
}
/**
* If in xmode parse past whitespace and comments.
*/
private int parsePastWhitespace(int ch) {
while (ASCII.isSpace(ch) || ch == '#') {
while (ASCII.isSpace(ch))
ch = temp[cursor++];
if (ch == '#')
ch = parsePastLine();
}
return ch;
}
/**
* xmode parse past comment to end of line.
*/
private int parsePastLine() {
int ch = temp[cursor++];
while (ch != 0 && !isLineSeparator(ch))
ch = temp[cursor++];
return ch;
}
/**
* xmode peek past comment to end of line.
*/
private int peekPastLine() {
int ch = temp[++cursor];
while (ch != 0 && !isLineSeparator(ch))
ch = temp[++cursor];
return ch;
}
/**
* Determines if character is a line separator in the current mode
*/
private boolean isLineSeparator(int ch) {
if (has(UNIX_LINES)) {
return ch == '\n';
} else {
return (ch == '\n' ||
ch == '\r' ||
(ch|1) == '\u2029' ||
ch == '\u0085');
}
}
/**
* Read the character after the next one, and advance the cursor by two.
*/
private int skip() {
int i = cursor;
int ch = temp[i+1];
cursor = i + 2;
return ch;
}
/**
* Unread one next character, and retreat cursor by one.
*/
private void unread() {
cursor--;
}
/**
* Internal method used for handling all syntax errors. The pattern is
* displayed with a pointer to aid in locating the syntax error.
*/
private PatternSyntaxException error(String s) {
return new PatternSyntaxException(s, normalizedPattern, cursor - 1);
}
/**
* Determines if there is any supplementary character or unpaired
* surrogate in the specified range.
*/
private boolean findSupplementary(int start, int end) {
for (int i = start; i < end; i++) {
if (isSupplementary(temp[i]))
return true;
}
return false;
}
/**
* Determines if the specified code point is a supplementary
* character or unpaired surrogate.
*/
private static final boolean isSupplementary(int ch) {
return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
Character.isSurrogate((char)ch);
}
/**
* The following methods handle the main parsing. They are sorted
* according to their precedence order, the lowest one first.
*/
/**
* The expression is parsed with branch nodes added for alternations.
* This may be called recursively to parse sub expressions that may
* contain alternations.
*/
private Node expr(Node end) {
Node prev = null;
Node firstTail = null;
Branch branch = null;
Node branchConn = null;
for (;;) {
Node node = sequence(end);
Node nodeTail = root; //double return
if (prev == null) {
prev = node;
firstTail = nodeTail;
} else {
// Branch
if (branchConn == null) {
branchConn = new BranchConn();
branchConn.next = end;
}
if (node == end) {
// if the node returned from sequence() is "end"
// we have an empty expr, set a null atom into
// the branch to indicate to go "next" directly.
node = null;
} else {
// the "tail.next" of each atom goes to branchConn
nodeTail.next = branchConn;
}
if (prev == branch) {
branch.add(node);
} else {
if (prev == end) {
prev = null;
} else {
// replace the "end" with "branchConn" at its tail.next
// when put the "prev" into the branch as the first atom.
firstTail.next = branchConn;
}
prev = branch = new Branch(prev, node, branchConn);
}
}
if (peek() != '|') {
return prev;
}
next();
}
}
@SuppressWarnings("fallthrough")
/**
* Parsing of sequences between alternations.
*/
private Node sequence(Node end) {
Node head = null;
Node tail = null;
Node node = null;
LOOP:
for (;;) {
int ch = peek();
switch (ch) {
case '(':
// Because group handles its own closure,
// we need to treat it differently
node = group0();
// Check for comment or flag group
if (node == null)
continue;
if (head == null)
head = node;
else
tail.next = node;
// Double return: Tail was returned in root
tail = root;
continue;
case '[':
node = clazz(true);
break;
case '\\':
ch = nextEscaped();
if (ch == 'p' || ch == 'P') {
boolean oneLetter = true;
boolean comp = (ch == 'P');
ch = next(); // Consume { if present
if (ch != '{') {
unread();
} else {
oneLetter = false;
}
node = family(oneLetter, comp);
} else {
unread();
node = atom();
}
break;
case '^':
next();
if (has(MULTILINE)) {
if (has(UNIX_LINES))
node = new UnixCaret();
else
node = new Caret();
} else {
node = new Begin();
}
break;
case '$':
next();
if (has(UNIX_LINES))
node = new UnixDollar(has(MULTILINE));
else
node = new Dollar(has(MULTILINE));
break;
case '.':
next();
if (has(DOTALL)) {
node = new All();
} else {
if (has(UNIX_LINES))
node = new UnixDot();
else {
node = new Dot();
}
}
break;
case '|':
case ')':
break LOOP;
case ']': // Now interpreting dangling ] and } as literals
case '}':
node = atom();
break;
case '?':
case '*':
case '+':
next();
throw error("Dangling meta character '" + ((char)ch) + "'");
case 0:
if (cursor >= patternLength) {
break LOOP;
}
// Fall through
default:
node = atom();
break;
}
node = closure(node);
if (head == null) {
head = tail = node;
} else {
tail.next = node;
tail = node;
}
}
if (head == null) {
return end;
}
tail.next = end;
root = tail; //double return
return head;
}
@SuppressWarnings("fallthrough")
/**
* Parse and add a new Single or Slice.
*/
private Node atom() {
int first = 0;
int prev = -1;
boolean hasSupplementary = false;
int ch = peek();
for (;;) {
switch (ch) {
case '*':
case '+':
case '?':
case '{':
if (first > 1) {
cursor = prev; // Unwind one character
first--;
}
break;
case '$':
case '.':
case '^':
case '(':
case '[':
case '|':
case ')':
break;
case '\\':
ch = nextEscaped();
if (ch == 'p' || ch == 'P') { // Property
if (first > 0) { // Slice is waiting; handle it first
unread();
break;
} else { // No slice; just return the family node
boolean comp = (ch == 'P');
boolean oneLetter = true;
ch = next(); // Consume { if present
if (ch != '{')
unread();
else
oneLetter = false;
return family(oneLetter, comp);
}
}
unread();
prev = cursor;
ch = escape(false, first == 0, false);
if (ch >= 0) {
append(ch, first);
first++;
if (isSupplementary(ch)) {
hasSupplementary = true;
}
ch = peek();
continue;
} else if (first == 0) {
return root;
}
// Unwind meta escape sequence
cursor = prev;
break;
case 0:
if (cursor >= patternLength) {
break;
}
// Fall through
default:
prev = cursor;
append(ch, first);
first++;
if (isSupplementary(ch)) {
hasSupplementary = true;
}
ch = next();
continue;
}
break;
}
if (first == 1) {
return newSingle(buffer[0]);
} else {
return newSlice(buffer, first, hasSupplementary);
}
}
private void append(int ch, int len) {
if (len >= buffer.length) {
int[] tmp = new int[len+len];
System.arraycopy(buffer, 0, tmp, 0, len);
buffer = tmp;
}
buffer[len] = ch;
}
/**
* Parses a backref greedily, taking as many numbers as it
* can. The first digit is always treated as a backref, but
* multi digit numbers are only treated as a backref if at
* least that many backrefs exist at this point in the regex.
*/
private Node ref(int refNum) {
boolean done = false;
while(!done) {
int ch = peek();
switch(ch) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
int newRefNum = (refNum * 10) + (ch - '0');
// Add another number if it doesn't make a group
// that doesn't exist
if (capturingGroupCount - 1 < newRefNum) {
done = true;
break;
}
refNum = newRefNum;
read();
break;
default:
done = true;
break;
}
}
if (has(CASE_INSENSITIVE))
return new CIBackRef(refNum, has(UNICODE_CASE));
else
return new BackRef(refNum);
}
/**
* Parses an escape sequence to determine the actual value that needs
* to be matched.
* If -1 is returned and create was true a new object was added to the tree
* to handle the escape sequence.
* If the returned value is greater than zero, it is the value that
* matches the escape sequence.
*/
private int escape(boolean inclass, boolean create, boolean isrange) {
int ch = skip();
switch (ch) {
case '0':
return o();
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
if (inclass) break;
if (create) {
root = ref((ch - '0'));
}
return -1;
case 'A':
if (inclass) break;
if (create) root = new Begin();
return -1;
case 'B':
if (inclass) break;
if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
return -1;
case 'C':
break;
case 'D':
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.DIGIT).complement()
: new Ctype(ASCII.DIGIT).complement();
return -1;
case 'E':
case 'F':
break;
case 'G':
if (inclass) break;
if (create) root = new LastMatch();
return -1;
case 'H':
if (create) root = new HorizWS().complement();
return -1;
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
break;
case 'R':
if (inclass) break;
if (create) root = new LineEnding();
return -1;
case 'S':
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WHITE_SPACE).complement()
: new Ctype(ASCII.SPACE).complement();
return -1;
case 'T':
case 'U':
break;
case 'V':
if (create) root = new VertWS().complement();
return -1;
case 'W':
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WORD).complement()
: new Ctype(ASCII.WORD).complement();
return -1;
case 'X':
case 'Y':
break;
case 'Z':
if (inclass) break;
if (create) {
if (has(UNIX_LINES))
root = new UnixDollar(false);
else
root = new Dollar(false);
}
return -1;
case 'a':
return '\007';
case 'b':
if (inclass) break;
if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
return -1;
case 'c':
return c();
case 'd':
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.DIGIT)
: new Ctype(ASCII.DIGIT);
return -1;
case 'e':
return '\033';
case 'f':
return '\f';
case 'g':
break;
case 'h':
if (create) root = new HorizWS();
return -1;
case 'i':
case 'j':
break;
case 'k':
if (inclass)
break;
if (read() != '<')
throw error("\\k is not followed by '<' for named capturing group");
String name = groupname(read());
if (!namedGroups().containsKey(name))
throw error("(named capturing group <"+ name+"> does not exit");
if (create) {
if (has(CASE_INSENSITIVE))
root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
else
root = new BackRef(namedGroups().get(name));
}
return -1;
case 'l':
case 'm':
break;
case 'n':
return '\n';
case 'o':
case 'p':
case 'q':
break;
case 'r':
return '\r';
case 's':
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WHITE_SPACE)
: new Ctype(ASCII.SPACE);
return -1;
case 't':
return '\t';
case 'u':
return u();
case 'v':
// '\v' was implemented as VT/0x0B in releases < 1.8 (though
// undocumented). In JDK8 '\v' is specified as a predefined
// character class for all vertical whitespace characters.
// So [-1, root=VertWS node] pair is returned (instead of a
// single 0x0B). This breaks the range if '\v' is used as
// the start or end value, such as [\v-...] or [...-\v], in
// which a single definite value (0x0B) is expected. For
// compatibility concern '\013'/0x0B is returned if isrange.
if (isrange)
return '\013';
if (create) root = new VertWS();
return -1;
case 'w':
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WORD)
: new Ctype(ASCII.WORD);
return -1;
case 'x':
return x();
case 'y':
break;
case 'z':
if (inclass) break;
if (create) root = new End();
return -1;
default:
return ch;
}
throw error("Illegal/unsupported escape sequence");
}
/**
* Parse a character class, and return the node that matches it.
*
* Consumes a ] on the way out if consume is true. Usually consume
* is true except for the case of [abc&&def] where def is a separate
* right hand node with "understood" brackets.
*/
private CharProperty clazz(boolean consume) {
CharProperty prev = null;
CharProperty node = null;
BitClass bits = new BitClass();
boolean include = true;
boolean firstInClass = true;
int ch = next();
for (;;) {
switch (ch) {
case '^':
// Negates if first char in a class, otherwise literal
if (firstInClass) {
if (temp[cursor-1] != '[')
break;
ch = next();
include = !include;
continue;
} else {
// ^ not first in class, treat as literal
break;
}
case '[':
firstInClass = false;
node = clazz(true);
if (prev == null)
prev = node;
else
prev = union(prev, node);
ch = peek();
continue;
case '&':
firstInClass = false;
ch = next();
if (ch == '&') {
ch = next();
CharProperty rightNode = null;
while (ch != ']' && ch != '&') {
if (ch == '[') {
if (rightNode == null)
rightNode = clazz(true);
else
rightNode = union(rightNode, clazz(true));
} else { // abc&&def
unread();
rightNode = clazz(false);
}
ch = peek();
}
if (rightNode != null)
node = rightNode;
if (prev == null) {
if (rightNode == null)
throw error("Bad class syntax");
else
prev = rightNode;
} else {
prev = intersection(prev, node);
}
} else {
// treat as a literal &
unread();
break;
}
continue;
case 0:
firstInClass = false;
if (cursor >= patternLength)
throw error("Unclosed character class");
break;
case ']':
firstInClass = false;
if (prev != null) {
if (consume)
next();
return prev;
}
break;
default:
firstInClass = false;
break;
}
node = range(bits);
if (include) {
if (prev == null) {
prev = node;
} else {
if (prev != node)
prev = union(prev, node);
}
} else {
if (prev == null) {
prev = node.complement();
} else {
if (prev != node)
prev = setDifference(prev, node);
}
}
ch = peek();
}
}
private CharProperty bitsOrSingle(BitClass bits, int ch) {
/* Bits can only handle codepoints in [u+0000-u+00ff] range.
Use "single" node instead of bits when dealing with unicode
case folding for codepoints listed below.
(1)Uppercase out of range: u+00ff, u+00b5
toUpperCase(u+00ff) -> u+0178
toUpperCase(u+00b5) -> u+039c
(2)LatinSmallLetterLongS u+17f
toUpperCase(u+017f) -> u+0053
(3)LatinSmallLetterDotlessI u+131
toUpperCase(u+0131) -> u+0049
(4)LatinCapitalLetterIWithDotAbove u+0130
toLowerCase(u+0130) -> u+0069
(5)KelvinSign u+212a
toLowerCase(u+212a) ==> u+006B
(6)AngstromSign u+212b
toLowerCase(u+212b) ==> u+00e5
*/
int d;
if (ch < 256 &&
!(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
(ch == 0xff || ch == 0xb5 ||
ch == 0x49 || ch == 0x69 || //I and i
ch == 0x53 || ch == 0x73 || //S and s
ch == 0x4b || ch == 0x6b || //K and k
ch == 0xc5 || ch == 0xe5))) //A+ring
return bits.add(ch, flags());
return newSingle(ch);
}
/**
* Parse a single character or a character range in a character class
* and return its representative node.
*/
private CharProperty range(BitClass bits) {
int ch = peek();
if (ch == '\\') {
ch = nextEscaped();
if (ch == 'p' || ch == 'P') { // A property
boolean comp = (ch == 'P');
boolean oneLetter = true;
// Consume { if present
ch = next();
if (ch != '{')
unread();
else
oneLetter = false;
return family(oneLetter, comp);
} else { // ordinary escape
boolean isrange = temp[cursor+1] == '-';
unread();
ch = escape(true, true, isrange);
if (ch == -1)
return (CharProperty) root;
}
} else {
next();
}
if (ch >= 0) {
if (peek() == '-') {
int endRange = temp[cursor+1];
if (endRange == '[') {
return bitsOrSingle(bits, ch);
}
if (endRange != ']') {
next();
int m = peek();
if (m == '\\') {
m = escape(true, false, true);
} else {
next();
}
if (m < ch) {
throw error("Illegal character range");
}
if (has(CASE_INSENSITIVE))
return caseInsensitiveRangeFor(ch, m);
else
return rangeFor(ch, m);
}
}
return bitsOrSingle(bits, ch);
}
throw error("Unexpected character '"+((char)ch)+"'");
}
/**
* Parses a Unicode character family and returns its representative node.
*/
private CharProperty family(boolean singleLetter,
boolean maybeComplement)
{
next();
String name;
CharProperty node = null;
if (singleLetter) {
int c = temp[cursor];
if (!Character.isSupplementaryCodePoint(c)) {
name = String.valueOf((char)c);
} else {
name = new String(temp, cursor, 1);
}
read();
} else {
int i = cursor;
mark('}');
while(read() != '}') {
}
mark('\000');
int j = cursor;
if (j > patternLength)
throw error("Unclosed character family");
if (i + 1 >= j)
throw error("Empty character family");
name = new String(temp, i, j-i-1);
}
int i = name.indexOf('=');
if (i != -1) {
// property construct \p{name=value}
String value = name.substring(i + 1);
name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
if ("sc".equals(name) || "script".equals(name)) {
node = unicodeScriptPropertyFor(value);
} else if ("blk".equals(name) || "block".equals(name)) {
node = unicodeBlockPropertyFor(value);
} else if ("gc".equals(name) || "general_category".equals(name)) {
node = charPropertyNodeFor(value);
} else {
throw error("Unknown Unicode property {name=<" + name + ">, "
+ "value=<" + value + ">}");
}
} else {
if (name.startsWith("In")) {
// \p{inBlockName}
node = unicodeBlockPropertyFor(name.substring(2));
} else if (name.startsWith("Is")) {
// \p{isGeneralCategory} and \p{isScriptName}
name = name.substring(2);
UnicodeProp uprop = UnicodeProp.forName(name);
if (uprop != null)
node = new Utype(uprop);
if (node == null)
node = CharPropertyNames.charPropertyFor(name);
if (node == null)
node = unicodeScriptPropertyFor(name);
} else {
if (has(UNICODE_CHARACTER_CLASS)) {
UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
if (uprop != null)
node = new Utype(uprop);
}
if (node == null)
node = charPropertyNodeFor(name);
}
}
if (maybeComplement) {
if (node instanceof Category || node instanceof Block)
hasSupplementary = true;
node = node.complement();
}
return node;
}
/**
* Returns a CharProperty matching all characters belong to
* a UnicodeScript.
*/
private CharProperty unicodeScriptPropertyFor(String name) {
final Character.UnicodeScript script;
try {
script = Character.UnicodeScript.forName(name);
} catch (IllegalArgumentException iae) {
throw error("Unknown character script name {" + name + "}");
}
return new Script(script);
}
/**
* Returns a CharProperty matching all characters in a UnicodeBlock.
*/
private CharProperty unicodeBlockPropertyFor(String name) {
final Character.UnicodeBlock block;
try {
block = Character.UnicodeBlock.forName(name);
} catch (IllegalArgumentException iae) {
throw error("Unknown character block name {" + name + "}");
}
return new Block(block);
}
/**
* Returns a CharProperty matching all characters in a named property.
*/
private CharProperty charPropertyNodeFor(String name) {
CharProperty p = CharPropertyNames.charPropertyFor(name);
if (p == null)
throw error("Unknown character property name {" + name + "}");
return p;
}
/**
* Parses and returns the name of a "named capturing group", the trailing
* ">" is consumed after parsing.
*/
private String groupname(int ch) {
StringBuilder sb = new StringBuilder();
sb.append(Character.toChars(ch));
while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
ASCII.isDigit(ch)) {
sb.append(Character.toChars(ch));
}
if (sb.length() == 0)
throw error("named capturing group has 0 length name");
if (ch != '>')
throw error("named capturing group is missing trailing '>'");
return sb.toString();
}
/**
* Parses a group and returns the head node of a set of nodes that process
* the group. Sometimes a double return system is used where the tail is
* returned in root.
*/
private Node group0() {
boolean capturingGroup = false;
Node head = null;
Node tail = null;
int save = flags;
root = null;
int ch = next();
if (ch == '?') {
ch = skip();
switch (ch) {
case ':': // (?:xxx) pure group
head = createGroup(true);
tail = root;
head.next = expr(tail);
break;
case '=': // (?=xxx) and (?!xxx) lookahead
case '!':
head = createGroup(true);
tail = root;
head.next = expr(tail);
if (ch == '=') {
head = tail = new Pos(head);
} else {
head = tail = new Neg(head);
}
break;
case '>': // (?>xxx) independent group
head = createGroup(true);
tail = root;
head.next = expr(tail);
head = tail = new Ques(head, INDEPENDENT);
break;
case '<': // (?Here is a short list of links related to this Java Pattern.java source code file:
: |
o |
... this post is sponsored by my books ... | |
#1 New Release! |
FP Best Seller |
Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.
A percentage of advertising revenue from
pages under the /java/jwarehouse
URI on this website is
paid back to open source projects.