|
What this is
Other links
The source code
/*
* BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the
* Boyer-Moore algorithm
* Copyright (C) 1999, 2000 mike dillon
* Portions copyright (C) 2001 Tom Locke
* Portions copyright (C) 2001 Slava Pestov
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package org.jext.search;
import javax.swing.text.Segment;
import org.jext.scripting.python.Run;
public class BoyerMooreSearchMatcher implements SearchMatcher
{
// private members
private char[] pattern;
private String replace;
private boolean ignoreCase;
private boolean reverseSearch;
private boolean script;
private String pythonScript;
private Object[] replaceArgs;
// Boyer-Moore member fields
private int[] skip;
private int[] suffix;
/**
* Creates a new string literal matcher.
*/
public BoyerMooreSearchMatcher(String pattern, String replace, boolean ignoreCase, boolean reverseSearch,
boolean script, String pythonScript)
{
if (ignoreCase)
{
this.pattern = pattern.toUpperCase().toCharArray();
} else {
this.pattern = pattern.toCharArray();
}
if (reverseSearch)
{
char[] tmp = new char[this.pattern.length];
for (int i = 0; i < tmp.length; i++)
{
tmp[i] = this.pattern[this.pattern.length - (i + 1)];
}
this.pattern = tmp;
}
this.replace = replace;
this.ignoreCase = ignoreCase;
this.reverseSearch = reverseSearch;
this.script = script;
this.pythonScript = pythonScript;
replaceArgs = new Object[10];
generateSkipArray();
generateSuffixArray();
}
/**
* Returns the offset of the first match of the specified text
* within this matcher.
* @param text The text to search in
* @return an array where the first element is the start offset
* of the match, and the second element is the end offset of
* the match
*/
public int[] nextMatch(Segment text)
{
int pos = match(text.array, text.offset, text.offset + text.count);
if (pos == -1)
{
return null;
} else {
return new int[]{ pos - text.offset, pos + pattern.length - text.offset };
}
}
/**
* Returns the specified text, with any substitution specified
* within this matcher performed.
* @param text The text
*/
public String substitute(String text) throws Exception
{
if (script)
{
String[] args = new String[10];
args[0] = text;
Object obj = Run.eval(pythonScript, "_m", args, null);
if (obj == null)
return null;
else
return obj.toString();
} else
return replace;
}
/*
* a good introduction to the Boyer-Moore fast string matching
* algorithm may be found on Moore's website at:
*
* http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
*
*/
public int match(char[] text, int offset, int length)
{
// position variable for pattern start
int anchor = reverseSearch ? offset - 1 : offset;
// position variable for pattern test position
int pos;
// last possible start position of a match with this pattern;
// this is negative if the pattern is longer than the text
// causing the search loop below to immediately fail
int last_anchor = reverseSearch ? pattern.length - 1 : length - pattern.length;
// each time the pattern is checked, we start this many
// characters ahead of 'anchor'
int pattern_end = pattern.length - 1;
char ch = 0;
int bad_char;
int good_suffix;
// the search works by starting the anchor (first character
// of the pattern) at the initial offset. as long as the
// anchor is far enough from the enough of the text for the
// pattern to match, and until the pattern matches, we
// compare the pattern to the text from the last character
// to the first character in reverse order. where a character
// in the pattern mismatches, we use the two heuristics
// based on the mismatch character and its position in the
// pattern to determine the furthest we can move the anchor
// without missing any potential pattern matches.
SEARCH:
while (reverseSearch ? anchor >= last_anchor : anchor <= last_anchor)
{
for (pos = pattern_end; pos >= 0; --pos)
{
int idx = reverseSearch ? anchor - pos : anchor + pos;
ch = ignoreCase ? Character.toUpperCase(text[idx]) : text[idx];
// pattern test
if (ch != pattern[pos])
{
// character mismatch, determine how many characters to skip
// heuristic #1
bad_char = pos - skip[getSkipIndex(ch)];
// heuristic #2
good_suffix = suffix[pos];
// skip the greater of the two distances provided by the
// heuristics
int skip = (bad_char > good_suffix) ? bad_char : good_suffix;
anchor += reverseSearch ? -skip : skip;
// go back to the while loop
continue SEARCH;
}
}
// MATCH: return the position of its first character
return reverseSearch ? anchor - (pattern.length - 1) : anchor;
}
// MISMATCH: return -1 as defined by API
return -1;
}
// Boyer-Moore helper methods
/*
* the 'skip' array is used to determine for each index in the
* hashed alphabet how many characters can be skipped if
* a mismatch occurs on a characater hashing to that index.
*/
private void generateSkipArray()
{
// initialize the skip array to all zeros
skip = new int[256];
// leave the table cleanly-initialized for an empty pattern
if (pattern.length == 0)
return;
int pos = 0;
do
{
skip[getSkipIndex(pattern[pos])] = pos;
} while (++pos < pattern.length);
}
/*
* to avoid our skip table having a length of 2 ^ 16, we hash each
* character of the input into a character in the alphabet [\x00-\xFF]
* using the lower 8 bits of the character's value (resulting in
* a more reasonable skip table of length 2 ^ 8).
*
* the result of this is that more than one character can hash to the
* same index, but since the skip table encodes the position of
* occurence of the character furthest into the string with a particular
* index (whether or not it is the only character with that index), an
* index collision only means that that this heuristic will give a
* sub-optimal skip (i.e. a complete skip table could use the differences
* between colliding characters to maximal effect, at the expense of
* building a table that is over 2 orders of magnitude larger and very
* sparse).
*/
private static final int getSkipIndex(char ch)
{
return ((int) ch) & 0x000000FF;
}
/*
* XXX: hairy code that is basically just a functional(?) port of some
* other code i barely understood
*/
private void generateSuffixArray()
{
int m = pattern.length;
int j = m + 1;
suffix = new int[j];
int[] tmp = new int[j];
tmp[m] = j;
for (int i = m; i > 0; --i)
{
while (j <= m && pattern[i - 1] != pattern[j - 1])
{
if (suffix[j] == 0)
{
suffix[j] = j - i;
}
j = tmp[j];
}
tmp[i - 1] = --j;
}
int k = tmp[0];
for (j = 0; j <= m; j++)
{
// the code above builds a 1-indexed suffix array,
// but we shift it to be 0-indexed, ignoring the
// original 0-th element
if (j > 0)
{
suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j];
}
if (j == k)
{
k = tmp[k];
}
}
}
}
|
| ... this post is sponsored by my books ... | |
#1 New Release! |
FP Best Seller |
Copyright 1998-2024 Alvin Alexander, alvinalexander.com
All Rights Reserved.
A percentage of advertising revenue from
pages under the /java/jwarehouse
URI on this website is
paid back to open source projects.