|
What this is
Other links
The source code/* * BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the * Boyer-Moore algorithm * :tabSize=8:indentSize=8:noTabs=false: * :folding=explicit:collapseFolds=1: * * Copyright (C) 1999, 2000 mike dillon * Portions copyright (C) 2001 Tom Locke * Portions copyright (C) 2001, 2002 Slava Pestov * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ package org.gjt.sp.jedit.search; //{{{ Imports import bsh.BshMethod; import bsh.NameSpace; import gnu.regexp.CharIndexed; import org.gjt.sp.jedit.BeanShell; //}}} /** * Implements literal search using the Boyer-Moore algorithm. */ public class BoyerMooreSearchMatcher extends SearchMatcher { //{{{ BoyerMooreSearchMatcher constructor /** * Creates a new string literal matcher. */ public BoyerMooreSearchMatcher(String pattern, boolean ignoreCase) { this.pattern = pattern.toCharArray(); if(ignoreCase) { for(int i = 0; i < this.pattern.length; i++) { this.pattern[i] = Character.toUpperCase( this.pattern[i]); } } this.ignoreCase = ignoreCase; pattern_end = this.pattern.length - 1; } //}}} //{{{ nextMatch() method /** * Returns the offset of the first match of the specified text * within this matcher. * @param text The text to search in * @param start True if the start of the segment is the beginning of the * buffer * @param end True if the end of the segment is the end of the buffer * @param firstTime If false and the search string matched at the start * offset with length zero, automatically find next match * @param reverse If true, searching will be performed in a backward * direction. * @return an array where the first element is the start offset * of the match, and the second element is the end offset of * the match * @since jEdit 4.2pre4 */ public SearchMatcher.Match nextMatch(CharIndexed text, boolean start, boolean end, boolean firstTime, boolean reverse) { int pos = match(text,reverse); if (pos == -1) { return null; } else { returnValue.start = pos; returnValue.end = pos + pattern.length; return returnValue; } } //}}} //{{{ match() method /* * a good introduction to the Boyer-Moore fast string matching * algorithm may be found on Moore's website at: * * http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/ * */ public int match(CharIndexed text, boolean reverse) { //{{{ // lazily create skip and suffix arrays for either the // search pattern, or the reversed search pattern int[] skip, suffix; if(reverse) { if(back_skip == null) { back_skip = generateSkipArray(true); back_suffix = generateSuffixArray(true); } skip = back_skip; suffix = back_suffix; } else { if(fwd_skip == null) { fwd_skip = generateSkipArray(false); fwd_suffix = generateSuffixArray(false); } skip = fwd_skip; suffix = fwd_suffix; } //}}} // position variable for pattern test position int pos; // position variable for pattern start int anchor = 0; // last possible start position of a match with this pattern; // this is negative if the pattern is longer than the text // causing the search loop below to immediately fail //int last_anchor = reverseSearch // ? offset + pattern.length - 1 // : length - pattern.length; char ch = 0; int bad_char; int good_suffix; // the search works by starting the anchor (first character // of the pattern) at the initial offset. as long as the // anchor is far enough from the enough of the text for the // pattern to match, and until the pattern matches, we // compare the pattern to the text from the last character // to the first character in reverse order. where a character // in the pattern mismatches, we use the two heuristics // based on the mismatch character and its position in the // pattern to determine the furthest we can move the anchor // without missing any potential pattern matches. SEARCH: while (text.isValid()) { for (pos = pattern_end; pos >= 0; --pos) { ch = text.charAt(pos); if(ignoreCase) ch = Character.toUpperCase(ch); // pattern test if ((reverse ? ch != pattern[pattern_end - pos] : ch != pattern[pos])) { // character mismatch, determine how many characters to skip // heuristic #1 bad_char = pos - skip[getSkipIndex(ch)]; // heuristic #2 good_suffix = suffix[pos]; // skip the greater of the two distances provided by the // heuristics int skip_index = (bad_char > good_suffix) ? bad_char : good_suffix; anchor += skip_index; text.move(skip_index); // go back to the while loop continue SEARCH; } } // MATCH: return the position of its first character return anchor; } // MISMATCH: return -1 as defined by API return -1; } //}}} //{{{ Private members private char[] pattern; private int pattern_end; private boolean ignoreCase; // Boyer-Moore member fields private int[] fwd_skip; private int[] fwd_suffix; private int[] back_skip; private int[] back_suffix; //}}} // Boyer-Moore helper methods //{{{ generateSkipArray() method /* * the 'skip' array is used to determine for each index in the * hashed alphabet how many characters can be skipped if * a mismatch occurs on a characater hashing to that index. */ private int[] generateSkipArray(boolean reverse) { // initialize the skip array to all zeros int[] skip = new int[256]; // leave the table cleanly-initialized for an empty pattern if (pattern.length == 0) return skip; int pos = 0; do { skip[getSkipIndex(pattern[reverse ? pattern_end - pos : pos])] = pos; } while (++pos < pattern.length); return skip; } //}}} //{{{ getSkipIndex() method /* * to avoid our skip table having a length of 2 ^ 16, we hash each * character of the input into a character in the alphabet [\x00-\xFF] * using the lower 8 bits of the character's value (resulting in * a more reasonable skip table of length 2 ^ 8). * * the result of this is that more than one character can hash to the * same index, but since the skip table encodes the position of * occurence of the character furthest into the string with a particular * index (whether or not it is the only character with that index), an * index collision only means that that this heuristic will give a * sub-optimal skip (i.e. a complete skip table could use the differences * between colliding characters to maximal effect, at the expense of * building a table that is over 2 orders of magnitude larger and very * sparse). */ private static final int getSkipIndex(char ch) { return ((int) ch) & 0x000000FF; } //}}} //{{{ generateSuffixArray() method /* * XXX: hairy code that is basically just a functional(?) port of some * other code i barely understood */ private int[] generateSuffixArray(boolean reverse) { int m = pattern.length; int j = m + 1; int[] suffix = new int[j]; int[] tmp = new int[j]; tmp[m] = j; for (int i = m; i > 0; --i) { while (j <= m && pattern[reverse ? pattern_end - i + 1 : i - 1] != pattern[reverse ? pattern_end - j + 1 : j - 1]) { if (suffix[j] == 0) { suffix[j] = j - i; } j = tmp[j]; } tmp[i - 1] = --j; } int k = tmp[0]; for (j = 0; j <= m; j++) { // the code above builds a 1-indexed suffix array, // but we shift it to be 0-indexed, ignoring the // original 0-th element if (j > 0) { suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j]; } if (j == k) { k = tmp[k]; } } return suffix; } //}}} //}}} } |
... this post is sponsored by my books ... | |
#1 New Release! |
FP Best Seller |
Copyright 1998-2024 Alvin Alexander, alvinalexander.com
All Rights Reserved.
A percentage of advertising revenue from
pages under the /java/jwarehouse
URI on this website is
paid back to open source projects.