alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (FuzzyTermEnum.java)

This example Lucene source code file (FuzzyTermEnum.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

filteredtermenum, fuzzytermenum, fuzzytermenum, illegalargumentexception, io, ioexception, ioexception, override, override, string, string, term

The Lucene FuzzyTermEnum.java source code

package org.apache.lucene.search;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

/** Subclass of FilteredTermEnum for enumerating all terms that are similar
 * to the specified filter term.
 *
 * <p>Term enumerations are always ordered by Term.compareTo().  Each term in
 * the enumeration is greater than all that precede it.
 */
public final class FuzzyTermEnum extends FilteredTermEnum {

  /* Allows us save time required to create a new array
   * every time similarity is called.
   */
  private int[] p;
  private int[] d;

  private float similarity;
  private boolean endEnum = false;

  private Term searchTerm = null;
  private final String field;
  private final char[] text;
  private final String prefix;

  private final float minimumSimilarity;
  private final float scale_factor;

  /**
   * Creates a FuzzyTermEnum with an empty prefix and a minSimilarity of 0.5f.
   * <p>
   * After calling the constructor the enumeration is already pointing to the first 
   * valid term if such a term exists. 
   * 
   * @param reader
   * @param term
   * @throws IOException
   * @see #FuzzyTermEnum(IndexReader, Term, float, int)
   */
  public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
    this(reader, term, FuzzyQuery.defaultMinSimilarity, FuzzyQuery.defaultPrefixLength);
  }
    
  /**
   * Creates a FuzzyTermEnum with an empty prefix.
   * <p>
   * After calling the constructor the enumeration is already pointing to the first 
   * valid term if such a term exists. 
   * 
   * @param reader
   * @param term
   * @param minSimilarity
   * @throws IOException
   * @see #FuzzyTermEnum(IndexReader, Term, float, int)
   */
  public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException {
    this(reader, term, minSimilarity, FuzzyQuery.defaultPrefixLength);
  }
    
  /**
   * Constructor for enumeration of all terms from specified <code>reader which share a prefix of
   * length <code>prefixLength with term and which have a fuzzy similarity >
   * <code>minSimilarity.
   * <p>
   * After calling the constructor the enumeration is already pointing to the first 
   * valid term if such a term exists. 
   * 
   * @param reader Delivers terms.
   * @param term Pattern term.
   * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
   * @param prefixLength Length of required common prefix. Default value is 0.
   * @throws IOException
   */
  public FuzzyTermEnum(IndexReader reader, Term term, final float minSimilarity, final int prefixLength) throws IOException {
    super();
    
    if (minSimilarity >= 1.0f)
      throw new IllegalArgumentException("minimumSimilarity cannot be greater than or equal to 1");
    else if (minSimilarity < 0.0f)
      throw new IllegalArgumentException("minimumSimilarity cannot be less than 0");
    if(prefixLength < 0)
      throw new IllegalArgumentException("prefixLength cannot be less than 0");

    this.minimumSimilarity = minSimilarity;
    this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
    this.searchTerm = term;
    this.field = searchTerm.field();

    //The prefix could be longer than the word.
    //It's kind of silly though.  It means we must match the entire word.
    final int fullSearchTermLength = searchTerm.text().length();
    final int realPrefixLength = prefixLength > fullSearchTermLength ? fullSearchTermLength : prefixLength;

    this.text = searchTerm.text().substring(realPrefixLength).toCharArray();
    this.prefix = searchTerm.text().substring(0, realPrefixLength);

    this.p = new int[this.text.length+1];
    this.d = new int[this.text.length+1];

    setEnum(reader.terms(new Term(searchTerm.field(), prefix)));
  }

  /**
   * The termCompare method in FuzzyTermEnum uses Levenshtein distance to 
   * calculate the distance between the given term and the comparing term. 
   */
  @Override
  protected final boolean termCompare(Term term) {
    if (field == term.field() && term.text().startsWith(prefix)) {
        final String target = term.text().substring(prefix.length());
        this.similarity = similarity(target);
        return (similarity > minimumSimilarity);
    }
    endEnum = true;
    return false;
  }
  
  /** {@inheritDoc} */
  @Override
  public final float difference() {
    return (similarity - minimumSimilarity) * scale_factor;
  }
  
  /** {@inheritDoc} */
  @Override
  public final boolean endEnum() {
    return endEnum;
  }
  
  /******************************
   * Compute Levenshtein distance
   ******************************/
  
  /**
   * <p>Similarity returns a number that is 1.0f or less (including negative numbers)
   * based on how similar the Term is compared to a target term.  It returns
   * exactly 0.0f when
   * <pre>
   *    editDistance > maximumEditDistance</pre>
   * Otherwise it returns:
   * <pre>
   *    1 - (editDistance / length)</pre>
   * where length is the length of the shortest term (text or target) including a
   * prefix that are identical and editDistance is the Levenshtein distance for
   * the two words.</p>
   *
   * <p>Embedded within this algorithm is a fail-fast Levenshtein distance
   * algorithm.  The fail-fast algorithm differs from the standard Levenshtein
   * distance algorithm in that it is aborted if it is discovered that the
   * minimum distance between the words is greater than some threshold.
   *
   * <p>To calculate the maximum distance threshold we use the following formula:
   * <pre>
   *     (1 - minimumSimilarity) * length</pre>
   * where length is the shortest term including any prefix that is not part of the
   * similarity comparison.  This formula was derived by solving for what maximum value
   * of distance returns false for the following statements:
   * <pre>
   *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
   *   return (similarity > minimumSimilarity);</pre>
   * where distance is the Levenshtein distance for the two words.
   * </p>
   * <p>Levenshtein distance (also known as edit distance) is a measure of similarity
   * between two strings where the distance is measured as the number of character
   * deletions, insertions or substitutions required to transform one string to
   * the other string.
   * @param target the target word or phrase
   * @return the similarity,  0.0 or less indicates that it matches less than the required
   * threshold and 1.0 indicates that the text and target are identical
   */
  private float similarity(final String target) {
    final int m = target.length();
    final int n = text.length;
    if (n == 0)  {
      //we don't have anything to compare.  That means if we just add
      //the letters for m we get the new word
      return prefix.length() == 0 ? 0.0f : 1.0f - ((float) m / prefix.length());
    }
    if (m == 0) {
      return prefix.length() == 0 ? 0.0f : 1.0f - ((float) n / prefix.length());
    }

    final int maxDistance = calculateMaxDistance(m);

    if (maxDistance < Math.abs(m-n)) {
      //just adding the characters of m to n or vice-versa results in
      //too many edits
      //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
      //given this optimal circumstance, the edit distance cannot be less than 5.
      //which is 8-3 or more precisely Math.abs(3-8).
      //if our maximum edit distance is 4, then we can discard this word
      //without looking at it.
      return 0.0f;
    }

    // init matrix d
    for (int i = 0; i<=n; ++i) {
      p[i] = i;
    }

    // start computing edit distance
    for (int j = 1; j<=m; ++j) { // iterates through target
      int bestPossibleEditDistance = m;
      final char t_j = target.charAt(j-1); // jth character of t
      d[0] = j;

      for (int i=1; i<=n; ++i) { // iterates through text
        // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
        if (t_j != text[i-1]) {
          d[i] = Math.min(Math.min(d[i-1], p[i]),  p[i-1]) + 1;
		} else {
          d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1),  p[i-1]);
		}
        bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i]);
      }

      //After calculating row i, the best possible edit distance
      //can be found by found by finding the smallest value in a given column.
      //If the bestPossibleEditDistance is greater than the max distance, abort.

      if (j > maxDistance && bestPossibleEditDistance > maxDistance) {  //equal is okay, but not greater
        //the closest the target can be to the text is just too far away.
        //this target is leaving the party early.
        return 0.0f;
      }

      // copy current distance counts to 'previous row' distance counts: swap p and d
      int _d[] = p;
      p = d;
      d = _d;
    }

    // our last action in the above loop was to switch d and p, so p now
    // actually has the most recent cost counts

    // this will return less than 0.0 when the edit distance is
    // greater than the number of characters in the shorter word.
    // but this was the formula that was previously used in FuzzyTermEnum,
    // so it has not been changed (even though minimumSimilarity must be
    // greater than 0.0)
    return 1.0f - ((float)p[n] / (float) (prefix.length() + Math.min(n, m)));
  }

  /**
   * The max Distance is the maximum Levenshtein distance for the text
   * compared to some other value that results in score that is
   * better than the minimum similarity.
   * @param m the length of the "other value"
   * @return the maximum levenshtein distance that we care about
   */
  private int calculateMaxDistance(int m) {
    return (int) ((1-minimumSimilarity) * (Math.min(text.length, m) + prefix.length()));
  }

  /** {@inheritDoc} */
  @Override
  public void close() throws IOException {
    p = d = null;
    searchTerm = null;
    super.close();  //call super.close() and let the garbage collector do its work.
  }
  
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene FuzzyTermEnum.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.