home | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (FuzzyLikeThisQuery.java)

This example Lucene source code file (FuzzyLikeThisQuery.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

arraylist, booleanquery, fieldvals, fuzzylikethisquery, io, iterator, override, override, scoreterm, scoreterm, scoretermqueue, string, string, term, term, util

The Lucene FuzzyLikeThisQuery.java source code

package org.apache.lucene.search;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.util.PriorityQueue;

/**
 * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
 * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
 * of fuzzy scoring factors.
 * This generally produces good results for queries where users may provide details in a number of 
 * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
 * a fast query.
 * 
 * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
 * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
 * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer 
 * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query 
 * term) and this is factored into the variant's boost. If the source query term does not exist in the
 * index the average IDF of the variants is used.
 */
public class FuzzyLikeThisQuery extends Query
{
    static Similarity sim=new DefaultSimilarity();
    Query rewrittenQuery=null;
    ArrayList<FieldVals> fieldVals=new ArrayList();
    Analyzer analyzer;
    
    ScoreTermQueue q;
    int MAX_VARIANTS_PER_TERM=50;
    boolean ignoreTF=false;
    private int maxNumTerms;

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + ((analyzer == null) ? 0 : analyzer.hashCode());
      result = prime * result
          + ((fieldVals == null) ? 0 : fieldVals.hashCode());
      result = prime * result + (ignoreTF ? 1231 : 1237);
      result = prime * result + maxNumTerms;
      return result;
    }

    @Override
    public boolean equals(Object obj) {
      if (this == obj)
        return true;
      if (obj == null)
        return false;
      if (getClass() != obj.getClass())
        return false;
      FuzzyLikeThisQuery other = (FuzzyLikeThisQuery) obj;
      if (analyzer == null) {
        if (other.analyzer != null)
          return false;
      } else if (!analyzer.equals(other.analyzer))
        return false;
      if (fieldVals == null) {
        if (other.fieldVals != null)
          return false;
      } else if (!fieldVals.equals(other.fieldVals))
        return false;
      if (ignoreTF != other.ignoreTF)
        return false;
      if (maxNumTerms != other.maxNumTerms)
        return false;
      return true;
    }


    /**
     * 
     * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
     * @param analyzer
     */
    public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
    {
        q=new ScoreTermQueue(maxNumTerms);
        this.analyzer=analyzer;
        this.maxNumTerms = maxNumTerms;
    }

    class FieldVals
    {
    	String queryString;
    	String fieldName;
    	float minSimilarity;
    	int prefixLength;
		public FieldVals(String name, float similarity, int length, String queryString)
		{
			fieldName = name;
			minSimilarity = similarity;
			prefixLength = length;
			this.queryString = queryString;
		}

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result
          + ((fieldName == null) ? 0 : fieldName.hashCode());
      result = prime * result + Float.floatToIntBits(minSimilarity);
      result = prime * result + prefixLength;
      result = prime * result
          + ((queryString == null) ? 0 : queryString.hashCode());
      return result;
    }

    @Override
    public boolean equals(Object obj) {
      if (this == obj)
        return true;
      if (obj == null)
        return false;
      if (getClass() != obj.getClass())
        return false;
      FieldVals other = (FieldVals) obj;
      if (fieldName == null) {
        if (other.fieldName != null)
          return false;
      } else if (!fieldName.equals(other.fieldName))
        return false;
      if (Float.floatToIntBits(minSimilarity) != Float
          .floatToIntBits(other.minSimilarity))
        return false;
      if (prefixLength != other.prefixLength)
        return false;
      if (queryString == null) {
        if (other.queryString != null)
          return false;
      } else if (!queryString.equals(other.queryString))
        return false;
      return true;
    }
    

    	
    }
    
    /**
     * Adds user input for "fuzzification" 
     * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
     * @param fieldName
     * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
     * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
     */
    public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength) 
    {
    	fieldVals.add(new FieldVals(fieldName,minSimilarity,prefixLength,queryString));
    }
    
    
    private void addTerms(IndexReader reader,FieldVals f) throws IOException
    {
        if(f.queryString==null) return;
        TokenStream ts=analyzer.reusableTokenStream(f.fieldName,new StringReader(f.queryString));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        
        int corpusNumDocs=reader.numDocs();
        Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
        HashSet<String> processedTerms=new HashSet();
        ts.reset();
        while (ts.incrementToken()) 
        {
                String term = termAtt.toString();
        	if(!processedTerms.contains(term))
        	{
        		processedTerms.add(term);
                ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
                float minScore=0;
                Term startTerm=internSavingTemplateTerm.createTerm(term);
                FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
                TermEnum origEnum = reader.terms(startTerm);
                int df=0;
                if(startTerm.equals(origEnum.term()))
                {
                    df=origEnum.docFreq(); //store the df so all variants use same idf
                }
                int numVariants=0;
                int totalVariantDocFreqs=0;
                do
                {
                    Term possibleMatch=fe.term();
                    if(possibleMatch!=null)
                    {
    	                numVariants++;
    	                totalVariantDocFreqs+=fe.docFreq();
    	                float score=fe.difference();
    	                if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
    	                    ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm);                    
    	                    variantsQ.insertWithOverflow(st);
    	                    minScore = variantsQ.top().score; // maintain minScore
    	                }
                    }
                }
                while(fe.next());
                if(numVariants>0)
                {
	                int avgDf=totalVariantDocFreqs/numVariants;
	                if(df==0)//no direct match we can use as df for all variants 
	                {
	                    df=avgDf; //use avg df of all variants
	                }
	                
	                // take the top variants (scored by edit distance) and reset the score
	                // to include an IDF factor then add to the global queue for ranking 
	                // overall top query terms
	                int size = variantsQ.size();
	                for(int i = 0; i < size; i++)
	                {
	                  ScoreTerm st = variantsQ.pop();
	                  st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs);
	                  q.insertWithOverflow(st);
	                }                            
                }
        	}
        }
        ts.end();
        ts.close();
    }
            
    @Override
    public Query rewrite(IndexReader reader) throws IOException
    {
        if(rewrittenQuery!=null)
        {
            return rewrittenQuery;
        }
        //load up the list of possible terms
        for (Iterator<FieldVals> iter = fieldVals.iterator(); iter.hasNext();)
		{
			FieldVals f = iter.next();
			addTerms(reader,f);			
		}
        //clear the list of fields
        fieldVals.clear();
        
        BooleanQuery bq=new BooleanQuery();
        
        
        //create BooleanQueries to hold the variants for each token/field pair and ensure it
        // has no coord factor
        //Step 1: sort the termqueries by term/field
        HashMap<Term,ArrayList variantQueries=new HashMap>();
        int size = q.size();
        for(int i = 0; i < size; i++)
        {
          ScoreTerm st = q.pop();
          ArrayList<ScoreTerm> l= variantQueries.get(st.fuzziedSourceTerm);
          if(l==null)
          {
              l=new ArrayList<ScoreTerm>();
              variantQueries.put(st.fuzziedSourceTerm,l);
          }
          l.add(st);
        }
        //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
        for (Iterator<ArrayList iter = variantQueries.values().iterator(); iter.hasNext();)
        {
            ArrayList<ScoreTerm> variants = iter.next();
            if(variants.size()==1)
            {
                //optimize where only one selected variant
                ScoreTerm st= variants.get(0);
                TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);
                tq.setBoost(st.score); // set the boost to a mix of IDF and score
                bq.add(tq, BooleanClause.Occur.SHOULD); 
            }
            else
            {
                BooleanQuery termVariants=new BooleanQuery(true); //disable coord and IDF for these term variants
                for (Iterator<ScoreTerm> iterator2 = variants.iterator(); iterator2
                        .hasNext();)
                {
                    ScoreTerm st = iterator2.next();
                    TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);      // found a match
                    tq.setBoost(st.score); // set the boost using the ScoreTerm's score
                    termVariants.add(tq, BooleanClause.Occur.SHOULD);          // add to query                    
                }
                bq.add(termVariants, BooleanClause.Occur.SHOULD);          // add to query
            }
        }
        //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
        // booleans with a minimum-should-match of NumFields-1?
        bq.setBoost(getBoost());
        this.rewrittenQuery=bq;
        return bq;
    }
    
    //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
    // term variants) then is reset with IDF for use in ranking against all other
    // terms/fields
    private static class ScoreTerm{
        public Term term;
        public float score;
        Term fuzziedSourceTerm;
        
        public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){
          this.term = term;
          this.score = score;
          this.fuzziedSourceTerm=fuzziedSourceTerm;
        }
      }
      
      private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {        
        public ScoreTermQueue(int size){
          initialize(size);
        }
        
        /* (non-Javadoc)
         * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
         */
        @Override
        protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
          if (termA.score== termB.score)
            return termA.term.compareTo(termB.term) > 0;
          else
            return termA.score < termB.score;
        }
        
      }
      
      //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
      private static class FuzzyTermQuery extends TermQuery
      {
    	  boolean ignoreTF;
          public FuzzyTermQuery(Term t, boolean ignoreTF)
          {
        	  super(t);
        	  this.ignoreTF=ignoreTF;
          }
          @Override
          public Similarity getSimilarity(Searcher searcher)
          {            
              Similarity result = super.getSimilarity(searcher);
              result = new SimilarityDelegator(result) {
                  
                  @Override
                  public float tf(float freq)
                  {
                	  if(ignoreTF)
                	  {
                          return 1; //ignore tf
                	  }
            		  return super.tf(freq);
                  }
                  @Override
                  public float idf(int docFreq, int numDocs)
                  {
                      //IDF is already factored into individual term boosts
                      return 1;
                  }               
              };
              return result;
          }        
      }
      
      

    /* (non-Javadoc)
     * @see org.apache.lucene.search.Query#toString(java.lang.String)
     */
    @Override
    public String toString(String field)
    {
        return null;
    }


	public boolean isIgnoreTF()
	{
		return ignoreTF;
	}


	public void setIgnoreTF(boolean ignoreTF)
	{
		this.ignoreTF = ignoreTF;
	}   
    
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene FuzzyLikeThisQuery.java source code file:



my book on functional programming

 

new blog posts

 

Copyright 1998-2019 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.