alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (SweetSpotSimilarity.java)

This example Lucene source code file (SweetSpotSimilarity.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

boolean, defaultsimilarity, fieldinvertstate, hashmap, hashmap, map, map, override, override, sweetspotsimilarity, sweetspotsimilarity, util

The Lucene SweetSpotSimilarity.java source code

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.misc;

import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.index.FieldInvertState;

import java.util.Map;
import java.util.HashMap;

/**
 * A similarity with a lengthNorm that provides for a "plateau" of
 * equally good lengths, and tf helper functions.
 *
 * <p>
 * For lengthNorm, a global min/max can be specified to define the
 * plateau of lengths that should all have a norm of 1.0.
 * Below the min, and above the max the lengthNorm drops off in a
 * sqrt function.
 * </p>
 * <p>
 * A per field min/max can be specified if different fields have
 * different sweet spots.
 * </p>
 *
 * <p>
 * For tf, baselineTf and hyperbolicTf functions are provided, which
 * subclasses can choose between.
 * </p>
 *
 */
public class SweetSpotSimilarity extends DefaultSimilarity {

  // Global length-norm parameters, used when a field has no override.
  private int ln_min = 1;
  private int ln_max = 1;
  private float ln_steep = 0.5f;

  // Per-field length-norm overrides, keyed by field name. They are only ever
  // populated by setLengthNormFactors(String, ...), so values are never null.
  private final Map<String,Number> ln_maxs = new HashMap<String,Number>(7);
  private final Map<String,Number> ln_mins = new HashMap<String,Number>(7);
  private final Map<String,Float> ln_steeps = new HashMap<String,Float>(7);
  private final Map<String,Boolean> ln_overlaps = new HashMap<String,Boolean>(7);

  // Parameters of baselineTf.
  private float tf_base = 0.0f;
  private float tf_min = 0.0f;

  // Parameters of hyperbolicTf.
  private float tf_hyper_min = 0.0f;
  private float tf_hyper_max = 2.0f;
  private double tf_hyper_base = 1.3d;
  private float tf_hyper_xoffset = 10.0f;

  public SweetSpotSimilarity() {
    super();
  }

  /**
   * Sets the baseline and minimum function variables for baselineTf.
   *
   * @param base the value returned by baselineTf for non-zero frequencies
   *             at or below <code>min</code> (default: 0.0)
   * @param min the frequency at or below which <code>base</code> is
   *            returned (default: 0.0)
   * @see #baselineTf
   */
  public void setBaselineTfFactors(float base, float min) {
    tf_min = min;
    tf_base = base;
  }

  /**
   * Sets the function variables for the hyperbolicTf function.
   *
   * @param min the minimum tf value to ever be returned (default: 0.0)
   * @param max the maximum tf value to ever be returned (default: 2.0)
   * @param base the base value to be used in the exponential for the
   *             hyperbolic function (default: 1.3)
   * @param xoffset the midpoint of the hyperbolic function (default: 10.0)
   * @see #hyperbolicTf
   */
  public void setHyperbolicTfFactors(float min, float max,
                                     double base, float xoffset) {
    tf_hyper_min = min;
    tf_hyper_max = max;
    tf_hyper_base = base;
    tf_hyper_xoffset = xoffset;
  }

  /**
   * Sets the default function variables used by computeLengthNorm when no
   * field specific variables have been set.
   *
   * @param min lower bound of the plateau (default: 1)
   * @param max upper bound of the plateau (default: 1)
   * @param steepness steepness of the curve outside the plateau (default: 0.5)
   * @see #computeLengthNorm
   */
  public void setLengthNormFactors(int min, int max, float steepness) {
    this.ln_min = min;
    this.ln_max = max;
    this.ln_steep = steepness;
  }

  /**
   * Sets the function variables used by computeLengthNorm for a specific
   * named field.
   *
   * @param field field name
   * @param min minimum value
   * @param max maximum value
   * @param steepness steepness of the curve
   * @param discountOverlaps if true, <code>numOverlapTokens</code> will be
   * subtracted from <code>numTokens</code>; if false then
   * <code>numOverlapTokens</code> will be assumed to be 0 (see
   * {@link DefaultSimilarity#computeNorm(String, FieldInvertState)} for details).
   *
   * @see #computeLengthNorm
   */
  public void setLengthNormFactors(String field, int min, int max,
                                   float steepness, boolean discountOverlaps) {
    ln_mins.put(field, Integer.valueOf(min));
    ln_maxs.put(field, Integer.valueOf(max));
    ln_steeps.put(field, Float.valueOf(steepness));
    // Boolean.valueOf reuses the cached TRUE/FALSE instances instead of
    // allocating, matching the valueOf calls above.
    ln_overlaps.put(field, Boolean.valueOf(discountOverlaps));
  }

  /**
   * Implemented as <code> state.getBoost() *
   * computeLengthNorm(fieldName, numTokens) </code> where
   * numTokens does not count overlap tokens if
   * discountOverlaps is true by default or true for this
   * specific field.
   *
   * @param fieldName name of the field being normalized
   * @param state inversion state supplying the boost, length and overlap count
   * @return the boost multiplied by the length norm for the field
   */
  @Override
  public float computeNorm(String fieldName, FieldInvertState state) {
    // A per-field setting, when present, overrides the inherited default.
    final Boolean fieldOverlaps = ln_overlaps.get(fieldName);
    final boolean discount = (fieldOverlaps != null)
      ? fieldOverlaps.booleanValue()
      : discountOverlaps;

    final int numTokens;
    if (discount) {
      numTokens = state.getLength() - state.getNumOverlap();
    } else {
      numTokens = state.getLength();
    }

    return state.getBoost() * computeLengthNorm(fieldName, numTokens);
  }

  /**
   * Implemented as:
   * <code>
   * 1/sqrt( steepness * (abs(x-min) + abs(x-max) - (max-min)) + 1 )
   * </code>.
   *
   * <p>
   * This degrades to <code>1/sqrt(x)</code> when min and max are both 1 and
   * steepness is 0.5
   * </p>
   *
   * <p>
   * :TODO: potential optimization is to just flat out return 1.0f if numTerms
   * is between min and max.
   * </p>
   *
   * @param fieldName name of the field; selects per-field factors if any were set
   * @param numTerms number of terms in the field
   * @return the length norm for the given number of terms
   * @see #setLengthNormFactors
   */
  public float computeLengthNorm(String fieldName, int numTerms) {
    int l = ln_min;
    int h = ln_max;
    float s = ln_steep;

    // Each factor is overridden independently when the field has an entry.
    final Number fieldMin = ln_mins.get(fieldName);
    if (fieldMin != null) {
      l = fieldMin.intValue();
    }
    final Number fieldMax = ln_maxs.get(fieldName);
    if (fieldMax != null) {
      h = fieldMax.intValue();
    }
    final Float fieldSteep = ln_steeps.get(fieldName);
    if (fieldSteep != null) {
      s = fieldSteep.floatValue();
    }

    return (float)
      (1.0f /
       Math.sqrt
       (
        (
         s *
         (float)(Math.abs(numTerms - l) + Math.abs(numTerms - h) - (h-l))
         )
        + 1.0f
        )
       );
  }

  /**
   * Delegates to baselineTf
   *
   * @see #baselineTf
   */
  @Override
  public float tf(int freq) {
    return baselineTf(freq);
  }

  /**
   * Implemented as:
   * <code>
   *  (x &lt;= min) ? base : sqrt(x+(base**2)-min)
   * </code>
   * ...but with a special case check for 0.
   * <p>
   * This degrades to <code>sqrt(x)</code> when min and base are both 0
   * </p>
   *
   * @param freq the term frequency
   * @return 0 for a frequency of 0, otherwise the baseline tf value
   * @see #setBaselineTfFactors
   */
  public float baselineTf(float freq) {

    if (0.0f == freq) return 0.0f;

    return (freq <= tf_min)
      ? tf_base
      : (float)Math.sqrt(freq + (tf_base * tf_base) - tf_min);
  }

  /**
   * Uses a hyperbolic tangent function that allows for a hard max...
   *
   * <code>
   * tf(x)=min+(max-min)/2*(((base**(x-xoffset)-base**-(x-xoffset))/(base**(x-xoffset)+base**-(x-xoffset)))+1)
   * </code>
   *
   * <p>
   * This code is provided as a convenience for subclasses that want
   * to use a hyperbolic tf function.
   * </p>
   *
   * <p>
   * When the exponentials overflow, the quotient evaluates to NaN; in that
   * case the value the curve saturates to is returned instead: min when
   * <code>freq</code> is below xoffset, max otherwise.
   * </p>
   *
   * @param freq the term frequency
   * @return 0 for a frequency of 0, otherwise the hyperbolic tf value
   * @see #setHyperbolicTfFactors
   */
  public float hyperbolicTf(float freq) {
    if (0.0f == freq) return 0.0f;

    final float min = tf_hyper_min;
    final float max = tf_hyper_max;
    final double base = tf_hyper_base;
    final float xoffset = tf_hyper_xoffset;
    final double x = (double)(freq - xoffset);

    // Evaluate each exponential once; both are reused in the quotient.
    final double up = Math.pow(base, x);
    final double down = Math.pow(base, -x);

    final float result = min +
      (float)(
              (max-min) / 2.0f
              *
              (
               ( ( up - down ) / ( up + down ) )
               + 1.0d
               )
              );

    if (Float.isNaN(result)) {
      // Inf/Inf: one exponential overflowed. Far below the midpoint the
      // curve saturates at min, not max; any other NaN keeps returning max.
      return (x < 0.0d) ? min : max;
    }
    return result;

  }

}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene SweetSpotSimilarity.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.