alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (StopFilter.java)

This example Lucene source code file (StopFilter.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

chararrayset, chararrayset, deprecated, deprecated, filteringtokenfilter, io, ioexception, list, override, set, set, stopfilter, stopfilter, string, tokenstream, util

The Lucene StopFilter.java source code

package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;
import java.util.Set;
import java.util.List;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryParser.QueryParser; // for javadoc
import org.apache.lucene.util.Version;

/**
 * Removes stop words from a token stream.
 * 
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating StopFilter:
 * <ul>
 *   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
 *         supplementary characters in stopwords and position
 *         increments are preserved
 * </ul>
 */
public final class StopFilter extends FilteringTokenFilter {

  private final CharArraySet stopWords;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * Construct a token stream filtering the given input.
   * If <code>stopWords is an instance of {@link CharArraySet} (true if
   * <code>makeStopSet() was used to construct the set) it will be directly used
   * and <code>ignoreCase will be ignored since CharArraySet
   * directly controls case sensitivity.
   * <p/>
   * If <code>stopWords is not an instance of {@link CharArraySet},
   * a new CharArraySet will be constructed and <code>ignoreCase will be
   * used to specify the case sensitivity of that set.
   *
   * @param enablePositionIncrements true if token positions should record the removed stop words
   * @param input Input TokenStream
   * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
   * @param ignoreCase if true, all words are lower cased first
   * @deprecated use {@link #StopFilter(Version, TokenStream, Set, boolean)} instead
   */
  @Deprecated
  public StopFilter(boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase)
  {
    this(Version.LUCENE_30, enablePositionIncrements, input, stopWords, ignoreCase);
  }
  
  /**
   * Construct a token stream filtering the given input. If
   * <code>stopWords is an instance of {@link CharArraySet} (true if
   * <code>makeStopSet() was used to construct the set) it will be
   * directly used and <code>ignoreCase will be ignored since
   * <code>CharArraySet directly controls case sensitivity.
   * <p/>
   * If <code>stopWords is not an instance of {@link CharArraySet}, a new
   * CharArraySet will be constructed and <code>ignoreCase will be used
   * to specify the case sensitivity of that set.
   * 
   * @param matchVersion
   *          Lucene version to enable correct Unicode 4.0 behavior in the stop
   *          set if Version > 3.0. See <a href="#version">above for details.
   * @param input
   *          Input TokenStream
   * @param stopWords
   *          A Set of Strings or char[] or any other toString()-able set
   *          representing the stopwords
   * @param ignoreCase
   *          if true, all words are lower cased first
   */
  public StopFilter(Version matchVersion, TokenStream input, Set<?> stopWords, boolean ignoreCase)
  {
   this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_29), input, stopWords, ignoreCase);
  }
  
  /*
   * convenience ctor to enable deprecated ctors to set posInc explicitly
   */
  private StopFilter(Version matchVersion, boolean enablePositionIncrements, TokenStream input, Set<?> stopWords, boolean ignoreCase){
    super(enablePositionIncrements, input);
    this.stopWords = stopWords instanceof CharArraySet ? (CharArraySet)stopWords : new CharArraySet(matchVersion, stopWords, ignoreCase);  
  }

  /**
   * Constructs a filter which removes words from the input
   * TokenStream that are named in the Set.
   *
   * @param enablePositionIncrements true if token positions should record the removed stop words
   * @param in Input stream
   * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
   * @see #makeStopSet(Version, java.lang.String[])
   * @deprecated use {@link #StopFilter(Version, TokenStream, Set)} instead
   */
  @Deprecated
  public StopFilter(boolean enablePositionIncrements, TokenStream in, Set<?> stopWords) {
    this(Version.LUCENE_30, enablePositionIncrements, in, stopWords, false);
  }
  
  /**
   * Constructs a filter which removes words from the input TokenStream that are
   * named in the Set.
   * 
   * @param matchVersion
   *          Lucene version to enable correct Unicode 4.0 behavior in the stop
   *          set if Version > 3.0.  See <a href="#version">above for details.
   * @param in
   *          Input stream
   * @param stopWords
   *          A Set of Strings or char[] or any other toString()-able set
   *          representing the stopwords
   * @see #makeStopSet(Version, java.lang.String[])
   */
  public StopFilter(Version matchVersion, TokenStream in, Set<?> stopWords) {
    this(matchVersion, in, stopWords, false);
  }

  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   * 
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   * @deprecated use {@link #makeStopSet(Version, String...)} instead
   */
  @Deprecated
  public static final Set<Object> makeStopSet(String... stopWords) {
    return makeStopSet(Version.LUCENE_30, stopWords, false);
  }

  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   * 
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords An array of stopwords
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   */
  public static final Set<Object> makeStopSet(Version matchVersion, String... stopWords) {
    return makeStopSet(matchVersion, stopWords, false);
  }
  
  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @return A Set ({@link CharArraySet}) containing the words
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   * @deprecated use {@link #makeStopSet(Version, List)} instead
   */
  @Deprecated
  public static final Set<Object> makeStopSet(List stopWords) {
    return makeStopSet(Version.LUCENE_30, stopWords, false);
  }

  /**
   * Builds a Set from an array of stop words,
   * appropriate for passing into the StopFilter constructor.
   * This permits this stopWords construction to be cached once when
   * an Analyzer is constructed.
   * 
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @return A Set ({@link CharArraySet}) containing the words
   * @see #makeStopSet(Version, java.lang.String[], boolean) passing false to ignoreCase
   */
  public static final Set<Object> makeStopSet(Version matchVersion, List stopWords) {
    return makeStopSet(matchVersion, stopWords, false);
  }
    
  /**
   * Creates a stopword set from the given stopword array.
   * @param stopWords An array of stopwords
   * @param ignoreCase If true, all words are lower cased first.  
   * @return a Set containing the words
   * @deprecated use {@link #makeStopSet(Version, String[], boolean)} instead;
   */  
  @Deprecated
  public static final Set<Object> makeStopSet(String[] stopWords, boolean ignoreCase) {
    return makeStopSet(Version.LUCENE_30, stopWords, ignoreCase);
  }
  /**
   * Creates a stopword set from the given stopword array.
   * 
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords An array of stopwords
   * @param ignoreCase If true, all words are lower cased first.  
   * @return a Set containing the words
   */    
  public static final Set<Object> makeStopSet(Version matchVersion, String[] stopWords, boolean ignoreCase) {
    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.length, ignoreCase);
    stopSet.addAll(Arrays.asList(stopWords));
    return stopSet;
  }
  
  /**
   * Creates a stopword set from the given stopword list.
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @param ignoreCase if true, all words are lower cased first
   * @return A Set ({@link CharArraySet}) containing the words
   * @deprecated use {@link #makeStopSet(Version, List, boolean)} instead
   */
  @Deprecated
  public static final Set<Object> makeStopSet(List stopWords, boolean ignoreCase){
    return makeStopSet(Version.LUCENE_30, stopWords, ignoreCase);
  }

  /**
   * Creates a stopword set from the given stopword list.
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0
   * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
   * @param ignoreCase if true, all words are lower cased first
   * @return A Set ({@link CharArraySet}) containing the words
   */
  public static final Set<Object> makeStopSet(Version matchVersion, List stopWords, boolean ignoreCase){
    CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.size(), ignoreCase);
    stopSet.addAll(stopWords);
    return stopSet;
  }
  
  /**
   * Returns the next input Token whose term() is not a stop word.
   */
  @Override
  protected boolean accept() throws IOException {
    return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
  }

  /**
   * Returns version-dependent default for
   * enablePositionIncrements.  Analyzers that embed
   * StopFilter use this method when creating the
   * StopFilter.  Prior to 2.9, this returns false.  On 2.9
   * or later, it returns true.
   * @deprecated use {@link #StopFilter(Version, TokenStream, Set)} instead
   */
  @Deprecated
  public static boolean getEnablePositionIncrementsVersionDefault(Version matchVersion) {
    return matchVersion.onOrAfter(Version.LUCENE_29);
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene StopFilter.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.