alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (ShingleFilter.java)

This example Lucene source code file (ShingleFilter.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

attributesource, circularsequence, circularsequence, default_max_shingle_size, default_min_shingle_size, illegalargumentexception, inputwindowtoken, inputwindowtoken, io, ioexception, shinglefilter, shinglefilter, string, string, token_separator, util

The Lucene ShingleFilter.java source code

package org.apache.lucene.analysis.shingle;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;


/**
 * <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
 * In other words, it creates combinations of tokens as a single token.
 *
 * <p>For example, the sentence "please divide this sentence into shingles"
 * might be tokenized into shingles "please divide", "divide this",
 * "this sentence", "sentence into", and "into shingles".
 *
 * <p>This filter handles position increments > 1 by inserting filler tokens
 * (tokens with termtext "_"). It does not handle a position increment of 0.
 */
public final class ShingleFilter extends TokenFilter {

  /**
   * filler token for when positionIncrement is more than 1
   */
  public static final char[] FILLER_TOKEN = { '_' };

  /**
   * default maximum shingle size is 2.
   */
  public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;

  /**
   * default minimum shingle size is 2.
   */
  public static final int DEFAULT_MIN_SHINGLE_SIZE = 2;

  /**
   * default token type attribute value is "shingle" 
   */
  public static final String DEFAULT_TOKEN_TYPE = "shingle";
  
  /**
   * The default string to use when joining adjacent tokens to form a shingle
   */
  public static final String TOKEN_SEPARATOR = " ";

  /**
   * The sequence of input stream tokens (or filler tokens, if necessary)
   * that will be composed to form output shingles.
   */
  private LinkedList<InputWindowToken> inputWindow
    = new LinkedList<InputWindowToken>();
  
  /**
   * The number of input tokens in the next output token.  This is the "n" in
   * "token n-grams".
   */
  private CircularSequence gramSize;

  /**
   * Shingle and unigram text is composed here.
   */
  private StringBuilder gramBuilder = new StringBuilder();

  /**
   * The token type attribute value to use - default is "shingle"
   */
  private String tokenType = DEFAULT_TOKEN_TYPE;

  /**
   * The string to use when joining adjacent tokens to form a shingle
   */
  private String tokenSeparator = TOKEN_SEPARATOR;

  /**
   * By default, we output unigrams (individual tokens) as well as shingles
   * (token n-grams).
   */
  private boolean outputUnigrams = true;

  /**
   * By default, we don't override behavior of outputUnigrams.
   */
  private boolean outputUnigramsIfNoShingles = false;
 
  /**
   * maximum shingle size (number of tokens)
   */
  private int maxShingleSize;

  /**
   * minimum shingle size (number of tokens)
   */
  private int minShingleSize;

  /**
   * The remaining number of filler tokens to be inserted into the input stream
   * from which shingles are composed, to handle position increments greater
   * than one.
   */
  private int numFillerTokensToInsert;

  /**
   * When the next input stream token has a position increment greater than
   * one, it is stored in this field until sufficient filler tokens have been
   * inserted to account for the position increment. 
   */
  private AttributeSource nextInputStreamToken;

  /**
   * Whether or not there is a next input stream token.
   */
  private boolean isNextInputStreamToken = false;

  /**
   * Whether at least one unigram or shingle has been output at the current 
   * position.
   */
  private boolean isOutputHere = false;

  /**
   * true if no shingles have been output yet (for outputUnigramsIfNoShingles).
   */
  boolean noShingleOutput = true;
  
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);


  /**
   * Constructs a ShingleFilter with the specified shingle size from the
   * {@link TokenStream} <code>input
   *
   * @param input input stream
   * @param minShingleSize minimum shingle size produced by the filter.
   * @param maxShingleSize maximum shingle size produced by the filter.
   */
  public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) {
    super(input);
    setMaxShingleSize(maxShingleSize);
    setMinShingleSize(minShingleSize);
  }

  /**
   * Constructs a ShingleFilter with the specified shingle size from the
   * {@link TokenStream} <code>input
   *
   * @param input input stream
   * @param maxShingleSize maximum shingle size produced by the filter.
   */
  public ShingleFilter(TokenStream input, int maxShingleSize) {
    this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize);
  }
  
  /**
   * Construct a ShingleFilter with default shingle size: 2.
   *
   * @param input input stream
   */
  public ShingleFilter(TokenStream input) {
    this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
  }

  /**
   * Construct a ShingleFilter with the specified token type for shingle tokens
   * and the default shingle size: 2
   *
   * @param input input stream
   * @param tokenType token type for shingle tokens
   */
  public ShingleFilter(TokenStream input, String tokenType) {
    this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
    setTokenType(tokenType);
  }

  /**
   * Set the type of the shingle tokens produced by this filter.
   * (default: "shingle")
   *
   * @param tokenType token tokenType
   */
  public void setTokenType(String tokenType) {
    this.tokenType = tokenType;
  }

  /**
   * Shall the output stream contain the input tokens (unigrams) as well as
   * shingles? (default: true.)
   *
   * @param outputUnigrams Whether or not the output stream shall contain
   * the input tokens (unigrams)
   */
  public void setOutputUnigrams(boolean outputUnigrams) {
    this.outputUnigrams = outputUnigrams;
    gramSize = new CircularSequence();
  }

  /**
   * <p>Shall we override the behavior of outputUnigrams==false for those
   * times when no shingles are available (because there are fewer than
   * minShingleSize tokens in the input stream)? (default: false.)
   * <p>Note that if outputUnigrams==true, then unigrams are always output,
   * regardless of whether any shingles are available.
   *
   * @param outputUnigramsIfNoShingles Whether or not to output a single
   * unigram when no shingles are available.
   */
  public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
    this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
  }

  /**
   * Set the max shingle size (default: 2)
   *
   * @param maxShingleSize max size of output shingles
   */
  public void setMaxShingleSize(int maxShingleSize) {
    if (maxShingleSize < 2) {
      throw new IllegalArgumentException("Max shingle size must be >= 2");
    }
    this.maxShingleSize = maxShingleSize;
  }

  /**
   * <p>Set the min shingle size (default: 2).
   * <p>This method requires that the passed in minShingleSize is not greater
   * than maxShingleSize, so make sure that maxShingleSize is set before
   * calling this method.
   * <p>The unigram output option is independent of the min shingle size.
   *
   * @param minShingleSize min size of output shingles
   */
  public void setMinShingleSize(int minShingleSize) {
    if (minShingleSize < 2) {
      throw new IllegalArgumentException("Min shingle size must be >= 2");
    }
    if (minShingleSize > maxShingleSize) {
      throw new IllegalArgumentException
        ("Min shingle size must be <= max shingle size");
    }
    this.minShingleSize = minShingleSize;
    gramSize = new CircularSequence();
  }

  /**
   * Sets the string to use when joining adjacent tokens to form a shingle
   * @param tokenSeparator used to separate input stream tokens in output shingles
   */
  public void setTokenSeparator(String tokenSeparator) {
    this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    boolean tokenAvailable = false;
    int builtGramSize = 0;
    if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
      shiftInputWindow();
      gramBuilder.setLength(0);
    } else {
      builtGramSize = gramSize.getPreviousValue();
    }
    if (inputWindow.size() >= gramSize.getValue()) {
      boolean isAllFiller = true;
      InputWindowToken nextToken = null;
      Iterator<InputWindowToken> iter = inputWindow.iterator();
      for (int gramNum = 1 ;
           iter.hasNext() && builtGramSize < gramSize.getValue() ;
           ++gramNum) {
        nextToken = iter.next();
        if (builtGramSize < gramNum) {
          if (builtGramSize > 0) {
            gramBuilder.append(tokenSeparator);
          }
          gramBuilder.append(nextToken.termAtt.buffer(), 0, 
                             nextToken.termAtt.length());
          ++builtGramSize;
        }
        if (isAllFiller && nextToken.isFiller) {
          if (gramNum == gramSize.getValue()) {
            gramSize.advance();
          }
        } else { 
          isAllFiller = false;
        }
      }
      if ( ! isAllFiller && builtGramSize == gramSize.getValue()) {
        inputWindow.getFirst().attSource.copyTo(this);
        posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
        termAtt.setEmpty().append(gramBuilder);
        if (gramSize.getValue() > 1) {
          typeAtt.setType(tokenType);
          noShingleOutput = false;
        }
        offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
        isOutputHere = true;
        gramSize.advance();
        tokenAvailable = true;
      }
    }
    return tokenAvailable;
  }

  private boolean exhausted;

  /**
   * <p>Get the next token from the input stream.
   * <p>If the next token has positionIncrement > 1,
   * <code>positionIncrement - 1 {@link #FILLER_TOKEN}s are
   * inserted first.
   * @param target Where to put the new token; if null, a new instance is created.
   * @return On success, the populated token; null otherwise
   * @throws IOException if the input stream has a problem
   */
  private InputWindowToken getNextToken(InputWindowToken target) 
    throws IOException {
    InputWindowToken newTarget = target;
    if (numFillerTokensToInsert > 0) {
      if (null == target) {
        newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
      } else {
        nextInputStreamToken.copyTo(target.attSource);
      }
      // A filler token occupies no space
      newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), 
                                    newTarget.offsetAtt.startOffset());
      newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
      newTarget.isFiller = true;
      --numFillerTokensToInsert;
    } else if (isNextInputStreamToken) {
      if (null == target) {
        newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
      } else {
        nextInputStreamToken.copyTo(target.attSource);
      }
      isNextInputStreamToken = false;
      newTarget.isFiller = false;
    } else if (!exhausted && input.incrementToken()) {
      if (null == target) {
        newTarget = new InputWindowToken(cloneAttributes());
      } else {
        this.copyTo(target.attSource);
      }
      if (posIncrAtt.getPositionIncrement() > 1) {
        // Each output shingle must contain at least one input token, 
        // so no more than (maxShingleSize - 1) filler tokens will be inserted.
        numFillerTokensToInsert 
          = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
        // Save the current token as the next input stream token
        if (null == nextInputStreamToken) {
          nextInputStreamToken = cloneAttributes();
        } else {
          this.copyTo(nextInputStreamToken);
        }
        isNextInputStreamToken = true;
        // A filler token occupies no space
        newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
        newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
        newTarget.isFiller = true;
        --numFillerTokensToInsert;
      } else {
        newTarget.isFiller = false;
      }
    } else {
      newTarget = null;
      exhausted = true;
    }
    return newTarget;
	}

  /**
   * <p>Fills {@link #inputWindow} with input stream tokens, if available, 
   * shifting to the right if the window was previously full.
   * <p>Resets {@link #gramSize} to its minimum value.
   *
   * @throws IOException if there's a problem getting the next token
   */
  private void shiftInputWindow() throws IOException {
    InputWindowToken firstToken = null;
    if (inputWindow.size() > 0) {
      firstToken = inputWindow.removeFirst();
    }
    while (inputWindow.size() < maxShingleSize) {
      if (null != firstToken) {  // recycle the firstToken, if available
        if (null != getNextToken(firstToken)) {
          inputWindow.add(firstToken); // the firstToken becomes the last
          firstToken = null;
        } else {
          break; // end of input stream
        }
      } else {
        InputWindowToken nextToken = getNextToken(null);
        if (null != nextToken) {
          inputWindow.add(nextToken);
        } else {
          break; // end of input stream
        }
      }
    }
    if (outputUnigramsIfNoShingles && noShingleOutput 
        && gramSize.minValue > 1 && inputWindow.size() < minShingleSize) {
      gramSize.minValue = 1;
    }
    gramSize.reset();
    isOutputHere = false;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    gramSize.reset();
    inputWindow.clear();
    numFillerTokensToInsert = 0;
    isOutputHere = false;
    noShingleOutput = true;
    exhausted = false;
    if (outputUnigramsIfNoShingles && ! outputUnigrams) {
      // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
      gramSize.minValue = minShingleSize;
    }
  }


  /**
   * <p>An instance of this class is used to maintain the number of input
   * stream tokens that will be used to compose the next unigram or shingle:
   * {@link #gramSize}.
   * <p>gramSize will take on values from the circular sequence
   * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }.
   * <p>1 is included in the circular sequence only if 
   * {@link #outputUnigrams} = true.
   */
  private class CircularSequence {
    private int value;
    private int previousValue;
    private int minValue;
    
    public CircularSequence() {
      minValue = outputUnigrams ? 1 : minShingleSize;
      reset();
    }

    /**
     * @return the current value.  
     * @see #advance()
     */
    public int getValue() {
      return value;
    }
    
    /**
     * <p>Increments this circular number's value to the next member in the
     * circular sequence
     * <code>gramSize will take on values from the circular sequence
     * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }.
     * <p>1 is included in the circular sequence only if 
     * {@link #outputUnigrams} = true.
     */
    public void advance() {
      previousValue = value;
      if (value == 1) {
        value = minShingleSize;
      } else if (value == maxShingleSize) {
        reset();
      } else {
        ++value;
      }
    }

    /**
     * <p>Sets this circular number's value to the first member of the 
     * circular sequence
     * <p>gramSize will take on values from the circular sequence
     * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }.
     * <p>1 is included in the circular sequence only if 
     * {@link #outputUnigrams} = true.
     */
    public void reset() {
      previousValue = value = minValue;
    }

    /**
     * <p>Returns true if the current value is the first member of the circular
     * sequence.
     * <p>If {@link #outputUnigrams} = true, the first member of the circular
     * sequence will be 1; otherwise, it will be {@link #minShingleSize}.
     * 
     * @return true if the current value is the first member of the circular
     *  sequence; false otherwise
     */
    public boolean atMinValue() {
      return value == minValue;
    }

    /**
     * @return the value this instance had before the last advance() call
     */
    public int getPreviousValue() {
      return previousValue;
    }
  }
    
  private class InputWindowToken {
    final AttributeSource attSource;
    final CharTermAttribute termAtt;
    final OffsetAttribute offsetAtt;
    boolean isFiller = false;
      
    public InputWindowToken(AttributeSource attSource) {
      this.attSource = attSource;
      this.termAtt = attSource.getAttribute(CharTermAttribute.class);
      this.offsetAtt = attSource.getAttribute(OffsetAttribute.class);
    }
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene ShingleFilter.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.