|
Lucene example source code file (ShingleFilter.java)
The Lucene ShingleFilter.java source code
package org.apache.lucene.analysis.shingle; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Iterator; import java.util.LinkedList; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeSource; /** * <p>A ShingleFilter constructs shingles (token n-grams) from a token stream. * In other words, it creates combinations of tokens as a single token. * * <p>For example, the sentence "please divide this sentence into shingles" * might be tokenized into shingles "please divide", "divide this", * "this sentence", "sentence into", and "into shingles". * * <p>This filter handles position increments > 1 by inserting filler tokens * (tokens with termtext "_"). It does not handle a position increment of 0. 
 */
public final class ShingleFilter extends TokenFilter {

  /**
   * Term text of the filler token used when positionIncrement is more than 1.
   * NOTE(review): the array is public, so its contents can be modified by
   * any caller.
   */
  public static final char[] FILLER_TOKEN = { '_' };

  /**
   * Default maximum shingle size (2).
   */
  public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;

  /**
   * Default minimum shingle size (2).
   */
  public static final int DEFAULT_MIN_SHINGLE_SIZE = 2;

  /**
   * Default token type attribute value ("shingle").
   */
  public static final String DEFAULT_TOKEN_TYPE = "shingle";

  /**
   * The default string (a single space) to use when joining adjacent tokens
   * to form a shingle.
   */
  public static final String TOKEN_SEPARATOR = " ";

  /**
   * The sequence of input stream tokens (or filler tokens, if necessary)
   * that will be composed to form output shingles. incrementToken() walks
   * this window starting from its first element.
   */
  private LinkedList<InputWindowToken> inputWindow
    = new LinkedList<InputWindowToken>();

  /**
   * The number of input tokens in the next output token. This is the "n" in
   * "token n-grams". Replaced with a new CircularSequence by
   * setMinShingleSize() and setOutputUnigrams().
   */
  private CircularSequence gramSize;

  /**
   * Shingle and unigram text is composed here. incrementToken() clears it
   * each time it shifts the input window.
   */
  private StringBuilder gramBuilder = new StringBuilder();

  /**
   * The token type attribute value to use - default is "shingle".
   */
  private String tokenType = DEFAULT_TOKEN_TYPE;

  /**
   * The string to use when joining adjacent tokens to form a shingle.
   */
  private String tokenSeparator = TOKEN_SEPARATOR;

  /**
   * By default, we output unigrams (individual tokens) as well as shingles
   * (token n-grams).
   */
  private boolean outputUnigrams = true;

  /**
   * By default, we don't override behavior of outputUnigrams.
   */
  private boolean outputUnigramsIfNoShingles = false;

  /**
   * Maximum shingle size (number of tokens); setMaxShingleSize() rejects
   * values below 2.
   */
  private int maxShingleSize;

  /**
   * Minimum shingle size (number of tokens); setMinShingleSize() rejects
   * values below 2 or above maxShingleSize.
   */
  private int minShingleSize;

  /**
   * The remaining number of filler tokens to be inserted into the input stream
   * from which shingles are composed, to handle position increments greater
   * than one. 
 */
  private int numFillerTokensToInsert;

  /**
   * When the next input stream token has a position increment greater than
   * one, it is stored in this field until sufficient filler tokens have been
   * inserted to account for the position increment.
   */
  private AttributeSource nextInputStreamToken;

  /**
   * Whether or not there is a next input stream token.
   */
  private boolean isNextInputStreamToken = false;

  /**
   * Whether at least one unigram or shingle has been output at the current
   * position. incrementToken() uses it to choose a position increment of
   * 0 (something was already output here) or 1 (first output here).
   */
  private boolean isOutputHere = false;

  /**
   * true if no shingles have been output yet (for outputUnigramsIfNoShingles).
   * incrementToken() sets it to false when it emits a token whose gram size
   * is greater than 1.
   * NOTE(review): declared without an access modifier (package-private),
   * unlike the neighbouring private fields.
   */
  boolean noShingleOutput = true;

  // Attributes registered on this stream via addAttribute(); incrementToken()
  // writes the term text, offsets, position increment and type of every
  // output token through them.
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  /**
   * Constructs a ShingleFilter with the specified shingle sizes from the
   * {@link TokenStream} <code>input</code>.
   *
   * @param input input stream
   * @param minShingleSize minimum shingle size produced by the filter.
   * @param maxShingleSize maximum shingle size produced by the filter.
   * @throws IllegalArgumentException if either size is less than 2, or if
   *         minShingleSize is greater than maxShingleSize (raised by the two
   *         setters this constructor calls)
   */
  public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) {
    super(input);
    // The maximum is set first because setMinShingleSize() rejects a minimum
    // that is greater than the current maxShingleSize.
    setMaxShingleSize(maxShingleSize);
    setMinShingleSize(minShingleSize);
  }

  /**
   * Constructs a ShingleFilter with the specified maximum shingle size from
   * the {@link TokenStream} <code>input</code>. The minimum shingle size is
   * DEFAULT_MIN_SHINGLE_SIZE.
   *
   * @param input input stream
   * @param maxShingleSize maximum shingle size produced by the filter.
   */
  public ShingleFilter(TokenStream input, int maxShingleSize) {
    this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize);
  }

  /**
   * Construct a ShingleFilter with default shingle size: 2. 
 *
 * @param input input stream
 */
  public ShingleFilter(TokenStream input) {
    this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
  }

  /**
   * Construct a ShingleFilter with the specified token type for shingle tokens
   * and the default shingle size: 2
   *
   * @param input input stream
   * @param tokenType token type for shingle tokens
   */
  public ShingleFilter(TokenStream input, String tokenType) {
    this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
    setTokenType(tokenType);
  }

  /**
   * Set the type of the shingle tokens produced by this filter.
   * (default: "shingle")
   *
   * @param tokenType type attribute value given to shingle tokens; the value
   *        is stored as passed, without any check
   */
  public void setTokenType(String tokenType) {
    this.tokenType = tokenType;
  }

  /**
   * Shall the output stream contain the input tokens (unigrams) as well as
   * shingles? (default: true.)
   *
   * @param outputUnigrams Whether or not the output stream shall contain
   *        the input tokens (unigrams)
   */
  public void setOutputUnigrams(boolean outputUnigrams) {
    this.outputUnigrams = outputUnigrams;
    // Replace the gram-size sequence with a new one after changing the flag.
    // NOTE(review): presumably CircularSequence reads outputUnigrams when it
    // is created - its definition is outside the visible part of this file.
    gramSize = new CircularSequence();
  }

  /**
   * <p>Shall we override the behavior of outputUnigrams==false for those
   * times when no shingles are available (because there are fewer than
   * minShingleSize tokens in the input stream)? (default: false.)
   * <p>Note that if outputUnigrams==true, then unigrams are always output,
   * regardless of whether any shingles are available.
   *
   * @param outputUnigramsIfNoShingles Whether or not to output a single
   *        unigram when no shingles are available.
   */
  public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
    this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
  }

  /**
   * Set the max shingle size (default: 2)
   *
   * @param maxShingleSize max size of output shingles
   * @throws IllegalArgumentException if maxShingleSize is less than 2
   */
  public void setMaxShingleSize(int maxShingleSize) {
    if (maxShingleSize < 2) {
      throw new IllegalArgumentException("Max shingle size must be >= 2");
    }
    // NOTE(review): unlike setMinShingleSize(), this setter neither compares
    // the new value against minShingleSize nor replaces gramSize - confirm
    // that this is intended.
    this.maxShingleSize = maxShingleSize;
  }

  /**
   * <p>Set the min shingle size (default: 2). 
 * <p>This method requires that the passed in minShingleSize is not greater
 * than maxShingleSize, so make sure that maxShingleSize is set before
 * calling this method.
 * <p>The unigram output option is independent of the min shingle size.
 *
 * @param minShingleSize min size of output shingles
 */
  public void setMinShingleSize(int minShingleSize) {
    // Both checks throw IllegalArgumentException: the first for a minimum
    // below 2, the second for a minimum above the current maxShingleSize.
    if (minShingleSize < 2) {
      throw new IllegalArgumentException("Min shingle size must be >= 2");
    }
    if (minShingleSize > maxShingleSize) {
      throw new IllegalArgumentException ("Min shingle size must be <= max shingle size");
    }
    this.minShingleSize = minShingleSize;
    // Replace the gram-size sequence with a new one after changing the minimum.
    gramSize = new CircularSequence();
  }

  /**
   * Sets the string to use when joining adjacent tokens to form a shingle.
   * A null separator is stored as the empty string.
   *
   * @param tokenSeparator used to separate input stream tokens in output shingles
   */
  public void setTokenSeparator(String tokenSeparator) {
    this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
  }

  /**
   * Produces the next output token from the input window, composing its text
   * in gramBuilder. Each call emits at most one token.
   *
   * <p>When a token is emitted, the attributes of the first token in the
   * input window are copied to this stream and then overridden: the term
   * text becomes the composed gram, the position increment is 0 if something
   * was already output at this position and 1 otherwise, the type is set to
   * tokenType when the gram size is greater than 1, and the end offset is
   * taken from the last window token that was read.
   *
   * @return true if a token was written to this stream's attributes during
   *         this call, false otherwise
   * @throws IOException declared by the signature; presumably propagated
   *         from shiftInputWindow(), which is defined outside the visible
   *         part of this file
   */
  @Override
  public final boolean incrementToken() throws IOException {
    boolean tokenAvailable = false;
    // Number of window tokens whose text is already in gramBuilder.
    int builtGramSize = 0;
    if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
      // The gram size is at its minimum value, or the window holds fewer
      // tokens than the current gram size: shift the window and start the
      // gram text from scratch.
      shiftInputWindow();
      gramBuilder.setLength(0);
    } else {
      // gramBuilder is not cleared on this path, so its existing text is
      // extended. NOTE(review): presumably getPreviousValue() is the size of
      // the gram currently held in gramBuilder - CircularSequence is defined
      // outside the visible part of this file.
      builtGramSize = gramSize.getPreviousValue();
    }
    if (inputWindow.size() >= gramSize.getValue()) {
      // Stays true only while every token read so far is a filler token.
      boolean isAllFiller = true;
      InputWindowToken nextToken = null;
      Iterator<InputWindowToken> iter = inputWindow.iterator();
      // gramNum is the 1-based index of the window token being read.
      for (int gramNum = 1 ;
           iter.hasNext() && builtGramSize < gramSize.getValue() ;
           ++gramNum) {
        nextToken = iter.next();
        if (builtGramSize < gramNum) {
          // This token's text is not in gramBuilder yet: append it, preceded
          // by the separator unless it is the first token of the gram.
          if (builtGramSize > 0) {
            gramBuilder.append(tokenSeparator);
          }
          gramBuilder.append(nextToken.termAtt.buffer(), 0,
                             nextToken.termAtt.length());
          ++builtGramSize;
        }
        if (isAllFiller && nextToken.isFiller) {
          // A gram of the current size made only of filler tokens is not
          // emitted; advance the gram size instead. The loop condition
          // re-reads gramSize.getValue() afterwards.
          if (gramNum == gramSize.getValue()) {
            gramSize.advance();
          }
        } else {
          isAllFiller = false;
        }
      }
      // isAllFiller can only be false after at least one loop iteration, so
      // nextToken is non-null inside this branch.
      // NOTE(review): if the loop ends with isAllFiller still true, nothing
      // is emitted and this call returns false - confirm how callers are
      // expected to treat that case.
      if ( ! isAllFiller && builtGramSize == gramSize.getValue()) {
        inputWindow.getFirst().attSource.copyTo(this);
        posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
        termAtt.setEmpty().append(gramBuilder);
        if (gramSize.getValue() > 1) {
          // Only grams of more than one token get the shingle token type.
          typeAtt.setType(tokenType);
          noShingleOutput = false;
        }
        // Keep the start offset copied from the first window token; take the
        // end offset from the last token read.
        offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
        isOutputHere = true;
        gramSize.advance();
        tokenAvailable = true;
      }
    }
    return tokenAvailable;
  }

  // NOTE(review): this field is not read or written anywhere in the visible
  // portion of this file.
  private boolean exhausted;

  /**
   * <p>Get the next token from the input stream.
   * <p>If the next token has
Other Lucene examples (source code examples)Here is a short list of links related to this Lucene ShingleFilter.java source code file: |
... this post is sponsored by my books ... | |
#1 New Release! |
FP Best Seller |
Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.
A percentage of advertising revenue from
pages under the /java/jwarehouse
URI on this website is
paid back to open source projects.