
Lucene example source code file (TokenSources.java)

This example Lucene source code file (TokenSources.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

analyzer, arraylist, illegalargumentexception, io, ioexception, storedtokenstream, string, termfreqvector, termpositionvector, token, tokenstream, util

The Lucene TokenSources.java source code

/*
 * Created on 28-Oct-2004
 */
package org.apache.lucene.search.highlight;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Comparator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.util.ArrayUtil;

/**
 * Hides implementation issues associated with obtaining a TokenStream for use
 * with the highlighter - it can be obtained from TermFreqVectors with offsets
 * and (optionally) positions, or from an Analyzer re-parsing the stored
 * content.
 */
public class TokenSources {
  /**
   * A convenience method that tries to first get a TermPositionVector for the
   * specified docId, then, falls back to using the passed in
   * {@link org.apache.lucene.document.Document} to retrieve the TokenStream.
   * This is useful when you already have the document, but would prefer to use
   * the vector first.
   * 
   * @param reader The {@link org.apache.lucene.index.IndexReader} to use to try
   *        and get the vector from
   * @param docId The docId to retrieve.
   * @param field The field to retrieve on the document
   * @param doc The document to fall back on
   * @param analyzer The analyzer to use for creating the TokenStream if the
   *        vector doesn't exist
   * @return The {@link org.apache.lucene.analysis.TokenStream} for the
   *         {@link org.apache.lucene.document.Fieldable} on the
   *         {@link org.apache.lucene.document.Document}
   * @throws IOException if there was an error loading
   */
  public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
      String field, Document doc, Analyzer analyzer) throws IOException {
    TokenStream ts = null;

    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
    if (tfv != null) {
      if (tfv instanceof TermPositionVector) {
        ts = getTokenStream((TermPositionVector) tfv);
      }
    }
    // No token info stored so fall back to analyzing raw content
    if (ts == null) {
      ts = getTokenStream(doc, field, analyzer);
    }
    return ts;
  }

  /**
   * A convenience method that tries a number of approaches to getting a token
   * stream. The cost of discovering that there are no term vectors in the
   * index is minimal (1000 invocations still register 0 ms), so this "lazy"
   * (flexible?) approach to coding is probably acceptable.
   * 
   * @param reader
   * @param docId
   * @param field
   * @param analyzer
   * @return A TokenStream built from the term vector if available, otherwise
   *         by re-analyzing the stored field (an IllegalArgumentException is
   *         thrown if the field was not stored)
   * @throws IOException
   */
  public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
      String field, Analyzer analyzer) throws IOException {
    TokenStream ts = null;

    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
    if (tfv != null) {
      if (tfv instanceof TermPositionVector) {
        ts = getTokenStream((TermPositionVector) tfv);
      }
    }
    // No token info stored so fall back to analyzing raw content
    if (ts == null) {
      ts = getTokenStream(reader, docId, field, analyzer);
    }
    return ts;
  }

  public static TokenStream getTokenStream(TermPositionVector tpv) {
    // assumes the worst and makes no assumptions about token position
    // sequences.
    return getTokenStream(tpv, false);
  }

  /**
   * Low level api. Returns a token stream or null if no offset info available
   * in index. This can be used to feed the highlighter with a pre-parsed token
   * stream
   * 
   * In my tests the speeds to recreate 1000 token streams using this method
   * are:
   * - with TermVector offset only data stored - 420 milliseconds
   * - with TermVector offset AND position data stored - 271 milliseconds
   *   (nb timings for TermVector with position data are based on a tokenizer
   *   with contiguous positions - no overlaps or gaps)
   * The cost of not using TermPositionVector to store pre-parsed content and
   * using an analyzer to re-parse the original content:
   * - reanalyzing the original content - 980 milliseconds
   *
   * The re-analyze timings will typically vary depending on:
   * 1) The complexity of the analyzer code (timings above were using a
   *    stemmer/lowercaser/stopword combo)
   * 2) The number of other fields (Lucene reads ALL fields off the disk when
   *    accessing just one document field - can cost dear!)
   * 3) Use of compression on field storage - could be faster due to
   *    compression (less disk IO) or slower (more CPU burn) depending on the
   *    content.
   * 
   * @param tpv
   * @param tokenPositionsGuaranteedContiguous true if the token position
   *        numbers have no overlaps or gaps. If looking to eke out the last
   *        drops of performance, set to true. If in doubt, set to false.
   */
  public static TokenStream getTokenStream(TermPositionVector tpv,
      boolean tokenPositionsGuaranteedContiguous) {
    if (!tokenPositionsGuaranteedContiguous && tpv.getTermPositions(0) != null) {
      return new TokenStreamFromTermPositionVector(tpv);
    }

    // an object used to iterate across an array of tokens
    final class StoredTokenStream extends TokenStream {
      Token tokens[];

      int currentToken = 0;

      CharTermAttribute termAtt;

      OffsetAttribute offsetAtt;

      PositionIncrementAttribute posincAtt;

      StoredTokenStream(Token tokens[]) {
        this.tokens = tokens;
        termAtt = addAttribute(CharTermAttribute.class);
        offsetAtt = addAttribute(OffsetAttribute.class);
        posincAtt = addAttribute(PositionIncrementAttribute.class);
      }

      @Override
      public boolean incrementToken() throws IOException {
        if (currentToken >= tokens.length) {
          return false;
        }
        Token token = tokens[currentToken++];
        clearAttributes();
        termAtt.setEmpty().append(token);
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
        posincAtt
            .setPositionIncrement(currentToken <= 1
                || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
                    .startOffset() ? 1 : 0);
        return true;
      }
    }
    // code to reconstruct the original sequence of Tokens
    String[] terms = tpv.getTerms();
    int[] freq = tpv.getTermFrequencies();
    int totalTokens = 0;
    for (int t = 0; t < freq.length; t++) {
      totalTokens += freq[t];
    }
    Token tokensInOriginalOrder[] = new Token[totalTokens];
    ArrayList<Token> unsortedTokens = null;
    for (int t = 0; t < freq.length; t++) {
      TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
      if (offsets == null) {
        throw new IllegalArgumentException(
            "Required TermVector Offset information was not found");
      }

      int[] pos = null;
      if (tokenPositionsGuaranteedContiguous) {
        // try get the token position info to speed up assembly of tokens into
        // sorted sequence
        pos = tpv.getTermPositions(t);
      }
      if (pos == null) {
        // tokens NOT stored with positions or not guaranteed contiguous - must
        // add to list and sort later
        if (unsortedTokens == null) {
          unsortedTokens = new ArrayList<Token>();
        }
        for (int tp = 0; tp < offsets.length; tp++) {
          Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp]
              .getEndOffset());
          unsortedTokens.add(token);
        }
      } else {
        // We have positions stored and a guarantee that the token position
        // information is contiguous

        // This may be fast BUT won't work if the Tokenizer used creates more
        // than one token in the same position or creates jumps in the position
        // numbers - this code would fail under those circumstances

        // tokens stored with positions - can use this to index straight into
        // sorted array
        for (int tp = 0; tp < pos.length; tp++) {
          Token token = new Token(terms[t], offsets[tp].getStartOffset(),
              offsets[tp].getEndOffset());
          tokensInOriginalOrder[pos[tp]] = token;
        }
      }
    }
    // If the field has been stored without position data we must perform a sort
    if (unsortedTokens != null) {
      tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens
          .size()]);
      ArrayUtil.mergeSort(tokensInOriginalOrder, new Comparator<Token>() {
        public int compare(Token t1, Token t2) {
          if (t1.startOffset() == t2.startOffset()) return t1.endOffset()
              - t2.endOffset();
          else return t1.startOffset() - t2.startOffset();
        }
      });
    }
    return new StoredTokenStream(tokensInOriginalOrder);
  }

  public static TokenStream getTokenStream(IndexReader reader, int docId,
      String field) throws IOException {
    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
    if (tfv == null) {
      throw new IllegalArgumentException(field + " in doc #" + docId
          + " does not have any term position data stored");
    }
    if (tfv instanceof TermPositionVector) {
      TermPositionVector tpv = (TermPositionVector) tfv;
      return getTokenStream(tpv);
    }
    throw new IllegalArgumentException(field + " in doc #" + docId
        + " does not have any term position data stored");
  }

  // convenience method
  public static TokenStream getTokenStream(IndexReader reader, int docId,
      String field, Analyzer analyzer) throws IOException {
    Document doc = reader.document(docId);
    return getTokenStream(doc, field, analyzer);
  }

  public static TokenStream getTokenStream(Document doc, String field,
      Analyzer analyzer) {
    String contents = doc.get(field);
    if (contents == null) {
      throw new IllegalArgumentException("Field " + field
          + " in document is not stored and cannot be analyzed");
    }
    return getTokenStream(field, contents, analyzer);
  }

  // convenience method
  public static TokenStream getTokenStream(String field, String contents,
      Analyzer analyzer) {
    try {
      return analyzer.reusableTokenStream(field, new StringReader(contents));
    } catch (IOException ex) {
      throw new RuntimeException(ex);
    }
  }

}
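A quick usage sketch (not part of the Lucene file above): this is roughly how the getAnyTokenStream convenience method is fed into the highlighter. It assumes the Lucene 3.x contrib-highlighter classes Highlighter and QueryScorer from the same org.apache.lucene.search.highlight package; the class name TokenSourcesUsage, the method name highlightField, and the broad throws Exception clause are just illustrative.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenSources;

public class TokenSourcesUsage {

  /**
   * Highlights one stored field of one document. getAnyTokenStream uses the
   * stored term vector (with offsets) when one exists, and otherwise falls
   * back to re-analyzing the stored field value with the given analyzer.
   */
  static String highlightField(IndexReader reader, int docId, String field,
      Query query, Analyzer analyzer) throws Exception {
    TokenStream tokenStream =
        TokenSources.getAnyTokenStream(reader, docId, field, analyzer);
    String storedText = reader.document(docId).get(field);
    // The single-argument Highlighter constructor uses the default formatter,
    // which wraps matching terms in <B>...</B>
    Highlighter highlighter = new Highlighter(new QueryScorer(query));
    return highlighter.getBestFragment(tokenStream, storedText);
  }
}

If the field was indexed with a term vector containing offsets, the highlighter gets a pre-parsed token stream; otherwise the stored text is re-analyzed, which (per the timings in the Javadoc above) is roughly two to three times slower.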

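One more hedged sketch: the fast getTokenStream(TermPositionVector) path only applies when the field was indexed with its term vector, offsets, and (ideally) positions. Assuming the Lucene 3.x Field API, indexing such a field might look like the following; the field name "contents" and the class/method names are illustrative.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

public class TermVectorIndexingSketch {

  /** Adds one document whose "contents" field can feed the fast highlighter path. */
  static void addDocument(IndexWriter writer, String text) throws Exception {
    Document doc = new Document();
    // Store the text itself and its term vector with offsets AND positions,
    // so TokenSources.getTokenStream(TermPositionVector) can rebuild the
    // token stream later without re-analyzing the stored content.
    doc.add(new Field("contents", text,
        Field.Store.YES,
        Field.Index.ANALYZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.addDocument(doc);
  }
}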
