alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (AnalyzingQueryParser.java)

This example Lucene source code file (AnalyzingQueryParser.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

cannot, chartermattribute, io, ioexception, override, parseexception, rangequery, string, stringbuilder, stringreader, tokenstream, util

The Lucene AnalyzingQueryParser.java source code

package org.apache.lucene.queryParser.analyzing;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

/**
 * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
 * are also passed through the given analyzer, but wild card characters (like <code>*) 
 * don't get removed from the search terms.
 * 
 * <p>Warning: This class should only be used with analyzers that do not use stopwords
 * or that add tokens. Also, several stemming analyzers are inappropriate: for example, GermanAnalyzer 
 * will turn <code>Häuser into hau, but H?user will 
 * become <code>h?user when using this parser and thus no match would be found (i.e.
 * using this parser will be no improvement over QueryParser in such cases). 
 *
 * @version $Revision$, $Date$
 */
public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryParser {

  /**
   * Constructs a query parser.
   * @param field    the default field for query terms.
   * @param analyzer used to find terms in the query text.
   */
  public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
    super(matchVersion, field, analyzer);
  }

  /**
   * Called when parser
   * parses an input term token that contains one or more wildcard
   * characters (like <code>*), but is not a prefix term token (one
   * that has just a single * character at the end).
   * <p>
   * Example: will be called for <code>H?user or for H*user 
   * but not for <code>*user.
   * <p>
   * Depending on analyzer and settings, a wildcard term may (most probably will)
   * be lower-cased automatically. It <b>will go through the default Analyzer.
   * <p>
   * Overrides super class, by passing terms through analyzer.
   *
   * @param  field   Name of the field query will use.
   * @param  termStr Term token that contains one or more wild card
   *                 characters (? or *), but is not simple prefix term
   *
   * @return Resulting {@link Query} built for the term
   * @throws ParseException
   */
  @Override
  protected Query getWildcardQuery(String field, String termStr) throws ParseException {
    List<String> tlist = new ArrayList();
    List<String> wlist = new ArrayList();
    /* somewhat a hack: find/store wildcard chars
     * in order to put them back after analyzing */
    boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
    StringBuilder tmpBuffer = new StringBuilder();
    char[] chars = termStr.toCharArray();
    for (int i = 0; i < termStr.length(); i++) {
      if (chars[i] == '?' || chars[i] == '*') {
        if (isWithinToken) {
          tlist.add(tmpBuffer.toString());
          tmpBuffer.setLength(0);
        }
        isWithinToken = false;
      } else {
        if (!isWithinToken) {
          wlist.add(tmpBuffer.toString());
          tmpBuffer.setLength(0);
        }
        isWithinToken = true;
      }
      tmpBuffer.append(chars[i]);
    }
    if (isWithinToken) {
      tlist.add(tmpBuffer.toString());
    } else {
      wlist.add(tmpBuffer.toString());
    }

    // get Analyzer from superclass and tokenize the term
    TokenStream source;
    
    int countTokens = 0;
    try {
      source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
      source.reset();
    } catch (IOException e1) {
      throw new RuntimeException(e1);
    }
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
    while (true) {
      try {
        if (!source.incrementToken()) break;
      } catch (IOException e) {
        break;
      }
      String term = termAtt.toString();
      if (!"".equals(term)) {
        try {
          tlist.set(countTokens++, term);
        } catch (IndexOutOfBoundsException ioobe) {
          countTokens = -1;
        }
      }
    }
    try {
      source.end();
      source.close();
    } catch (IOException e) {
      // ignore
    }

    if (countTokens != tlist.size()) {
      /* this means that the analyzer used either added or consumed 
       * (common for a stemmer) tokens, and we can't build a WildcardQuery */
      throw new ParseException("Cannot build WildcardQuery with analyzer "
          + getAnalyzer().getClass() + " - tokens added or lost");
    }

    if (tlist.size() == 0) {
      return null;
    } else if (tlist.size() == 1) {
      if (wlist != null && wlist.size() == 1) {
        /* if wlist contains one wildcard, it must be at the end, because:
         * 1) wildcards are not allowed in 1st position of a term by QueryParser
         * 2) if wildcard was *not* in end, there would be *two* or more tokens */
        return super.getWildcardQuery(field, tlist.get(0)
            + wlist.get(0).toString());
      } else {
        /* we should never get here! if so, this method was called
         * with a termStr containing no wildcard ... */
        throw new IllegalArgumentException("getWildcardQuery called without wildcard");
      }
    } else {
      /* the term was tokenized, let's rebuild to one token
       * with wildcards put back in postion */
      StringBuilder sb = new StringBuilder();
      for (int i = 0; i < tlist.size(); i++) {
        sb.append( tlist.get(i));
        if (wlist != null && wlist.size() > i) {
          sb.append(wlist.get(i));
        }
      }
      return super.getWildcardQuery(field, sb.toString());
    }
  }

  /**
   * Called when parser parses an input term
   * token that uses prefix notation; that is, contains a single '*' wildcard
   * character as its last character. Since this is a special case
   * of generic wildcard term, and such a query can be optimized easily,
   * this usually results in a different query object.
   * <p>
   * Depending on analyzer and settings, a prefix term may (most probably will)
   * be lower-cased automatically. It <b>will go through the default Analyzer.
   * <p>
   * Overrides super class, by passing terms through analyzer.
   *
   * @param  field   Name of the field query will use.
   * @param  termStr Term token to use for building term for the query
   *                 (<b>without trailing '*' character!)
   *
   * @return Resulting {@link Query} built for the term
   * @throws ParseException
   */
  @Override
  protected Query getPrefixQuery(String field, String termStr) throws ParseException {
    // get Analyzer from superclass and tokenize the term
    TokenStream source;
    List<String> tlist = new ArrayList();
    try {
      source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
      source.reset();
    } catch (IOException e1) {
      throw new RuntimeException(e1);
    }
    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
    while (true) {
      try {
        if (!source.incrementToken()) break;
      } catch (IOException e) {
        break;
      }
      tlist.add(termAtt.toString());
    }

    try {
      source.end();
      source.close();
    } catch (IOException e) {
      // ignore
    }

    if (tlist.size() == 1) {
      return super.getPrefixQuery(field, tlist.get(0));
    } else {
      /* this means that the analyzer used either added or consumed
       * (common for a stemmer) tokens, and we can't build a PrefixQuery */
      throw new ParseException("Cannot build PrefixQuery with analyzer "
          + getAnalyzer().getClass()
          + (tlist.size() > 1 ? " - token(s) added" : " - token consumed"));
    }
  }

  /**
   * Called when parser parses an input term token that has the fuzzy suffix (~) appended.
   * <p>
   * Depending on analyzer and settings, a fuzzy term may (most probably will)
   * be lower-cased automatically. It <b>will go through the default Analyzer.
   * <p>
   * Overrides super class, by passing terms through analyzer.
   *
   * @param field Name of the field query will use.
   * @param termStr Term token to use for building term for the query
   *
   * @return Resulting {@link Query} built for the term
   * @exception ParseException
   */
  @Override
  protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
      throws ParseException {
    // get Analyzer from superclass and tokenize the term
    TokenStream source = null;
    String nextToken = null;
    boolean multipleTokens = false;
    
    try {
      source = getAnalyzer().reusableTokenStream(field, new StringReader(termStr));
      CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
      source.reset();
      if (source.incrementToken()) {
        nextToken = termAtt.toString();
      }
      multipleTokens = source.incrementToken();
    } catch (IOException e) {
      nextToken = null;
    }

    try {
      source.end();
      source.close();
    } catch (IOException e) {
      // ignore
    }

    if (multipleTokens) {
      throw new ParseException("Cannot build FuzzyQuery with analyzer " + getAnalyzer().getClass()
          + " - tokens were added");
    }

    return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
  }

  /**
   * Overrides super class, by passing terms through analyzer.
   * @exception ParseException
   */
  @Override
  protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive)
      throws ParseException {
    // get Analyzer from superclass and tokenize the terms
    TokenStream source = null;
    CharTermAttribute termAtt = null;
    boolean multipleTokens = false;

    if (part1 != null) {
      // part1
      try {
        source = getAnalyzer().reusableTokenStream(field, new StringReader(part1));
        termAtt = source.addAttribute(CharTermAttribute.class);
        source.reset();
        multipleTokens = false;


        if (source.incrementToken()) {
          part1 = termAtt.toString();
        }
        multipleTokens = source.incrementToken();
      } catch (IOException e) {
        // ignore
      }

      try {
        source.end();
        source.close();
      } catch (IOException e) {
        // ignore
      }
      if (multipleTokens) {
        throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
            + " - tokens were added to part1");
      }
    }
    try {
      source.close();
    } catch (IOException e) {
      // ignore
    }
    if (multipleTokens) {
      throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
          + " - tokens were added to part1");
    }

    if (part2 != null) {
      try {
        // part2
        source = getAnalyzer().reusableTokenStream(field, new StringReader(part2));
        termAtt = source.addAttribute(CharTermAttribute.class);
        source.reset();
        if (source.incrementToken()) {
          part2 = termAtt.toString();
        }
        multipleTokens = source.incrementToken();
      } catch (IOException e) {
        // ignore
      }
      try {
        source.end();
        source.close();
      } catch (IOException e) {
        // ignore
      }
      if (multipleTokens) {
        throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
            + " - tokens were added to part2");
      }
    }
    try {
      source.close();
    } catch (IOException e) {
      // ignore
    }
    if (multipleTokens) {
      throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
          + " - tokens were added to part2");
    }
    return super.getRangeQuery(field, part1, part2, inclusive);
  }

}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene AnalyzingQueryParser.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.