alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (QueryParser.jj)

This example Lucene source code file (QueryParser.jj) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

carat, default, exception, io, ioexception, multiphrasequery, parseexception, parseexception, query, query, string, string, term, term, text, token, util

The Lucene QueryParser.jj source code

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

options {
  STATIC=false;
  JAVA_UNICODE_ESCAPE=true;
  USER_CHAR_STREAM=true;
}

PARSER_BEGIN(QueryParser)

package org.apache.lucene.queryParser;

import java.io.IOException;
import java.io.StringReader;
import java.text.Collator;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.VirtualMethod;

/**
 * This class is generated by JavaCC.  The most important method is
 * {@link #parse(String)}.
 *
 * The syntax for query strings is as follows:
 * A Query is a series of clauses.
 * A clause may be prefixed by:
 * <ul>
 * <li> a plus (+) or a minus (-) sign, indicating
 * that the clause is required or prohibited respectively; or
 * <li> a term followed by a colon, indicating the field to be searched.
 * This enables one to construct queries which search multiple fields.
 * </ul>
 *
 * A clause may be either:
 * <ul>
 * <li> a term, indicating all the documents that contain this term; or
 * <li> a nested query, enclosed in parentheses.  Note that this may be used
 * with a <code>+/- prefix to require any of a set of
 * terms.
 * </ul>
 *
 * Thus, in BNF, the query grammar is:
 * <pre>
 *   Query  ::= ( Clause )*
 *   Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
 * </pre>
 *
 * <p>
 * Examples of appropriately formatted queries can be found in the <a
 * href="../../../../../../queryparsersyntax.html">query syntax
 * documentation</a>.
 * </p>
 *
 * <p>
 * In {@link TermRangeQuery}s, QueryParser tries to detect date values, e.g.
 * <tt>date:[6/1/2005 TO 6/4/2005] produces a range query that searches
 * for "date" fields between 2005-06-01 and 2005-06-04. Note that the format
 * of the accepted input depends on {@link #setLocale(Locale) the locale}.
 * By default a date is converted into a search term using the deprecated
 * {@link DateField} for compatibility reasons.
 * To use the new {@link DateTools} to convert dates, a
 * {@link org.apache.lucene.document.DateTools.Resolution} has to be set.
 * </p>
 * <p>
 * The date resolution that shall be used for RangeQueries can be set
 * using {@link #setDateResolution(DateTools.Resolution)}
 * or {@link #setDateResolution(String, DateTools.Resolution)}. The former
 * sets the default date resolution for all fields, whereas the latter can
 * be used to set field specific date resolutions. Field specific date
 * resolutions take, if set, precedence over the default date resolution.
 * </p>
 * <p>
 * If you use neither {@link DateField} nor {@link DateTools} in your
 * index, you can create your own
 * query parser that inherits QueryParser and overwrites
 * {@link #getRangeQuery(String, String, String, boolean)} to
 * use a different method for date conversion.
 * </p>
 *
 * <p>Note that QueryParser is not thread-safe.

* * <p>NOTE: there is a new QueryParser in contrib, which matches * the same syntax as this class, but is more modular, * enabling substantial customization to how a query is created. * * <a name="version"/> * <p>NOTE: You must specify the required {@link Version} * compatibility when creating QueryParser: * <ul> * <li> As of 2.9, {@link #setEnablePositionIncrements} is true by * default. * <li> As of 3.1, {@link #setAutoGeneratePhraseQueries} is false by * default. * </ul> */ public class QueryParser { private static final int CONJ_NONE = 0; private static final int CONJ_AND = 1; private static final int CONJ_OR = 2; private static final int MOD_NONE = 0; private static final int MOD_NOT = 10; private static final int MOD_REQ = 11; // make it possible to call setDefaultOperator() without accessing // the nested class: /** Alternative form of QueryParser.Operator.AND */ public static final Operator AND_OPERATOR = Operator.AND; /** Alternative form of QueryParser.Operator.OR */ public static final Operator OR_OPERATOR = Operator.OR; /** The actual operator that parser uses to combine query terms */ private Operator operator = OR_OPERATOR; boolean lowercaseExpandedTerms = true; MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; boolean allowLeadingWildcard = false; boolean enablePositionIncrements = true; Analyzer analyzer; String field; int phraseSlop = 0; float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity; int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength; Locale locale = Locale.getDefault(); // the default date resolution DateTools.Resolution dateResolution = null; // maps field names to date resolutions Map<String,DateTools.Resolution> fieldToDateResolution = null; // The collator to use when determining range inclusion, // for use when constructing RangeQuerys. Collator rangeCollator = null; /** @deprecated remove when getFieldQuery is removed */ @Deprecated private static final VirtualMethod<QueryParser> getFieldQueryMethod = new VirtualMethod<QueryParser>(QueryParser.class, "getFieldQuery", String.class, String.class); /** @deprecated remove when getFieldQuery is removed */ @Deprecated private static final VirtualMethod<QueryParser> getFieldQueryWithQuotedMethod = new VirtualMethod<QueryParser>(QueryParser.class, "getFieldQuery", String.class, String.class, boolean.class); /** @deprecated remove when getFieldQuery is removed */ @Deprecated private final boolean hasNewAPI = VirtualMethod.compareImplementationDistance(getClass(), getFieldQueryWithQuotedMethod, getFieldQueryMethod) >= 0; // its ok for both to be overridden private boolean autoGeneratePhraseQueries; /** The default operator for parsing queries. * Use {@link QueryParser#setDefaultOperator} to change it. */ static public enum Operator { OR, AND } /** Constructs a query parser. * @param matchVersion Lucene version to match. See <a href="#version">above. * @param f the default field for query terms. * @param a used to find terms in the query text. */ public QueryParser(Version matchVersion, String f, Analyzer a) { this(new FastCharStream(new StringReader(""))); analyzer = a; field = f; if (matchVersion.onOrAfter(Version.LUCENE_29)) { enablePositionIncrements = true; } else { enablePositionIncrements = false; } if (matchVersion.onOrAfter(Version.LUCENE_31)) { setAutoGeneratePhraseQueries(false); } else { setAutoGeneratePhraseQueries(true); } } /** Parses a query string, returning a {@link org.apache.lucene.search.Query}. * @param query the query string to be parsed. * @throws ParseException if the parsing fails */ public Query parse(String query) throws ParseException { ReInit(new FastCharStream(new StringReader(query))); try { // TopLevelQuery is a Query followed by the end-of-input (EOF) Query res = TopLevelQuery(field); return res!=null ? res : newBooleanQuery(false); } catch (ParseException tme) { // rethrow to include the original query: ParseException e = new ParseException("Cannot parse '" +query+ "': " + tme.getMessage()); e.initCause(tme); throw e; } catch (TokenMgrError tme) { ParseException e = new ParseException("Cannot parse '" +query+ "': " + tme.getMessage()); e.initCause(tme); throw e; } catch (BooleanQuery.TooManyClauses tmc) { ParseException e = new ParseException("Cannot parse '" +query+ "': too many boolean clauses"); e.initCause(tmc); throw e; } } /** * @return Returns the analyzer. */ public Analyzer getAnalyzer() { return analyzer; } /** * @return Returns the field. */ public String getField() { return field; } /** * @see #setAutoGeneratePhraseQueries(boolean) */ public final boolean getAutoGeneratePhraseQueries() { return autoGeneratePhraseQueries; } /** * Set to true if phrase queries will be automatically generated * when the analyzer returns more than one term from whitespace * delimited text. * NOTE: this behavior may not be suitable for all languages. * <p> * Set to false if phrase queries should only be generated when * surrounded by double quotes. */ public final void setAutoGeneratePhraseQueries(boolean value) { if (value == false && !hasNewAPI) throw new IllegalArgumentException("You must implement the new API: getFieldQuery(String,String,boolean)" + " to use setAutoGeneratePhraseQueries(false)"); this.autoGeneratePhraseQueries = value; } /** * Get the minimal similarity for fuzzy queries. */ public float getFuzzyMinSim() { return fuzzyMinSim; } /** * Set the minimum similarity for fuzzy queries. * Default is 0.5f. */ public void setFuzzyMinSim(float fuzzyMinSim) { this.fuzzyMinSim = fuzzyMinSim; } /** * Get the prefix length for fuzzy queries. * @return Returns the fuzzyPrefixLength. */ public int getFuzzyPrefixLength() { return fuzzyPrefixLength; } /** * Set the prefix length for fuzzy queries. Default is 0. * @param fuzzyPrefixLength The fuzzyPrefixLength to set. */ public void setFuzzyPrefixLength(int fuzzyPrefixLength) { this.fuzzyPrefixLength = fuzzyPrefixLength; } /** * Sets the default slop for phrases. If zero, then exact phrase matches * are required. Default value is zero. */ public void setPhraseSlop(int phraseSlop) { this.phraseSlop = phraseSlop; } /** * Gets the default slop for phrases. */ public int getPhraseSlop() { return phraseSlop; } /** * Set to <code>true to allow leading wildcard characters. * <p> * When set, <code>* or ? are allowed as * the first character of a PrefixQuery and WildcardQuery. * Note that this can produce very slow * queries on big indexes. * <p> * Default: false. */ public void setAllowLeadingWildcard(boolean allowLeadingWildcard) { this.allowLeadingWildcard = allowLeadingWildcard; } /** * @see #setAllowLeadingWildcard(boolean) */ public boolean getAllowLeadingWildcard() { return allowLeadingWildcard; } /** * Set to <code>true to enable position increments in result query. * <p> * When set, result phrase and multi-phrase queries will * be aware of position increments. * Useful when e.g. a StopFilter increases the position increment of * the token that follows an omitted token. * <p> * Default: false. */ public void setEnablePositionIncrements(boolean enable) { this.enablePositionIncrements = enable; } /** * @see #setEnablePositionIncrements(boolean) */ public boolean getEnablePositionIncrements() { return enablePositionIncrements; } /** * Sets the boolean operator of the QueryParser. * In default mode (<code>OR_OPERATOR) terms without any modifiers * are considered optional: for example <code>capital of Hungary is equal to * <code>capital OR of OR Hungary.
* In <code>AND_OPERATOR mode terms are considered to be in conjunction: the * above mentioned query is parsed as <code>capital AND of AND Hungary */ public void setDefaultOperator(Operator op) { this.operator = op; } /** * Gets implicit operator setting, which will be either AND_OPERATOR * or OR_OPERATOR. */ public Operator getDefaultOperator() { return operator; } /** * Whether terms of wildcard, prefix, fuzzy and range queries are to be automatically * lower-cased or not. Default is <code>true. */ public void setLowercaseExpandedTerms(boolean lowercaseExpandedTerms) { this.lowercaseExpandedTerms = lowercaseExpandedTerms; } /** * @see #setLowercaseExpandedTerms(boolean) */ public boolean getLowercaseExpandedTerms() { return lowercaseExpandedTerms; } /** * By default QueryParser uses {@link MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} * when creating a PrefixQuery, WildcardQuery or RangeQuery. This implementation is generally preferable because it * a) Runs faster b) Does not have the scarcity of terms unduly influence score * c) avoids any "TooManyBooleanClauses" exception. * However, if your application really needs to use the * old-fashioned BooleanQuery expansion rewriting and the above * points are not relevant then use this to change * the rewrite method. */ public void setMultiTermRewriteMethod(MultiTermQuery.RewriteMethod method) { multiTermRewriteMethod = method; } /** * @see #setMultiTermRewriteMethod */ public MultiTermQuery.RewriteMethod getMultiTermRewriteMethod() { return multiTermRewriteMethod; } /** * Set locale used by date range parsing. */ public void setLocale(Locale locale) { this.locale = locale; } /** * Returns current locale, allowing access by subclasses. */ public Locale getLocale() { return locale; } /** * Sets the default date resolution used by RangeQueries for fields for which no * specific date resolutions has been set. Field specific resolutions can be set * with {@link #setDateResolution(String, DateTools.Resolution)}. * * @param dateResolution the default date resolution to set */ public void setDateResolution(DateTools.Resolution dateResolution) { this.dateResolution = dateResolution; } /** * Sets the date resolution used by RangeQueries for a specific field. * * @param fieldName field for which the date resolution is to be set * @param dateResolution date resolution to set */ public void setDateResolution(String fieldName, DateTools.Resolution dateResolution) { if (fieldName == null) { throw new IllegalArgumentException("Field cannot be null."); } if (fieldToDateResolution == null) { // lazily initialize HashMap fieldToDateResolution = new HashMap<String,DateTools.Resolution>(); } fieldToDateResolution.put(fieldName, dateResolution); } /** * Returns the date resolution that is used by RangeQueries for the given field. * Returns null, if no default or field specific date resolution has been set * for the given field. * */ public DateTools.Resolution getDateResolution(String fieldName) { if (fieldName == null) { throw new IllegalArgumentException("Field cannot be null."); } if (fieldToDateResolution == null) { // no field specific date resolutions set; return default date resolution instead return this.dateResolution; } DateTools.Resolution resolution = fieldToDateResolution.get(fieldName); if (resolution == null) { // no date resolutions set for the given field; return default date resolution instead resolution = this.dateResolution; } return resolution; } /** * Sets the collator used to determine index term inclusion in ranges * for RangeQuerys. * <p/> * <strong>WARNING: Setting the rangeCollator to a non-null * collator using this method will cause every single index Term in the * Field referenced by lowerTerm and/or upperTerm to be examined. * Depending on the number of index Terms in this Field, the operation could * be very slow. * * @param rc the collator to use when constructing RangeQuerys */ public void setRangeCollator(Collator rc) { rangeCollator = rc; } /** * @return the collator used to determine index term inclusion in ranges * for RangeQuerys. */ public Collator getRangeCollator() { return rangeCollator; } protected void addClause(List<BooleanClause> clauses, int conj, int mods, Query q) { boolean required, prohibited; // If this term is introduced by AND, make the preceding term required, // unless it's already prohibited if (clauses.size() > 0 && conj == CONJ_AND) { BooleanClause c = clauses.get(clauses.size()-1); if (!c.isProhibited()) c.setOccur(BooleanClause.Occur.MUST); } if (clauses.size() > 0 && operator == AND_OPERATOR && conj == CONJ_OR) { // If this term is introduced by OR, make the preceding term optional, // unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b) // notice if the input is a OR b, first term is parsed as required; without // this modification a OR b would parsed as +a OR b BooleanClause c = clauses.get(clauses.size()-1); if (!c.isProhibited()) c.setOccur(BooleanClause.Occur.SHOULD); } // We might have been passed a null query; the term might have been // filtered away by the analyzer. if (q == null) return; if (operator == OR_OPERATOR) { // We set REQUIRED if we're introduced by AND or +; PROHIBITED if // introduced by NOT or -; make sure not to set both. prohibited = (mods == MOD_NOT); required = (mods == MOD_REQ); if (conj == CONJ_AND && !prohibited) { required = true; } } else { // We set PROHIBITED if we're introduced by NOT or -; We set REQUIRED // if not PROHIBITED and not introduced by OR prohibited = (mods == MOD_NOT); required = (!prohibited && conj != CONJ_OR); } if (required && !prohibited) clauses.add(newBooleanClause(q, BooleanClause.Occur.MUST)); else if (!required && !prohibited) clauses.add(newBooleanClause(q, BooleanClause.Occur.SHOULD)); else if (!required && prohibited) clauses.add(newBooleanClause(q, BooleanClause.Occur.MUST_NOT)); else throw new RuntimeException("Clause cannot be both required and prohibited"); } /** * @deprecated Use {@link #getFieldQuery(String,String,boolean)} instead. */ @Deprecated protected Query getFieldQuery(String field, String queryText) throws ParseException { // treat the text as if it was quoted, to drive phrase logic with old versions. return getFieldQuery(field, queryText, true); } /** * @exception ParseException throw in overridden method to disallow */ protected Query getFieldQuery(String field, String queryText, boolean quoted) throws ParseException { // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or nothing based on the term count TokenStream source; try { source = analyzer.reusableTokenStream(field, new StringReader(queryText)); source.reset(); } catch (IOException e) { source = analyzer.tokenStream(field, new StringReader(queryText)); } CachingTokenFilter buffer = new CachingTokenFilter(source); CharTermAttribute termAtt = null; PositionIncrementAttribute posIncrAtt = null; int numTokens = 0; boolean success = false; try { buffer.reset(); success = true; } catch (IOException e) { // success==false if we hit an exception } if (success) { if (buffer.hasAttribute(CharTermAttribute.class)) { termAtt = buffer.getAttribute(CharTermAttribute.class); } if (buffer.hasAttribute(PositionIncrementAttribute.class)) { posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class); } } int positionCount = 0; boolean severalTokensAtSamePosition = false; boolean hasMoreTokens = false; if (termAtt != null) { try { hasMoreTokens = buffer.incrementToken(); while (hasMoreTokens) { numTokens++; int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1; if (positionIncrement != 0) { positionCount += positionIncrement; } else { severalTokensAtSamePosition = true; } hasMoreTokens = buffer.incrementToken(); } } catch (IOException e) { // ignore } } try { // rewind the buffer stream buffer.reset(); // close original stream - all tokens buffered source.close(); } catch (IOException e) { // ignore } if (numTokens == 0) return null; else if (numTokens == 1) { String term = null; try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.toString(); } catch (IOException e) { // safe to ignore, because we know the number of tokens } return newTermQuery(new Term(field, term)); } else { if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries)) { if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) { // no phrase query: BooleanQuery q = newBooleanQuery(positionCount == 1); BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD; for (int i = 0; i < numTokens; i++) { String term = null; try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.toString(); } catch (IOException e) { // safe to ignore, because we know the number of tokens } Query currentQuery = newTermQuery( new Term(field, term)); q.add(currentQuery, occur); } return q; } else { // phrase query: MultiPhraseQuery mpq = newMultiPhraseQuery(); mpq.setSlop(phraseSlop); List<Term> multiTerms = new ArrayList(); int position = -1; for (int i = 0; i < numTokens; i++) { String term = null; int positionIncrement = 1; try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.toString(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { // safe to ignore, because we know the number of tokens } if (positionIncrement > 0 && multiTerms.size() > 0) { if (enablePositionIncrements) { mpq.add(multiTerms.toArray(new Term[0]),position); } else { mpq.add(multiTerms.toArray(new Term[0])); } multiTerms.clear(); } position += positionIncrement; multiTerms.add(new Term(field, term)); } if (enablePositionIncrements) { mpq.add(multiTerms.toArray(new Term[0]),position); } else { mpq.add(multiTerms.toArray(new Term[0])); } return mpq; } } else { PhraseQuery pq = newPhraseQuery(); pq.setSlop(phraseSlop); int position = -1; for (int i = 0; i < numTokens; i++) { String term = null; int positionIncrement = 1; try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.toString(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { // safe to ignore, because we know the number of tokens } if (enablePositionIncrements) { position += positionIncrement; pq.add(new Term(field, term),position); } else { pq.add(new Term(field, term)); } } return pq; } } } /** * Base implementation delegates to {@link #getFieldQuery(String,String,boolean)}. * This method may be overridden, for example, to return * a SpanNearQuery instead of a PhraseQuery. * * @exception ParseException throw in overridden method to disallow */ protected Query getFieldQuery(String field, String queryText, int slop) throws ParseException { Query query = hasNewAPI ? getFieldQuery(field, queryText, true) : getFieldQuery(field, queryText); if (query instanceof PhraseQuery) { ((PhraseQuery) query).setSlop(slop); } if (query instanceof MultiPhraseQuery) { ((MultiPhraseQuery) query).setSlop(slop); } return query; } /** * @exception ParseException throw in overridden method to disallow */ protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive) throws ParseException { if (lowercaseExpandedTerms) { part1 = part1.toLowerCase(); part2 = part2.toLowerCase(); } try { DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, locale); df.setLenient(true); Date d1 = df.parse(part1); Date d2 = df.parse(part2); if (inclusive) { // The user can only specify the date, not the time, so make sure // the time is set to the latest possible time of that date to really // include all documents: Calendar cal = Calendar.getInstance(locale); cal.setTime(d2); cal.set(Calendar.HOUR_OF_DAY, 23); cal.set(Calendar.MINUTE, 59); cal.set(Calendar.SECOND, 59); cal.set(Calendar.MILLISECOND, 999); d2 = cal.getTime(); } DateTools.Resolution resolution = getDateResolution(field); if (resolution == null) { // no default or field specific date resolution has been set, // use deprecated DateField to maintain compatibility with // pre-1.9 Lucene versions. part1 = DateField.dateToString(d1); part2 = DateField.dateToString(d2); } else { part1 = DateTools.dateToString(d1, resolution); part2 = DateTools.dateToString(d2, resolution); } } catch (Exception e) { } return newRangeQuery(field, part1, part2, inclusive); } /** * Builds a new BooleanQuery instance * @param disableCoord disable coord * @return new BooleanQuery instance */ protected BooleanQuery newBooleanQuery(boolean disableCoord) { return new BooleanQuery(disableCoord); } /** * Builds a new BooleanClause instance * @param q sub query * @param occur how this clause should occur when matching documents * @return new BooleanClause instance */ protected BooleanClause newBooleanClause(Query q, BooleanClause.Occur occur) { return new BooleanClause(q, occur); } /** * Builds a new TermQuery instance * @param term term * @return new TermQuery instance */ protected Query newTermQuery(Term term){ return new TermQuery(term); } /** * Builds a new PhraseQuery instance * @return new PhraseQuery instance */ protected PhraseQuery newPhraseQuery(){ return new PhraseQuery(); } /** * Builds a new MultiPhraseQuery instance * @return new MultiPhraseQuery instance */ protected MultiPhraseQuery newMultiPhraseQuery(){ return new MultiPhraseQuery(); } /** * Builds a new PrefixQuery instance * @param prefix Prefix term * @return new PrefixQuery instance */ protected Query newPrefixQuery(Term prefix){ PrefixQuery query = new PrefixQuery(prefix); query.setRewriteMethod(multiTermRewriteMethod); return query; } /** * Builds a new FuzzyQuery instance * @param term Term * @param minimumSimilarity minimum similarity * @param prefixLength prefix length * @return new FuzzyQuery Instance */ protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) { // FuzzyQuery doesn't yet allow constant score rewrite return new FuzzyQuery(term,minimumSimilarity,prefixLength); } /** * Builds a new TermRangeQuery instance * @param field Field * @param part1 min * @param part2 max * @param inclusive true if range is inclusive * @return new TermRangeQuery instance */ protected Query newRangeQuery(String field, String part1, String part2, boolean inclusive) { final TermRangeQuery query = new TermRangeQuery(field, part1, part2, inclusive, inclusive, rangeCollator); query.setRewriteMethod(multiTermRewriteMethod); return query; } /** * Builds a new MatchAllDocsQuery instance * @return new MatchAllDocsQuery instance */ protected Query newMatchAllDocsQuery() { return new MatchAllDocsQuery(); } /** * Builds a new WildcardQuery instance * @param t wildcard term * @return new WildcardQuery instance */ protected Query newWildcardQuery(Term t) { WildcardQuery query = new WildcardQuery(t); query.setRewriteMethod(multiTermRewriteMethod); return query; } /** * Factory method for generating query, given a set of clauses. * By default creates a boolean query composed of clauses passed in. * * Can be overridden by extending classes, to modify query being * returned. * * @param clauses List that contains {@link BooleanClause} instances * to join. * * @return Resulting {@link Query} object. * @exception ParseException throw in overridden method to disallow */ protected Query getBooleanQuery(List<BooleanClause> clauses) throws ParseException { return getBooleanQuery(clauses, false); } /** * Factory method for generating query, given a set of clauses. * By default creates a boolean query composed of clauses passed in. * * Can be overridden by extending classes, to modify query being * returned. * * @param clauses List that contains {@link BooleanClause} instances * to join. * @param disableCoord true if coord scoring should be disabled. * * @return Resulting {@link Query} object. * @exception ParseException throw in overridden method to disallow */ protected Query getBooleanQuery(List<BooleanClause> clauses, boolean disableCoord) throws ParseException { if (clauses.size()==0) { return null; // all clause words were filtered away by the analyzer. } BooleanQuery query = newBooleanQuery(disableCoord); for(final BooleanClause clause: clauses) { query.add(clause); } return query; } /** * Factory method for generating a query. Called when parser * parses an input term token that contains one or more wildcard * characters (? and *), but is not a prefix term token (one * that has just a single * character at the end) *<p> * Depending on settings, prefix term may be lower-cased * automatically. It will not go through the default Analyzer, * however, since normal Analyzers are unlikely to work properly * with wildcard templates. *<p> * Can be overridden by extending classes, to provide custom handling for * wildcard queries, which may be necessary due to missing analyzer calls. * * @param field Name of the field query will use. * @param termStr Term token that contains one or more wild card * characters (? or *), but is not simple prefix term * * @return Resulting {@link Query} built for the term * @exception ParseException throw in overridden method to disallow */ protected Query getWildcardQuery(String field, String termStr) throws ParseException { if ("*".equals(field)) { if ("*".equals(termStr)) return newMatchAllDocsQuery(); } if (!allowLeadingWildcard && (termStr.startsWith("*") || termStr.startsWith("?"))) throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"); if (lowercaseExpandedTerms) { termStr = termStr.toLowerCase(); } Term t = new Term(field, termStr); return newWildcardQuery(t); } /** * Factory method for generating a query (similar to * {@link #getWildcardQuery}). Called when parser parses an input term * token that uses prefix notation; that is, contains a single '*' wildcard * character as its last character. Since this is a special case * of generic wildcard term, and such a query can be optimized easily, * this usually results in a different query object. *<p> * Depending on settings, a prefix term may be lower-cased * automatically. It will not go through the default Analyzer, * however, since normal Analyzers are unlikely to work properly * with wildcard templates. *<p> * Can be overridden by extending classes, to provide custom handling for * wild card queries, which may be necessary due to missing analyzer calls. * * @param field Name of the field query will use. * @param termStr Term token to use for building term for the query * (<b>without trailing '*' character!) * * @return Resulting {@link Query} built for the term * @exception ParseException throw in overridden method to disallow */ protected Query getPrefixQuery(String field, String termStr) throws ParseException { if (!allowLeadingWildcard && termStr.startsWith("*")) throw new ParseException("'*' not allowed as first character in PrefixQuery"); if (lowercaseExpandedTerms) { termStr = termStr.toLowerCase(); } Term t = new Term(field, termStr); return newPrefixQuery(t); } /** * Factory method for generating a query (similar to * {@link #getWildcardQuery}). Called when parser parses * an input term token that has the fuzzy suffix (~) appended. * * @param field Name of the field query will use. * @param termStr Term token to use for building term for the query * * @return Resulting {@link Query} built for the term * @exception ParseException throw in overridden method to disallow */ protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException { if (lowercaseExpandedTerms) { termStr = termStr.toLowerCase(); } Term t = new Term(field, termStr); return newFuzzyQuery(t, minSimilarity, fuzzyPrefixLength); } /** * Returns a String where the escape char has been * removed, or kept only once if there was a double escape. * * Supports escaped unicode characters, e. g. translates * <code>\\u0041 to A. * */ private String discardEscapeChar(String input) throws ParseException { // Create char array to hold unescaped char sequence char[] output = new char[input.length()]; // The length of the output can be less than the input // due to discarded escape chars. This variable holds // the actual length of the output int length = 0; // We remember whether the last processed character was // an escape character boolean lastCharWasEscapeChar = false; // The multiplier the current unicode digit must be multiplied with. // E. g. the first digit must be multiplied with 16^3, the second with 16^2... int codePointMultiplier = 0; // Used to calculate the codepoint of the escaped unicode character int codePoint = 0; for (int i = 0; i < input.length(); i++) { char curChar = input.charAt(i); if (codePointMultiplier > 0) { codePoint += hexToInt(curChar) * codePointMultiplier; codePointMultiplier >>>= 4; if (codePointMultiplier == 0) { output[length++] = (char)codePoint; codePoint = 0; } } else if (lastCharWasEscapeChar) { if (curChar == 'u') { // found an escaped unicode character codePointMultiplier = 16 * 16 * 16; } else { // this character was escaped output[length] = curChar; length++; } lastCharWasEscapeChar = false; } else { if (curChar == '\\') { lastCharWasEscapeChar = true; } else { output[length] = curChar; length++; } } } if (codePointMultiplier > 0) { throw new ParseException("Truncated unicode escape sequence."); } if (lastCharWasEscapeChar) { throw new ParseException("Term can not end with escape character."); } return new String(output, 0, length); } /** Returns the numeric value of the hexadecimal character */ private static final int hexToInt(char c) throws ParseException { if ('0' <= c && c <= '9') { return c - '0'; } else if ('a' <= c && c <= 'f'){ return c - 'a' + 10; } else if ('A' <= c && c <= 'F') { return c - 'A' + 10; } else { throw new ParseException("None-hex character in unicode escape sequence: " + c); } } /** * Returns a String where those characters that QueryParser * expects to be escaped are escaped by a preceding <code>\. */ public static String escape(String s) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); // These characters are part of the query syntax and must be escaped if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&') { sb.append('\\'); } sb.append(c); } return sb.toString(); } /** * Command line tool to test QueryParser, using {@link org.apache.lucene.analysis.SimpleAnalyzer}. * Usage:<br> * <code>java org.apache.lucene.queryParser.QueryParser <input> */ public static void main(String[] args) throws Exception { if (args.length == 0) { System.out.println("Usage: java org.apache.lucene.queryParser.QueryParser <input>"); System.exit(0); } QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "field", new org.apache.lucene.analysis.SimpleAnalyzer()); Query q = qp.parse(args[0]); System.out.println(q.toString("field")); } } PARSER_END(QueryParser) /* ***************** */ /* Token Definitions */ /* ***************** */ <*> TOKEN : { <#_NUM_CHAR: ["0"-"9"] > // every character that follows a backslash is considered as an escaped character | <#_ESCAPED_CHAR: "\\" ~[] > | <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^", "[", "]", "\"", "{", "}", "~", "*", "?", "\\" ] | <_ESCAPED_CHAR> ) > | <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" ) > | <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") > | <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) > } <DEFAULT, RangeIn, RangeEx> SKIP : { < <_WHITESPACE>> } <DEFAULT> TOKEN : { <AND: ("AND" | "&&") > | <OR: ("OR" | "||") > | <NOT: ("NOT" | "!") > | <PLUS: "+" > | <MINUS: "-" > | <LPAREN: "(" > | <RPAREN: ")" > | <COLON: ":" > | <STAR: "*" > | <CARAT: "^" > : Boost | <QUOTED: "\"" (<_QUOTED_CHAR>)* "\""> | <TERM: <_TERM_START_CHAR> (<_TERM_CHAR>)* > | <FUZZY_SLOP: "~" ( (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? )? > | <PREFIXTERM: ("*") | ( <_TERM_START_CHAR> (<_TERM_CHAR>)* "*" ) > | <WILDTERM: (<_TERM_START_CHAR> | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > | <RANGEIN_START: "[" > : RangeIn | <RANGEEX_START: "{" > : RangeEx } <Boost> TOKEN : { <NUMBER: (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT } <RangeIn> TOKEN : { <RANGEIN_TO: "TO"> | <RANGEIN_END: "]"> : DEFAULT | <RANGEIN_QUOTED: "\"" (~["\""] | "\\\"")+ "\""> | <RANGEIN_GOOP: (~[ " ", "]" ])+ > } <RangeEx> TOKEN : { <RANGEEX_TO: "TO"> | <RANGEEX_END: "}"> : DEFAULT | <RANGEEX_QUOTED: "\"" (~["\""] | "\\\"")+ "\""> | <RANGEEX_GOOP: (~[ " ", "}" ])+ > } // * Query ::= ( Clause )* // * Clause ::= ["+", "-"] [<TERM> ":"] ( | "(" Query ")" ) int Conjunction() : { int ret = CONJ_NONE; } { [ <AND> { ret = CONJ_AND; } | <OR> { ret = CONJ_OR; } ] { return ret; } } int Modifiers() : { int ret = MOD_NONE; } { [ <PLUS> { ret = MOD_REQ; } | <MINUS> { ret = MOD_NOT; } | <NOT> { ret = MOD_NOT; } ] { return ret; } } // This makes sure that there is no garbage after the query string Query TopLevelQuery(String field) : { Query q; } { q=Query(field) <EOF> { return q; } } Query Query(String field) : { List<BooleanClause> clauses = new ArrayList(); Query q, firstQuery=null; int conj, mods; } { mods=Modifiers() q=Clause(field) { addClause(clauses, CONJ_NONE, mods, q); if (mods == MOD_NONE) firstQuery=q; } ( conj=Conjunction() mods=Modifiers() q=Clause(field) { addClause(clauses, conj, mods, q); } )* { if (clauses.size() == 1 && firstQuery != null) return firstQuery; else { return getBooleanQuery(clauses); } } } Query Clause(String field) : { Query q; Token fieldToken=null, boost=null; } { [ LOOKAHEAD(2) ( fieldToken=<TERM> {field=discardEscapeChar(fieldToken.image);} | <STAR> {field="*";} ) ] ( q=Term(field) | <LPAREN> q=Query(field) ( boost=)? ) { if (boost != null) { float f = (float)1.0; try { f = Float.valueOf(boost.image).floatValue(); q.setBoost(f); } catch (Exception ignored) { } } return q; } } Query Term(String field) : { Token term, boost=null, fuzzySlop=null, goop1, goop2; boolean prefix = false; boolean wildcard = false; boolean fuzzy = false; Query q; } { ( ( term=<TERM> | term=<STAR> { wildcard=true; } | term=<PREFIXTERM> { prefix=true; } | term=<WILDTERM> { wildcard=true; } | term=<NUMBER> ) [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ] [ <CARAT> boost= [ fuzzySlop= { fuzzy=true; } ] ] { String termImage=discardEscapeChar(term.image); if (wildcard) { q = getWildcardQuery(field, termImage); } else if (prefix) { q = getPrefixQuery(field, discardEscapeChar(term.image.substring (0, term.image.length()-1))); } else if (fuzzy) { float fms = fuzzyMinSim; try { fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); } catch (Exception ignored) { } if(fms < 0.0f || fms > 1.0f){ throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !"); } q = getFuzzyQuery(field, termImage,fms); } else { q = hasNewAPI ? getFieldQuery(field, termImage, false) : getFieldQuery(field, termImage); } } | ( <RANGEIN_START> ( goop1=|goop1= ) [ <RANGEIN_TO> ] ( goop2=|goop2= ) <RANGEIN_END> ) [ <CARAT> boost= ] { if (goop1.kind == RANGEIN_QUOTED) { goop1.image = goop1.image.substring(1, goop1.image.length()-1); } if (goop2.kind == RANGEIN_QUOTED) { goop2.image = goop2.image.substring(1, goop2.image.length()-1); } q = getRangeQuery(field, discardEscapeChar(goop1.image), discardEscapeChar(goop2.image), true); } | ( <RANGEEX_START> ( goop1=|goop1= ) [ <RANGEEX_TO> ] ( goop2=|goop2= ) <RANGEEX_END> ) [ <CARAT> boost= ] { if (goop1.kind == RANGEEX_QUOTED) { goop1.image = goop1.image.substring(1, goop1.image.length()-1); } if (goop2.kind == RANGEEX_QUOTED) { goop2.image = goop2.image.substring(1, goop2.image.length()-1); } q = getRangeQuery(field, discardEscapeChar(goop1.image), discardEscapeChar(goop2.image), false); } | term=<QUOTED> [ fuzzySlop=<FUZZY_SLOP> ] [ <CARAT> boost= ] { int s = phraseSlop; if (fuzzySlop != null) { try { s = Float.valueOf(fuzzySlop.image.substring(1)).intValue(); } catch (Exception ignored) { } } q = getFieldQuery(field, discardEscapeChar(term.image.substring(1, term.image.length()-1)), s); } ) { if (boost != null) { float f = (float) 1.0; try { f = Float.valueOf(boost.image).floatValue(); } catch (Exception ignored) { /* Should this be handled somehow? (defaults to "no boost", if * boost number is invalid) */ } // avoid boosting null queries, such as those caused by stop words if (q != null) { q.setBoost(f); } } return q; } }

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene QueryParser.jj source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.