alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (StandardSyntaxParser.jj)

This example Lucene source code file (StandardSyntaxParser.jj) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

and, carat, carat, default, exception, io, or, parametricquerynode, querynode, querynode, rangein_quoted, token, token, util, vector, vector

The Lucene StandardSyntaxParser.jj source code

/**
 * Standard file is based on the TextParser.jj from lucene 2.3
 */

// JavaCC generator options for this grammar.
options {
  // generate instance (non-static) parser members
  STATIC=false;
  // process Java-style \uXXXX escapes in the input stream before tokenizing
  JAVA_UNICODE_ESCAPE=true;
  // let JavaCC generate its own character stream class instead of a user-supplied one
  USER_CHAR_STREAM=false;
  // token literals such as "AND", "OR", "NOT" and "TO" are matched case-sensitively
  IGNORE_CASE=false;
  // target language level of the generated Java code
  JDK_VERSION="1.5";
}

PARSER_BEGIN(StandardSyntaxParser)
package org.apache.lucene.queryParser.standard.parser;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

import org.apache.lucene.messages.Message;
import org.apache.lucene.messages.MessageImpl;
import org.apache.lucene.queryParser.core.QueryNodeError;
import org.apache.lucene.queryParser.core.QueryNodeException;
import org.apache.lucene.queryParser.core.QueryNodeParseException;
import org.apache.lucene.queryParser.core.messages.QueryParserMessages;
import org.apache.lucene.queryParser.core.nodes.AndQueryNode;
import org.apache.lucene.queryParser.core.nodes.BooleanQueryNode;
import org.apache.lucene.queryParser.core.nodes.BoostQueryNode;
import org.apache.lucene.queryParser.core.nodes.FieldQueryNode;
import org.apache.lucene.queryParser.core.nodes.FuzzyQueryNode;
import org.apache.lucene.queryParser.core.nodes.ModifierQueryNode;
import org.apache.lucene.queryParser.core.nodes.GroupQueryNode;
import org.apache.lucene.queryParser.core.nodes.OpaqueQueryNode;
import org.apache.lucene.queryParser.core.nodes.OrQueryNode;
import org.apache.lucene.queryParser.core.nodes.ParametricQueryNode;
import org.apache.lucene.queryParser.core.nodes.ParametricRangeQueryNode;
import org.apache.lucene.queryParser.core.nodes.SlopQueryNode;
import org.apache.lucene.queryParser.core.nodes.ProximityQueryNode;
import org.apache.lucene.queryParser.core.nodes.QueryNode;
import org.apache.lucene.queryParser.core.nodes.QueryNodeImpl;
import org.apache.lucene.queryParser.core.nodes.QuotedFieldQueryNode;
import org.apache.lucene.queryParser.core.parser.SyntaxParser;

/**
 * Hand-written part of the parser class for the standard query syntax.
 * The remaining members (constructors taking a reader, {@code ReInit},
 * and one method per grammar production such as {@code TopLevelQuery})
 * are expected to be generated by JavaCC from the productions in this file.
 */
public class StandardSyntaxParser implements SyntaxParser {

  /** Returned by {@code Conjunction()} when no conjunction keyword is present. */
  private static final int CONJ_NONE = 0;
  /**
   * Returned by {@code Conjunction()} for AND. Must differ from
   * {@link #CONJ_OR}; both were previously 2, which made the two
   * conjunctions indistinguishable to a caller.
   */
  private static final int CONJ_AND = 1;
  /** Returned by {@code Conjunction()} for OR. */
  private static final int CONJ_OR = 2;

  /**
   * Creates a parser over an empty input. The real input is supplied on
   * each call to {@link #parse(CharSequence, CharSequence)}, which
   * re-initializes the parser.
   */
  public StandardSyntaxParser() {
    this(new StringReader(""));
  }

  /**
   * Parses a query string, returning a
   * {@link org.apache.lucene.queryParser.core.nodes.QueryNode}.
   *
   * @param query the query string to be parsed
   * @param field the field passed down to the grammar productions; it is
   *              used for terms that carry no explicit {@code field:} prefix
   * @return the root node produced by {@code TopLevelQuery}
   * @throws QueryNodeParseException if the parsing fails; the offending
   *         query is attached to the exception before it is thrown
   */
  public QueryNode parse(CharSequence query, CharSequence field) throws QueryNodeParseException {
    ReInit(new StringReader(query.toString()));
    try {
      // TopLevelQuery is a Query followed by the end-of-input (EOF)
      return TopLevelQuery(field);
    } catch (ParseException tme) {
      // Attach the query text so the caller can report what failed to parse.
      tme.setQuery(query);
      throw tme;
    } catch (Error tme) {
      // Errors raised while parsing (presumably lexer errors from the
      // generated token manager -- TODO confirm) are converted into the
      // declared exception type, keeping the original as the cause.
      Message message = new MessageImpl(QueryParserMessages.INVALID_SYNTAX_CANNOT_PARSE, query, tme.getMessage());
      QueryNodeParseException e = new QueryNodeParseException(tme);
      e.setQuery(query);
      e.setNonLocalizedMessage(message);
      throw e;
    }
  }

}

PARSER_END(StandardSyntaxParser)

/* ***************** */
/* Token Definitions */
/* ***************** */

// Private ("#"-prefixed) regular-expression fragments, declared for every
// lexical state. They never produce tokens themselves; they are only
// referenced from the token definitions below.
<*> TOKEN : {
// a single ASCII decimal digit
  <#_NUM_CHAR:   ["0"-"9"] >
// every character that follows a backslash is considered as an escaped character
| <#_ESCAPED_CHAR: "\\" ~[] >
// first character of a term: anything except whitespace (including U+3000)
// and the listed syntax characters, or any backslash-escaped character
| <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^",
                           "[", "]", "\"", "{", "}", "~", "\\" ]
                       | <_ESCAPED_CHAR> ) >
// subsequent term characters: as above, but "-" and "+" are additionally allowed
| <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" ) >
// space, tab, newline, carriage return, or U+3000
| <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") >
// character inside a quoted phrase: anything except an unescaped quote or
// backslash, or any backslash-escaped character
| <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) >
}

// Whitespace is discarded in the DEFAULT, RangeIn and RangeEx lexical
// states. The Boost state is not listed here.
<DEFAULT, RangeIn, RangeEx> SKIP : {
  < <_WHITESPACE>>
}

// Tokens recognized in the DEFAULT lexical state: boolean operators,
// modifiers, grouping, the field separator, and the term forms. Three of
// them switch the lexer to another state (noted below).
<DEFAULT> TOKEN : {
  <AND:       ("AND" | "&&") >
| <OR:        ("OR" | "||") >
| <NOT:       ("NOT" | "!") >
| <PLUS:      "+" >
| <MINUS:     "-" >
| <LPAREN:    "(" >
| <RPAREN:    ")" >
| <COLON:     ":" >
// after "^" the lexer enters the Boost state, where only NUMBER is defined
| <CARAT:     "^" > : Boost
// a double-quoted phrase; may be empty
| <QUOTED:     "\"" (<_QUOTED_CHAR>)* "\"">
| <TERM:      <_TERM_START_CHAR> (<_TERM_CHAR>)*  >
// "~" optionally followed by digits with an optional decimal fraction
| <FUZZY_SLOP:     "~" ( (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? )? >
// "[" opens an inclusive range and switches to the RangeIn state
| <RANGEIN_START: "[" > : RangeIn
// "{" opens an exclusive range and switches to the RangeEx state
| <RANGEEX_START: "{" > : RangeEx
}

// Boost state (entered after CARAT): the only token is a number made of
// digits with an optional decimal fraction. Matching it returns the lexer
// to the DEFAULT state.
<Boost> TOKEN : {
<NUMBER:    (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT
}

// RangeIn state (entered after RANGEIN_START, "["): tokens that make up an
// inclusive range expression.
<RangeIn> TOKEN : {
// the literal keyword separating the two bounds
<RANGEIN_TO: "TO">
// "]" closes the range and returns the lexer to the DEFAULT state
| <RANGEIN_END: "]"> : DEFAULT
// quoted bound: a quote, one or more non-quote characters or \" sequences, a quote
| <RANGEIN_QUOTED: "\"" (~["\""] | "\\\"")+ "\"">
// unquoted bound: one or more characters other than a space or "]"
| <RANGEIN_GOOP: (~[ " ", "]" ])+ >
}

// RangeEx state (entered after RANGEEX_START, "{"): tokens that make up an
// exclusive range expression.
<RangeEx> TOKEN : {
// the literal keyword separating the two bounds
<RANGEEX_TO: "TO">
// "}" closes the range and returns the lexer to the DEFAULT state
| <RANGEEX_END: "}"> : DEFAULT
// quoted bound: a quote, one or more non-quote characters or \" sequences, a quote
| <RANGEEX_QUOTED: "\"" (~["\""] | "\\\"")+ "\"">
// unquoted bound: one or more characters other than a space or "}"
| <RANGEEX_GOOP: (~[ " ", "}" ])+ >
}

// *   Query  ::= ( Clause )*
// *   Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )

// Consumes an optional AND / OR keyword and reports which one was seen:
// CONJ_AND, CONJ_OR, or CONJ_NONE when neither is present.
int Conjunction() : {
  int conj = CONJ_NONE;
}
{
  (
      <AND> { conj = CONJ_AND; }
    | <OR>  { conj = CONJ_OR; }
  )?
  { return conj; }
}

// Consumes an optional clause modifier. PLUS maps to MOD_REQ; MINUS and NOT
// both map to MOD_NOT; with no modifier token the result is MOD_NONE.
ModifierQueryNode.Modifier Modifiers() : {
  ModifierQueryNode.Modifier modifier = ModifierQueryNode.Modifier.MOD_NONE;
}
{
  (
      <PLUS>  { modifier = ModifierQueryNode.Modifier.MOD_REQ; }
    | <MINUS> { modifier = ModifierQueryNode.Modifier.MOD_NOT; }
    | <NOT>   { modifier = ModifierQueryNode.Modifier.MOD_NOT; }
  )?
  { return modifier; }
}

// Entry production: a complete Query that must be followed by end of input,
// so trailing garbage after the query string is rejected.
QueryNode TopLevelQuery(CharSequence field) :
{
  QueryNode root;
}
{
  root=Query(field) <EOF>
  { return root; }
}

// These changes were made to introduce operator precedence:
// - Clause() now returns a QueryNode. 
// - The modifiers are consumed by Clause() and returned as part of the QueryNode Object
// - Query does not consume conjunctions (AND, OR) anymore. 
// - This is now done by two new non-terminals: ConjQuery and DisjQuery
// The parse tree looks similar to this:
//       Query ::= DisjQuery ( DisjQuery )*
//   DisjQuery ::= ConjQuery ( OR ConjQuery )* 
//   ConjQuery ::= Clause ( AND Clause )*
//      Clause ::= [ Modifier ] ... 


// Query ::= DisjQuery ( DisjQuery )*
// A single DisjQuery is returned as-is; two or more adjacent ones (no
// explicit operator between them) are wrapped in a BooleanQueryNode.
QueryNode Query(CharSequence field) :
{
  QueryNode head = null;
  QueryNode next;
  Vector<QueryNode> collected = null;
}
{
  head=DisjQuery(field)
  (
    next=DisjQuery(field)
    {
      // Allocate the list only once a second clause shows up, and seed it
      // with the first clause.
      if (collected == null) {
        collected = new Vector<QueryNode>();
        collected.add(head);
      }
      collected.add(next);
    }
  )*
  {
    if (collected == null) {
      return head;
    }
    return new BooleanQueryNode(collected);
  }
}

// DisjQuery ::= ConjQuery ( OR ConjQuery )*
// A lone ConjQuery is returned unchanged; operands joined by OR are
// collected into an OrQueryNode.
QueryNode DisjQuery(CharSequence field) : {
  Vector<QueryNode> operands = null;
  QueryNode head, next;
}
{
  head = ConjQuery(field)
  (
    <OR> next = ConjQuery(field)
    {
      // Lazily create the operand list, starting with the first operand.
      if (operands == null) {
        operands = new Vector<QueryNode>();
        operands.add(head);
      }
      operands.add(next);
    }
  )*
  {
    if (operands == null) {
      return head;
    }
    return new OrQueryNode(operands);
  }
}

// ConjQuery ::= ModClause ( AND ModClause )*
// A lone ModClause is returned unchanged; operands joined by AND are
// collected into an AndQueryNode.
QueryNode ConjQuery(CharSequence field) : {
  Vector<QueryNode> operands = null;
  QueryNode head, next;
}
{
  head = ModClause(field)
  (
    <AND> next = ModClause(field)
    {
      // Lazily create the operand list, starting with the first operand.
      if (operands == null) {
        operands = new Vector<QueryNode>();
        operands.add(head);
      }
      operands.add(next);
    }
  )*
  {
    if (operands == null) {
      return head;
    }
    return new AndQueryNode(operands);
  }
}

// QueryNode Query(CharSequence field) :
// {
// List clauses = new ArrayList();
//   List modifiers = new ArrayList();
//   QueryNode q, firstQuery=null;
//   ModifierQueryNode.Modifier mods;
//   int conj;
// }
// {
//   mods=Modifiers() q=Clause(field)
//   {
//     if (mods == ModifierQueryNode.Modifier.MOD_NONE) firstQuery=q;
//     
//     // do not create modifier nodes with MOD_NONE
//    	if (mods != ModifierQueryNode.Modifier.MOD_NONE) {
//    		q = new ModifierQueryNode(q, mods);
//    	}
//    	clauses.add(q);
//   }
//   (
//     conj=Conjunction() mods=Modifiers() q=Clause(field)
//     { 
// 	    // do not create modifier nodes with MOD_NONE
// 	   	if (mods != ModifierQueryNode.Modifier.MOD_NONE) {
// 	   		q = new ModifierQueryNode(q, mods);
// 	   	}
// 	   	clauses.add(q);
// 	   	//TODO: figure out what to do with AND and ORs
//   }
//   )*
//     {
//      if (clauses.size() == 1 && firstQuery != null)
//         return firstQuery;
//       else {
//   		return new BooleanQueryNode(clauses);
//       }
//     }
// }

// ModClause ::= Modifiers Clause
// Wraps the clause in a ModifierQueryNode only when a modifier was actually
// present; a MOD_NONE clause is returned bare.
QueryNode ModClause(CharSequence field) : {
  ModifierQueryNode.Modifier modifier;
  QueryNode clause;
}
{
  modifier=Modifiers() clause=Clause(field)
  {
    if (modifier == ModifierQueryNode.Modifier.MOD_NONE) {
      return clause;
    }
    return new ModifierQueryNode(clause, modifier);
  }
}

// Clause ::= [ <TERM> ":" ] ( Term | "(" Query ")" [ "^" <NUMBER> ] )
//
// Parses one clause. An optional "field:" prefix replaces the field passed
// in by the caller for everything inside this clause. A parenthesized
// sub-query may carry its own boost and is wrapped in a GroupQueryNode.
//
// The token references <COLON>, <RPAREN>, <CARAT> and <NUMBER> are required
// here: without them the field prefix has no separator, the group is never
// closed, and "boost=" has nothing to bind to.
QueryNode Clause(CharSequence field) : {
  QueryNode q;
  Token fieldToken=null, boost=null;
  boolean group = false;
}
{
  [
    // Two tokens of lookahead are needed to tell "field:" (TERM COLON)
    // apart from a plain TERM that starts a Term().
    LOOKAHEAD(2)
    (
      fieldToken=<TERM> <COLON> { field=EscapeQuerySyntaxImpl.discardEscapeChar(fieldToken.image); }
    )
  ]

  (
      q=Term(field)
    | <LPAREN> q=Query(field) <RPAREN> ( <CARAT> boost=<NUMBER> )? { group=true; }
  )
  {
    if (boost != null) {
      float f = (float)1.0;
      try {
        f = Float.valueOf(boost.image).floatValue();
        // avoid boosting null queries, such as those caused by stop words
        if (q != null) {
          q = new BoostQueryNode(q, f);
        }
      } catch (Exception ignored) {
        /* Should this be handled somehow? (defaults to "no boost", if
         * boost number is invalid)
         */
      }
    }
    // The group wrapper goes outside any boost applied above.
    if (group) { q = new GroupQueryNode(q); }
    return q;
  }
}


// Parses a single term-level expression for the given field and returns the
// corresponding node. Four forms are accepted:
//   - a plain term (or number), optionally followed by a fuzzy suffix
//     ("~" / "~0.7") and/or a boost ("^2");
//   - an inclusive range  "[" lower [TO] upper "]"  with optional boost;
//   - an exclusive range  "{" lower [TO] upper "}"  with optional boost;
//   - a quoted phrase, optionally followed by a slop ("~3") and/or a boost.
//
// Throws ParseException when a fuzzy similarity lies outside [0.0, 1.0].
//
// Every "x=<TOKEN>" binding below needs its token reference; a bare
// "boost=", "fuzzySlop=", "goop1=" or "goop2=" is not valid grammar.
QueryNode Term(CharSequence field) : {
  Token term, boost=null, fuzzySlop=null, goop1, goop2;
  boolean fuzzy = false;
  QueryNode q = null;
  ParametricQueryNode qLower, qUpper;
  float defaultMinSimilarity = 0.5f;
}
{
  (
     (
         term=<TERM> { q = new FieldQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), term.beginColumn, term.endColumn); }
         // NOTE(review): a NUMBER term leaves q null unless a fuzzy suffix
         // follows. NUMBER is only defined in the Boost lexical state, so
         // confirm whether this alternative is reachable.
       | term=<NUMBER>
     )
     [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ]
     // The fuzzy suffix may come before or after the boost.
     [ <CARAT> boost=<NUMBER> [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ] ]
     {
       if (fuzzy) {
         float fms = defaultMinSimilarity;
         try {
           // Drop the leading "~"; a bare "~" yields an empty string, which
           // fails to parse and leaves the default similarity in place.
           fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue();
         } catch (Exception ignored) { }
         if (fms < 0.0f || fms > 1.0f) {
           throw new ParseException(new MessageImpl(QueryParserMessages.INVALID_SYNTAX_FUZZY_LIMITS));
         }
         q = new FuzzyQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), fms, term.beginColumn, term.endColumn);
       }
     }
   | ( <RANGEIN_START> ( goop1=<RANGEIN_GOOP> | goop1=<RANGEIN_QUOTED> )
       [ <RANGEIN_TO> ] ( goop2=<RANGEIN_GOOP> | goop2=<RANGEIN_QUOTED> )
       <RANGEIN_END> )
     [ <CARAT> boost=<NUMBER> ]
     {
       // Strip the surrounding quotes from quoted bounds.
       if (goop1.kind == RANGEIN_QUOTED) {
         goop1.image = goop1.image.substring(1, goop1.image.length()-1);
       }
       if (goop2.kind == RANGEIN_QUOTED) {
         goop2.image = goop2.image.substring(1, goop2.image.length()-1);
       }

       // Inclusive range: lower bound is >=, upper bound is <=.
       qLower = new ParametricQueryNode(field, ParametricQueryNode.CompareOperator.GE,
                                        EscapeQuerySyntaxImpl.discardEscapeChar(goop1.image), goop1.beginColumn, goop1.endColumn);
       qUpper = new ParametricQueryNode(field, ParametricQueryNode.CompareOperator.LE,
                                        EscapeQuerySyntaxImpl.discardEscapeChar(goop2.image), goop2.beginColumn, goop2.endColumn);
       q = new ParametricRangeQueryNode(qLower, qUpper);
     }
   | ( <RANGEEX_START> ( goop1=<RANGEEX_GOOP> | goop1=<RANGEEX_QUOTED> )
       [ <RANGEEX_TO> ] ( goop2=<RANGEEX_GOOP> | goop2=<RANGEEX_QUOTED> )
       <RANGEEX_END> )
     [ <CARAT> boost=<NUMBER> ]
     {
       // Strip the surrounding quotes from quoted bounds.
       if (goop1.kind == RANGEEX_QUOTED) {
         goop1.image = goop1.image.substring(1, goop1.image.length()-1);
       }
       if (goop2.kind == RANGEEX_QUOTED) {
         goop2.image = goop2.image.substring(1, goop2.image.length()-1);
       }

       // Exclusive range: lower bound is >, upper bound is <.
       qLower = new ParametricQueryNode(field, ParametricQueryNode.CompareOperator.GT,
                                        EscapeQuerySyntaxImpl.discardEscapeChar(goop1.image), goop1.beginColumn, goop1.endColumn);
       qUpper = new ParametricQueryNode(field, ParametricQueryNode.CompareOperator.LT,
                                        EscapeQuerySyntaxImpl.discardEscapeChar(goop2.image), goop2.beginColumn, goop2.endColumn);
       q = new ParametricRangeQueryNode(qLower, qUpper);
     }
   | term=<QUOTED> { q = new QuotedFieldQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image.substring(1, term.image.length()-1)), term.beginColumn + 1, term.endColumn - 1); }
     [ fuzzySlop=<FUZZY_SLOP> ]
     [ <CARAT> boost=<NUMBER> ]
     {
       int phraseSlop = 0;

       if (fuzzySlop != null) {
         try {
           // For a phrase the "~n" suffix is a slop, truncated to an int.
           phraseSlop = Float.valueOf(fuzzySlop.image.substring(1)).intValue();
           q = new SlopQueryNode(q, phraseSlop);
         }
         catch (Exception ignored) {
           /* Should this be handled somehow? (defaults to "no PhraseSlop", if
            * slop number is invalid)
            */
         }
       }
     }
  )
  {
    // A boost, whichever alternative captured it, is applied last.
    if (boost != null) {
      float f = (float)1.0;
      try {
        f = Float.valueOf(boost.image).floatValue();
        // avoid boosting null queries, such as those caused by stop words
        if (q != null) {
          q = new BoostQueryNode(q, f);
        }
      } catch (Exception ignored) {
        /* Should this be handled somehow? (defaults to "no boost", if
         * boost number is invalid)
         */
      }
    }
    return q;
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene StandardSyntaxParser.jj source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.