alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (EscapeQuerySyntaxImpl.java)

This example Lucene source code file (EscapeQuerySyntaxImpl.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

a, charsequence, charsequence, inorder, locale, locale, messageimpl, parseexception, parseexception, sentence, string, string, stringbuilder, unescapedcharsequence, util

The Lucene EscapeQuerySyntaxImpl.java source code

package org.apache.lucene.queryParser.standard.parser;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Locale;

import org.apache.lucene.messages.MessageImpl;
import org.apache.lucene.queryParser.core.messages.QueryParserMessages;
import org.apache.lucene.queryParser.core.parser.EscapeQuerySyntax;
import org.apache.lucene.queryParser.core.util.UnescapedCharSequence;

/**
 * {@link EscapeQuerySyntax} implementation that prefixes characters and
 * keywords listed in this class with a backslash, and that can strip such
 * backslash escapes again via {@link #discardEscapeChar(CharSequence)}.
 */
public class EscapeQuerySyntaxImpl implements EscapeQuerySyntax {

  /** Characters handed to {@code UnescapedCharSequence.toStringEscaped}. */
  private static final char[] wildcardChars = { '*', '?' };

  /** Characters that are escaped only when they are the first character. */
  private static final String[] escapableTermExtraFirstChars = { "+", "-", "@" };

  /** Characters that are escaped anywhere inside a term. */
  private static final String[] escapableTermChars = { "\"", "<", ">", "=",
      "!", "(", ")", "^", "[", "{", ":", "]", "}", "~" };

  // TODO: check what to do with these "*", "?", "\\"
  /** Characters that are escaped inside a quoted string. */
  private static final String[] escapableQuotedChars = { "\"" };

  /** White space characters that are escaped inside a term. */
  private static final String[] escapableWhiteChars = { " ", "\t", "\n", "\r",
      "\f", "\b", "\u3000" };

  /** Words that are escaped when a term consists of exactly that word. */
  private static final String[] escapableWordTokens = { "AND", "OR", "NOT",
      "TO", "WITHIN", "SENTENCE", "PARAGRAPH", "INORDER" };

  /**
   * Escapes every occurrence of the {@link #escapableTermChars} in the given
   * text and, in addition, a leading character that is one of the
   * {@link #escapableTermExtraFirstChars}.
   * 
   * @param str
   *          text to escape; <code>null</code> and empty input are returned
   *          as is
   * @param locale
   *          locale used for the case-insensitive search
   * @return the escaped text
   */
  private static final CharSequence escapeChar(CharSequence str, Locale locale) {
    if (str == null || str.length() == 0)
      return str;

    CharSequence buffer = str;

    // regular escapable chars for terms; the constants contain no cased
    // characters, so they already are their own lower-case form
    for (int i = 0; i < escapableTermChars.length; i++) {
      buffer = replaceIgnoreCase(buffer, escapableTermChars[i], "\\", locale);
    }

    // the first character of a term has some more escapable chars
    char firstChar = buffer.charAt(0);
    for (int i = 0; i < escapableTermExtraFirstChars.length; i++) {
      if (firstChar == escapableTermExtraFirstChars[i].charAt(0)) {
        return "\\" + buffer;
      }
    }

    return buffer;
  }

  /**
   * Escapes every occurrence of the {@link #escapableQuotedChars} in the
   * given text.
   * 
   * @param str
   *          text to escape; <code>null</code> and empty input are returned
   *          as is
   * @param locale
   *          locale used for the case-insensitive search
   * @return the escaped text
   */
  private static final CharSequence escapeQuoted(CharSequence str,
      Locale locale) {
    if (str == null || str.length() == 0)
      return str;

    CharSequence buffer = str;

    // iterate over (and index into) the quoted-chars table; indexing
    // escapableTermChars here only worked while both tables started with
    // the same entry
    for (int i = 0; i < escapableQuotedChars.length; i++) {
      buffer = replaceIgnoreCase(buffer, escapableQuotedChars[i], "\\", locale);
    }
    return buffer;
  }

  /**
   * Escapes a term: first the single characters (see
   * {@link #escapeChar(CharSequence, Locale)} and
   * {@link #escapeWhiteChar(CharSequence, Locale)}), then the whole term if
   * it equals one of the {@link #escapableWordTokens}, ignoring case.
   * 
   * @param term
   *          term to escape; <code>null</code> is returned as is
   * @param locale
   *          locale used for the case-insensitive search
   * @return the escaped term
   */
  private static final CharSequence escapeTerm(CharSequence term, Locale locale) {
    if (term == null)
      return term;

    // Escape single Chars
    term = escapeChar(term, locale);
    term = escapeWhiteChar(term, locale);

    // Escape Parser Words
    String termText = term.toString();
    for (int i = 0; i < escapableWordTokens.length; i++) {
      if (escapableWordTokens[i].equalsIgnoreCase(termText))
        return "\\" + term;
    }
    return term;
  }

  /**
   * Prefixes every case-insensitive occurrence of <code>sequence1</code> in
   * <code>string</code> with <code>escapeChar</code>. The matched text itself
   * is copied from <code>string</code> unchanged.
   * 
   * @param string
   *          string to get replaced
   * @param sequence1
   *          the old character sequence in lowercase
   * @param escapeChar
   *          the new character to prefix sequence1 in return string.
   * @param locale
   *          locale used to lower-case <code>string</code> for the search
   * @return <code>string</code> itself if nothing matched, otherwise the new
   *         String
   * @throws NullPointerException
   *           if <code>string</code>, <code>sequence1</code> or
   *           <code>escapeChar</code> is <code>null</code>
   */
  private static CharSequence replaceIgnoreCase(CharSequence string,
      CharSequence sequence1, CharSequence escapeChar, Locale locale) {
    if (escapeChar == null || sequence1 == null || string == null)
      throw new NullPointerException();

    int count = string.length();
    int sequence1Length = sequence1.length();

    // empty search sequence: put the escape before, between and after all
    // characters
    if (sequence1Length == 0) {
      StringBuilder result = new StringBuilder(count + (count + 1)
          * escapeChar.length());
      result.append(escapeChar);
      for (int i = 0; i < count; i++) {
        result.append(string.charAt(i));
        result.append(escapeChar);
      }
      return result.toString();
    }

    // normal case: convert and lower-case once instead of once per step
    final String source = string.toString();
    final String needle = sequence1.toString();
    final String lowered = source.toLowerCase(locale);

    // Lower-casing may change the length of the text (e.g. U+0130 becomes two
    // chars outside Turkic locales). Offsets found in the lower-cased copy
    // are then not valid in the original, so search the original directly.
    final boolean offsetsAligned = lowered.length() == source.length();

    StringBuilder result = new StringBuilder(source.length() + 16);
    int copyStart = 0;
    while (copyStart < source.length()) {
      int matchAt = offsetsAligned ? lowered.indexOf(needle, copyStart)
          : indexOfIgnoreCase(source, needle, copyStart);
      if (matchAt == -1)
        break;
      result.append(source, copyStart, matchAt);
      result.append(escapeChar);
      result.append(source, matchAt, matchAt + sequence1Length);
      copyStart = matchAt + sequence1Length;
    }

    // copyStart only moves when something matched
    if (copyStart == 0)
      return string;
    result.append(source, copyStart, source.length());
    return result.toString();
  }

  /**
   * Returns the first index at or after <code>from</code> where
   * <code>needle</code> occurs in <code>source</code> ignoring case
   * (locale-independent, char by char), or -1 if there is none.
   */
  private static int indexOfIgnoreCase(String source, String needle, int from) {
    int needleLength = needle.length();
    int last = source.length() - needleLength;
    for (int i = from; i <= last; i++) {
      if (source.regionMatches(true, i, needle, 0, needleLength))
        return i;
    }
    return -1;
  }

  /**
   * Escapes every occurrence of the {@link #escapableWhiteChars} in the given
   * text.
   * 
   * @param str
   *          string to get replaced; <code>null</code> and empty input are
   *          returned as is
   * @param locale
   *          locale to be used when performing string compares
   * @return the new String
   */
  private static final CharSequence escapeWhiteChar(CharSequence str,
      Locale locale) {
    if (str == null || str.length() == 0)
      return str;

    CharSequence buffer = str;

    for (int i = 0; i < escapableWhiteChars.length; i++) {
      buffer = replaceIgnoreCase(buffer, escapableWhiteChars[i], "\\", locale);
    }
    return buffer;
  }

  /**
   * Escapes the given text, either as a quoted string (for
   * <code>Type.STRING</code>) or as a term (for any other type).
   * 
   * @param text
   *          text to escape; <code>null</code> and empty input are returned
   *          as is
   * @param locale
   *          locale used for the case-insensitive searches
   * @param type
   *          selects quoted-string or term escaping
   * @return the escaped text
   */
  public CharSequence escape(CharSequence text, Locale locale, Type type) {
    if (text == null || text.length() == 0)
      return text;

    // escape wildcards and the escape char (this has to be performed before
    // anything else) since we need to preserve the UnescapedCharSequence and
    // escape the original escape chars
    if (text instanceof UnescapedCharSequence) {
      text = ((UnescapedCharSequence) text).toStringEscaped(wildcardChars);
    } else {
      text = new UnescapedCharSequence(text).toStringEscaped(wildcardChars);
    }

    if (type == Type.STRING) {
      return escapeQuoted(text, locale);
    } else {
      return escapeTerm(text, locale);
    }
  }

  /**
   * Returns a String where the escape char has been removed, or kept only once
   * if there was a double escape.
   * 
   * Supports escaped unicode characters, e. g. translates
   * <code>\\u0041</code> to <code>A</code>.
   * 
   * @param input
   *          the escaped text
   * @return the unescaped characters, together with a flag per character
   *         telling whether it was preceded by an escape char (not set for
   *         characters produced from a unicode escape)
   * @throws ParseException
   *           if the input ends with a single escape char, ends in the middle
   *           of a unicode escape, or a unicode escape contains a character
   *           that is not a hexadecimal digit
   */
  public static UnescapedCharSequence discardEscapeChar(CharSequence input)
      throws ParseException {
    // Create char array to hold unescaped char sequence
    char[] output = new char[input.length()];
    boolean[] wasEscaped = new boolean[input.length()];

    // The length of the output can be less than the input
    // due to discarded escape chars. This variable holds
    // the actual length of the output
    int length = 0;

    // We remember whether the last processed character was
    // an escape character
    boolean lastCharWasEscapeChar = false;

    // The multiplier the current unicode digit must be multiplied with.
    // E. g. the first digit must be multiplied with 16^3, the second with
    // 16^2... A value greater than zero means we are inside a unicode escape.
    int codePointMultiplier = 0;

    // Used to calculate the codepoint of the escaped unicode character
    int codePoint = 0;

    for (int i = 0; i < input.length(); i++) {
      char curChar = input.charAt(i);
      if (codePointMultiplier > 0) {
        // inside a unicode escape: accumulate the next hex digit
        codePoint += hexToInt(curChar) * codePointMultiplier;
        codePointMultiplier >>>= 4;
        if (codePointMultiplier == 0) {
          // fourth digit consumed: emit the decoded character
          output[length++] = (char) codePoint;
          codePoint = 0;
        }
      } else if (lastCharWasEscapeChar) {
        if (curChar == 'u') {
          // found an escaped unicode character
          codePointMultiplier = 16 * 16 * 16;
        } else {
          // this character was escaped
          output[length] = curChar;
          wasEscaped[length] = true;
          length++;
        }
        lastCharWasEscapeChar = false;
      } else {
        if (curChar == '\\') {
          lastCharWasEscapeChar = true;
        } else {
          output[length] = curChar;
          length++;
        }
      }
    }

    // input ended before all four hex digits were read
    if (codePointMultiplier > 0) {
      throw new ParseException(new MessageImpl(
          QueryParserMessages.INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION));
    }

    // input ended with a dangling escape char
    if (lastCharWasEscapeChar) {
      throw new ParseException(new MessageImpl(
          QueryParserMessages.INVALID_SYNTAX_ESCAPE_CHARACTER));
    }

    return new UnescapedCharSequence(output, wasEscaped, 0, length);
  }

  /**
   * Returns the numeric value of the hexadecimal character
   * 
   * @throws ParseException
   *           if <code>c</code> is not one of 0-9, a-f or A-F
   */
  private static final int hexToInt(char c) throws ParseException {
    if ('0' <= c && c <= '9') {
      return c - '0';
    } else if ('a' <= c && c <= 'f') {
      return c - 'a' + 10;
    } else if ('A' <= c && c <= 'F') {
      return c - 'A' + 10;
    } else {
      throw new ParseException(new MessageImpl(
          QueryParserMessages.INVALID_SYNTAX_ESCAPE_NONE_HEX_UNICODE, c));
    }
  }

}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene EscapeQuerySyntaxImpl.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.