alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (ThaiWordFilter.java)

This example Lucene source code file (ThaiWordFilter.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

attributesource, breakiterator, breakiterator, chartermattribute, dbbi_available, dbbi_available, deprecated, io, ioexception, ioexception, offsetattribute, override, segment, text, thaiwordfilter, thaiwordfilter, util

The Lucene ThaiWordFilter.java source code

package org.apache.lucene.analysis.th;

/**
 * Copyright 2006 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Locale;
import java.lang.Character.UnicodeBlock;
import javax.swing.text.Segment;
import java.text.BreakIterator;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

/**
 * {@link TokenFilter} that use {@link java.text.BreakIterator} to break each 
 * Token that is Thai into separate Token(s) for each Thai word.
 * <p>Please note: Since matchVersion 3.1 on, this filter no longer lowercases non-thai text.
 * {@link ThaiAnalyzer} will insert a {@link LowerCaseFilter} before this filter
 * so the behaviour of the Analyzer does not change. With version 3.1, the filter handles
 * position increments correctly.
 * <p>WARNING: this filter may not be supported by all JREs.
 *    It is known to work with Sun/Oracle and Harmony JREs.
 *    If your application needs to be fully portable, consider using ICUTokenizer instead,
 *    which uses an ICU Thai BreakIterator that will always be available.
 */
public final class ThaiWordFilter extends TokenFilter {
  /** 
   * True if the JRE supports a working dictionary-based breakiterator for Thai.
   * If this is false, this filter will not work at all!
   */
  public static final boolean DBBI_AVAILABLE;
  private static final BreakIterator proto = BreakIterator.getWordInstance(new Locale("th"));
  static {
    // check that we have a working dictionary-based break iterator for thai
    proto.setText("ภาษาไทย");
    DBBI_AVAILABLE = proto.isBoundary(4);
  }
  private final BreakIterator breaker = (BreakIterator) proto.clone();
  private final Segment charIterator = new Segment();
  
  private final boolean handlePosIncr;
  
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
  
  private AttributeSource clonedToken = null;
  private CharTermAttribute clonedTermAtt = null;
  private OffsetAttribute clonedOffsetAtt = null;
  private boolean hasMoreTokensInClone = false;

  /** Creates a new ThaiWordFilter that also lowercases non-thai text.
   * @deprecated Use the ctor with {@code matchVersion} instead!
   */
  @Deprecated
  public ThaiWordFilter(TokenStream input) {
    this(Version.LUCENE_30, input);
  }
  
  /** Creates a new ThaiWordFilter with the specified match version. */
  public ThaiWordFilter(Version matchVersion, TokenStream input) {
    super(matchVersion.onOrAfter(Version.LUCENE_31) ?
      input : new LowerCaseFilter(matchVersion, input));
    if (!DBBI_AVAILABLE)
      throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
    handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_31);
  }
  
  @Override
  public boolean incrementToken() throws IOException {
    if (hasMoreTokensInClone) {
      int start = breaker.current();
      int end = breaker.next();
      if (end != BreakIterator.DONE) {
        clonedToken.copyTo(this);
        termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
        offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
        if (handlePosIncr) posAtt.setPositionIncrement(1);
        return true;
      }
      hasMoreTokensInClone = false;
    }

    if (!input.incrementToken()) {
      return false;
    }
    
    if (termAtt.length() == 0 || UnicodeBlock.of(termAtt.charAt(0)) != UnicodeBlock.THAI) {
      return true;
    }
    
    hasMoreTokensInClone = true;

    // we lazy init the cloned token, as in ctor not all attributes may be added
    if (clonedToken == null) {
      clonedToken = cloneAttributes();
      clonedTermAtt = clonedToken.getAttribute(CharTermAttribute.class);
      clonedOffsetAtt = clonedToken.getAttribute(OffsetAttribute.class);
    } else {
      this.copyTo(clonedToken);
    }
    
    // reinit CharacterIterator
    charIterator.array = clonedTermAtt.buffer();
    charIterator.offset = 0;
    charIterator.count = clonedTermAtt.length();
    breaker.setText(charIterator);
    int end = breaker.next();
    if (end != BreakIterator.DONE) {
      termAtt.setLength(end);
      offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
      // position increment keeps as it is for first token
      return true;
    }
    return false;
  }
  
  @Override
  public void reset() throws IOException {
    super.reset();
    hasMoreTokensInClone = false;
    clonedToken = null;
    clonedTermAtt = null;
    clonedOffsetAtt = null;
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene ThaiWordFilter.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.