alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (BreakIteratorWrapper.java)

This example Lucene source code file (BreakIteratorWrapper.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

biwrapper, biwrapper, breakiterator, breakiteratorwrapper, breakiteratorwrapper, chararrayiterator, dictionarybasedbreakiterator, override, override, rbbiwrapper, rbbiwrapper, rulebasedbreakiterator, text

The Lucene BreakIteratorWrapper.java source code

package org.apache.lucene.analysis.icu.segmentation;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.text.CharacterIterator;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.DictionaryBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;

/**
 * Contain all the issues surrounding BreakIterators in ICU in one place.
 * Basically this boils down to the fact that they aren't very friendly to any
 * sort of OO design.
 * <p>
 * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
 * BreakIterator from RuleBasedBreakIterator
 * <p>
 * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
 * doesn't actually behave as a subclass: it always returns 0 for
 * getRuleStatus(): 
 * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
 * tags
 * @lucene.experimental
 */
abstract class BreakIteratorWrapper {
  protected final CharArrayIterator textIterator = new CharArrayIterator();
  protected char text[];
  protected int start;
  protected int length;

  abstract int next();
  abstract int current();
  abstract int getRuleStatus();
  abstract void setText(CharacterIterator text);

  void setText(char text[], int start, int length) {
    this.text = text;
    this.start = start;
    this.length = length;
    textIterator.setText(text, start, length);
    setText(textIterator);
  }

  /**
   * If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
   * treat it like a generic BreakIterator If its any other
   * RuleBasedBreakIterator, the rule status can be used for token type. If its
   * any other BreakIterator, the rulestatus method is not available, so treat
   * it like a generic BreakIterator.
   */
  static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
    if (breakIterator instanceof RuleBasedBreakIterator
        && !(breakIterator instanceof DictionaryBasedBreakIterator))
      return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
    else
      return new BIWrapper(breakIterator);
  }

  /**
   * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as its not
   * a DictionaryBasedBreakIterator) behaves correctly.
   */
  static final class RBBIWrapper extends BreakIteratorWrapper {
    private final RuleBasedBreakIterator rbbi;

    RBBIWrapper(RuleBasedBreakIterator rbbi) {
      this.rbbi = rbbi;
    }

    @Override
    int current() {
      return rbbi.current();
    }

    @Override
    int getRuleStatus() {
      return rbbi.getRuleStatus();
    }

    @Override
    int next() {
      return rbbi.next();
    }

    @Override
    void setText(CharacterIterator text) {
      rbbi.setText(text);
    }
  }

  /**
   * Generic BreakIterator wrapper: Either the rulestatus method is not
   * available or always returns 0. Calculate a rulestatus here so it behaves
   * like RuleBasedBreakIterator.
   * 
   * Note: This is slower than RuleBasedBreakIterator.
   */
  static final class BIWrapper extends BreakIteratorWrapper {
    private final BreakIterator bi;
    private int status;

    BIWrapper(BreakIterator bi) {
      this.bi = bi;
    }

    @Override
    int current() {
      return bi.current();
    }

    @Override
    int getRuleStatus() {
      return status;
    }

    @Override
    int next() {
      int current = bi.current();
      int next = bi.next();
      status = calcStatus(current, next);
      return next;
    }

    private int calcStatus(int current, int next) {
      if (current == BreakIterator.DONE || next == BreakIterator.DONE)
        return RuleBasedBreakIterator.WORD_NONE;

      int begin = start + current;
      int end = start + next;

      int codepoint;
      for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
        codepoint = UTF16.charAt(text, 0, end, begin);

        if (UCharacter.isDigit(codepoint))
          return RuleBasedBreakIterator.WORD_NUMBER;
        else if (UCharacter.isLetter(codepoint)) {
          // TODO: try to separately specify ideographic, kana? 
          // [currently all bundled as letter for this case]
          return RuleBasedBreakIterator.WORD_LETTER;
        }
      }

      return RuleBasedBreakIterator.WORD_NONE;
    }

    @Override
    void setText(CharacterIterator text) {
      bi.setText(text);
      status = RuleBasedBreakIterator.WORD_NONE;
    }
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene BreakIteratorWrapper.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.