alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (LaoBreakIterator.java)

This example Lucene source code file (LaoBreakIterator.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

backwards, breakiterator, characteriterator, chararrayiterator, chararrayiterator, laobreakiterator, object, override, override, rulebasedbreakiterator, rulebasedbreakiterator, text, unicodeset, unsupportedoperationexception, unsupportedoperationexception

The Lucene LaoBreakIterator.java source code

package org.apache.lucene.analysis.icu.segmentation;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.text.CharacterIterator;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UnicodeSet;

/**
 * Syllable iterator for Lao text.
 * <p>
 * This breaks Lao text into syllables according to:
 * <i>Syllabification of Lao Script for Line Breaking</i>
 * Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak, 
 * Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
 * <ul>
 *  <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
 *  <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
 * </ul>
 * <p>
 * Most work is accomplished with RBBI rules, however some additional special logic is needed
 * that cannot be coded in a grammar, and this is implemented here.
 * <p>
 * For example, what appears to be a final consonant might instead be part of the next syllable.
 * Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
 * <p>
 * Take for instance the text ກວ່າດອກ
 * The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
 * What LaoBreakIterator does, according to the paper:
 * <ol>
 *  <li>backtrack and remove the ດ from the last syllable, placing it on the current syllable.
 *  <li>verify the modified previous syllable (ກວ່າ ) is still legal.
 *  <li>verify the modified current syllable (ດອກ) is now legal.
 *  <li>If step 2 or step 3 fails, then restore the ດ to the last syllable and skip the current character.
 * </ol>
 * <p>
 * Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
 * This is the issue of combining marks being in the wrong order (typos).
 * @lucene.experimental
 */
public class LaoBreakIterator extends BreakIterator {
  /** Primary rule set; always iterates over {@link #working}. */
  RuleBasedBreakIterator rules;
  /** The full text handed to {@link #setText(CharacterIterator)}. */
  CharArrayIterator text;

  /** Window onto {@link #text} that {@link #rules} currently iterates. */
  CharArrayIterator working = new CharArrayIterator();
  /** Offset of the {@link #working} window relative to the start of {@link #text}. */
  int workingOffset = 0;

  /** Scratch window used only by {@link #verifyPushBack(int, int)}. */
  CharArrayIterator verifyText = new CharArrayIterator();
  /** Second copy of the rules, used to re-check candidate syllables. */
  RuleBasedBreakIterator verify;

  /** Frozen set of every character in the Lao script. */
  private static final UnicodeSet LAO_CHARS = buildLaoSet();

  /** Builds, compacts and freezes the Lao script set. */
  private static UnicodeSet buildLaoSet() {
    UnicodeSet lao = new UnicodeSet("[:Lao:]");
    lao.compact();
    lao.freeze();
    return lao;
  }

  /**
   * Creates an iterator driven by private copies of the supplied rules.
   *
   * @param rules syllable rules; cloned twice (main pass and verification pass),
   *              the argument itself is not retained
   */
  public LaoBreakIterator(RuleBasedBreakIterator rules) {
    this.rules = (RuleBasedBreakIterator) rules.clone();
    this.verify = (RuleBasedBreakIterator) rules.clone();
  }

  /**
   * Translates a boundary reported by {@link #rules} (relative to the working
   * window) into an offset relative to the full text, passing DONE through.
   */
  private int toTextOffset(int windowBoundary) {
    if (windowBoundary == BreakIterator.DONE) {
      return BreakIterator.DONE;
    }
    return workingOffset + windowBoundary;
  }

  /** Points the working window (and the rules) back at the whole text. */
  private void resetWorkingWindow() {
    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
    rules.setText(working);
    workingOffset = 0;
  }

  /** @return the current boundary, expressed relative to the full text */
  @Override
  public int current() {
    return toTextOffset(rules.current());
  }

  /**
   * Rewinds to the beginning of the text.
   *
   * @return the first boundary
   */
  @Override
  public int first() {
    resetWorkingWindow();
    return toTextOffset(rules.first());
  }

  /** Not supported. */
  @Override
  public int following(int offset) {
    throw new UnsupportedOperationException();
  }

  /** @return the iterator most recently passed to {@link #setText(CharacterIterator)} */
  @Override
  public CharacterIterator getText() {
    return text;
  }

  /** Not supported. */
  @Override
  public int last() {
    throw new UnsupportedOperationException();
  }

  /**
   * Advances to the next syllable boundary.
   * <p>
   * After the rules report a boundary, one further boundary is examined as a
   * lookahead. If that lookahead segment carries rule status 0, the character at
   * the boundary is Lao, and {@link #verifyPushBack(int, int)} agrees, the
   * boundary is moved back by one character and the working window is restarted
   * there. Otherwise the lookahead is undone and the original boundary returned.
   *
   * @return the next boundary relative to the full text, or DONE at the end
   */
  @Override
  public int next() {
    final int syllableStart = current();
    final int windowEnd = rules.next();
    if (windowEnd == BreakIterator.DONE) {
      return BreakIterator.DONE;
    }
    final int syllableEnd = windowEnd + workingOffset;

    // 'working' is the iterator 'rules' is set on, so this is read before the lookahead below.
    final char boundaryChar = working.current();

    // Lookahead: advance once more purely to inspect the status of what follows.
    final boolean hasFollowing = rules.next() != BreakIterator.DONE;
    if (!hasFollowing) {
      // Nothing to inspect; the rules are intentionally left where they are.
      return syllableEnd;
    }

    final boolean pushBack = rules.getRuleStatus() == 0
        && LAO_CHARS.contains(boundaryChar)
        && verifyPushBack(syllableStart, syllableEnd);
    if (pushBack) {
      // Give the last character of this syllable to the next one and restart the window there.
      workingOffset = syllableEnd - 1;
      working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
      return syllableEnd - 1;
    }

    rules.previous(); // undo the lookahead
    return syllableEnd;
  }

  /**
   * Advances {@code n} boundaries forward.
   *
   * @param n number of boundaries to advance; 0 returns the current boundary
   * @return the boundary reached
   * @throws UnsupportedOperationException if {@code n} is negative
   */
  @Override
  public int next(int n) {
    if (n < 0) {
      throw new UnsupportedOperationException("Backwards traversal is unsupported");
    }

    int boundary = current();
    for (int remaining = n; remaining > 0; remaining--) {
      boundary = next();
    }
    return boundary;
  }

  /** Not supported. */
  @Override
  public int previous() {
    throw new UnsupportedOperationException("Backwards traversal is unsupported");
  }

  /**
   * Sets the text to iterate. The backing array is reordered in place by
   * {@link #ccReorder(char[], int, int)} before iteration starts.
   *
   * @param text must be a {@code CharArrayIterator}
   * @throws UnsupportedOperationException for any other iterator type
   */
  @Override
  public void setText(CharacterIterator text) {
    if (!(text instanceof CharArrayIterator)) {
      throw new UnsupportedOperationException("unsupported CharacterIterator");
    }
    this.text = (CharArrayIterator) text;
    ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
    resetWorkingWindow();
  }

  /**
   * Sets the text to iterate from a string; its characters are copied into a
   * fresh {@code CharArrayIterator}.
   *
   * @param newText the text to segment
   */
  @Override
  public void setText(String newText) {
    CharArrayIterator wrapped = new CharArrayIterator();
    final char[] chars = newText.toCharArray();
    wrapped.setText(chars, 0, chars.length);
    setText(wrapped);
  }

  /**
   * Decides whether the last character of the syllable {@code [current, next)}
   * may be moved onto the following syllable.
   *
   * @param current start of the syllable, relative to the full text
   * @param next    end of the syllable, relative to the full text
   * @return true only if the syllable minus its last character is matched as one
   *         segment with non-zero rule status, and the text starting at that
   *         last character also begins with a segment of non-zero rule status
   */
  private boolean verifyPushBack(int current, int next) {
    final char[] buffer = text.getText();
    final int base = text.getStart();
    final int shortenedLength = next - current - 1;

    // First check: the shortened previous syllable must be consumed in one step.
    verifyText.setText(buffer, base + current, shortenedLength);
    verify.setText(verifyText);
    final boolean previousStillLegal =
        verify.next() == shortenedLength && verify.getRuleStatus() != 0;
    if (!previousStillLegal) {
      return false;
    }

    // Second check: whatever starts at the pushed-back character must match a rule.
    verifyText.setText(buffer, base + next - 1, text.getLength() - next + 1);
    verify.setText(verifyText);
    return verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0;
  }

  // TODO: only bubblesort around runs of combining marks, instead of the entire text.
  /**
   * Bubble-sorts adjacent characters in place so that a character with a
   * non-zero combining class never directly follows one with a higher class.
   * Characters with combining class 0 are never moved.
   *
   * @param text   buffer to reorder in place
   * @param start  index of the first character to consider
   * @param length number of characters to consider
   */
  private void ccReorder(char[] text, int start, int length) {
    final int end = start + length;
    boolean swapped;
    do {
      swapped = false;
      int previousClass = 0;
      for (int i = start; i < end; i++) {
        final char ch = text[i];
        final int combiningClass = UCharacter.getCombiningClass(ch);
        if (combiningClass <= 0 || combiningClass >= previousClass) {
          // Already in order relative to its predecessor; remember its class.
          previousClass = combiningClass;
          continue;
        }
        // Out of order: exchange with the predecessor. previousClass is left
        // untouched because the predecessor now sits at index i.
        text[i] = text[i - 1];
        text[i - 1] = ch;
        swapped = true;
      }
    } while (swapped);
  }

  /**
   * Clone method.  Produces another LaoBreakIterator whose rule iterators and
   * non-null text windows are themselves clones of this instance's.
   * @return The clone.
   */
  @Override
  public Object clone() {
    LaoBreakIterator copy = (LaoBreakIterator) super.clone();
    copy.rules = (RuleBasedBreakIterator) rules.clone();
    copy.verify = (RuleBasedBreakIterator) verify.clone();
    if (text != null) {
      copy.text = (CharArrayIterator) text.clone();
    }
    if (working != null) {
      copy.working = (CharArrayIterator) working.clone();
    }
    if (verifyText != null) {
      copy.verifyText = (CharArrayIterator) verifyText.clone();
    }
    return copy;
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene LaoBreakIterator.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.