Jazzy example source code file (DocumentWordTokenizer.java)

This example Jazzy source code file (DocumentWordTokenizer.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Jazzy tags/keywords

attributeset, badlocationexception, badlocationexception, breakiterator, document, documentwordtokenizer, documentwordtokenizer, segment, segment, string, string, styleddocument, text, wordtokenizer

The Jazzy DocumentWordTokenizer.java source code

/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/
package com.swabunga.spell.event;


import javax.swing.text.BadLocationException;
import javax.swing.text.Document;
import javax.swing.text.Segment;
import javax.swing.text.StyledDocument;
import javax.swing.text.AttributeSet;
import java.text.BreakIterator;


/**
 * Tokenizes the text of a Swing {@link Document} into words and allows the
 * document to be modified in place when a correction is applied.
 *
 * <p>A word is a run of characters for which
 * {@link Character#isLetterOrDigit(char)} is true; an apostrophe or hyphen
 * that is directly followed by a letter or digit is treated as part of the
 * word. All positions exposed by this class are used directly as document
 * offsets (they are passed to {@code Document.getText}, {@code remove} and
 * {@code insertString}).
 *
 * <p>The tokenizer works on a snapshot of the document text taken in the
 * constructor and refreshed by {@link #replaceWord(String)}.
 *
 * @author Jason Height (jheight@chariot.net.au)
 */
public class DocumentWordTokenizer implements WordTokenizer {
  /** Holds the start character position of the current word. */
  private int currentWordPos = 0;
  /** Holds the end character position (exclusive) of the current word. */
  private int currentWordEnd = 0;
  /** Holds the start character position of the next word, or -1 if there is none. */
  private int nextWordPos = -1;
  /** The document whose text is being tokenized. */
  private final Document document;
  /** The character iterator over the document text; always kept at offset 0. */
  private final Segment text;
  /** The cumulative number of words that have been processed. */
  private int wordCount = 0;
  /** Flag indicating if there are any more tokens (words) left. */
  private boolean moreTokens = true;
  /**
   * True when currentWordPos, currentWordEnd and nextWordPos have already
   * been calculated for the word that the next call to {@link #nextWord()}
   * should return (see nextWord).
   */
  private boolean first = true;
  /** Sentence boundary iterator over the same text segment. */
  private final BreakIterator sentenceIterator;
  /** Sentence-start flag computed by the most recent call to nextWord. */
  private boolean startsSentence = true;

  /**
   * Creates a tokenizer over the whole text of the given document and
   * positions it on the first word. If the document contains no word, or its
   * text cannot be read, {@link #hasMoreWords()} returns false.
   *
   * @param document the document to tokenize
   */
  public DocumentWordTokenizer(Document document) {
    this.document = document;
    // Create a text segment over the entire document
    text = new Segment();
    sentenceIterator = BreakIterator.getSentenceInstance();
    try {
      loadText();
      sentenceIterator.setText(text);
      currentWordPos = getNextWordStart(text, text.getBeginIndex());
      // If the current word pos is -1 then the text contained no word at all
      if (currentWordPos != -1) {
        currentWordEnd = getNextWordEnd(text, currentWordPos);
        nextWordPos = getNextWordStart(text, currentWordEnd);
      } else {
        moreTokens = false;
      }
    } catch (BadLocationException ex) {
      moreTokens = false;
    }
  }

  /**
   * Loads the whole document text into the segment and makes sure the segment
   * starts at array offset 0.
   *
   * <p>Segment iterator indices are indices into the segment's backing array.
   * This class uses those indices as document offsets, which is only valid
   * when the segment offset is 0, so a segment returned with a non-zero
   * offset is copied into a fresh, zero-based array.
   *
   * @throws BadLocationException if the document text cannot be read
   */
  private void loadText() throws BadLocationException {
    document.getText(0, document.getLength(), text);
    if (text.offset != 0) {
      char[] rebased = new char[text.count];
      System.arraycopy(text.array, text.offset, rebased, 0, text.count);
      text.array = rebased;
      text.offset = 0;
    }
  }

  /**
   * Returns the index of the first letter or digit at or after startPos.
   *
   * @param text the segment to scan
   * @param startPos the index at which to start scanning
   * @return the index of the next word start, or -1 if startPos is beyond the
   *         end of the segment or no letter or digit follows
   */
  private static int getNextWordStart(Segment text, int startPos) {
    if (startPos <= text.getEndIndex()) {
      for (char ch = text.setIndex(startPos); ch != Segment.DONE; ch = text.next()) {
        if (Character.isLetterOrDigit(ch)) {
          return text.getIndex();
        }
      }
    }
    return -1;
  }

  /**
   * Returns the index just past the end of the word that starts at startPos.
   *
   * @param text the segment to scan
   * @param startPos the index of the first character of the word
   * @return the index of the first character that does not belong to the
   *         word, or the end index of the segment if the word runs to the end
   */
  private static int getNextWordEnd(Segment text, int startPos) {
    for (char ch = text.setIndex(startPos); ch != Segment.DONE; ch = text.next()) {
      if (!Character.isLetterOrDigit(ch)) {
        if (ch == '-' || ch == '\'') { // handle ' and - inside words
          // Peek at the following character, then step back onto ch
          char ch2 = text.next();
          text.previous();
          if (ch2 != Segment.DONE && Character.isLetterOrDigit(ch2)) {
            continue;
          }
        }
        return text.getIndex();
      }
    }
    return text.getEndIndex();
  }

  /**
   * Returns true if there are more words that can be processed in the text.
   *
   * @return true if more words are available
   */
  public boolean hasMoreWords() {
    return moreTokens;
  }

  /**
   * Sets the current word position at the start of the word containing
   * the char at position pos, scanning backwards from pos for the first
   * character that is not part of a word.
   *
   * <p>NOTE(review): the {@code first} flag is only re-armed when the word
   * starts at index 0. Otherwise, if nextWord() has already been called, the
   * following nextWord() call advances to the word after this one - confirm
   * that this is intended.
   *
   * @param pos position in the word we want to set as current; values past
   *            the end of the text are clamped to the end index
   */
  public void posStartFullWordFrom(int pos) {
    currentWordPos = text.getBeginIndex();
    if (pos > text.getEndIndex()) {
      // NOTE(review): at the end index the scan below presumably sees DONE
      // immediately, leaving the word start at the beginning of the text -
      // confirm this is the desired result.
      pos = text.getEndIndex();
    }
    for (char ch = text.setIndex(pos); ch != Segment.DONE; ch = text.previous()) {
      if (!Character.isLetterOrDigit(ch)) {
        if (ch == '-' || ch == '\'') { // handle ' and - inside words
          // Only peek backwards when there is a preceding character. At the
          // begin index previous() does not move, so the compensating next()
          // would push the iterator one character forward and the word start
          // computed below would be off by one.
          if (text.getIndex() > text.getBeginIndex()) {
            char ch2 = text.previous();
            text.next();
            if (ch2 != Segment.DONE && Character.isLetterOrDigit(ch2)) {
              continue;
            }
          }
        }
        currentWordPos = text.getIndex() + 1;
        break;
      }
    }
    if (currentWordPos == 0) {
      first = true;
    }
    moreTokens = true;
    currentWordEnd = getNextWordEnd(text, currentWordPos);
    nextWordPos = getNextWordStart(text, currentWordEnd + 1);
  }

  /**
   * Returns the start position of the current word in the text.
   *
   * @return the start position of the current word
   */
  public int getCurrentWordPosition() {
    return currentWordPos;
  }

  /**
   * Returns the end position of the current word in the text.
   *
   * @return the end position (exclusive) of the current word
   */
  public int getCurrentWordEnd() {
    return currentWordEnd;
  }

  /**
   * Returns the next word in the text and makes it the current word.
   *
   * <p>Callers are expected to check {@link #hasMoreWords()} first. The word
   * count is incremented on every call.
   *
   * @return the next word, or null if the word could not be read from the
   *         document (in which case hasMoreWords() returns false afterwards)
   */
  public String nextWord() {
    if (!first) {
      // Advance to the word that was located during the previous call
      currentWordPos = nextWordPos;
      currentWordEnd = getNextWordEnd(text, currentWordPos);
      nextWordPos = getNextWordStart(text, currentWordEnd + 1);
    }
    // The order of the statements below is kept exactly as in the original:
    // the sentence iterator operates on the same Segment as the word scans.
    int current = sentenceIterator.current();
    if (current == currentWordPos) {
      startsSentence = true;
    } else {
      startsSentence = false;
      if (currentWordEnd > current) {
        sentenceIterator.next();
      }
    }
    // The nextWordPos has already been populated
    String word = null;
    try {
      word = document.getText(currentWordPos, currentWordEnd - currentWordPos);
    } catch (BadLocationException ex) {
      moreTokens = false;
    }
    wordCount++;
    first = false;
    if (nextWordPos == -1) {
      moreTokens = false;
    }
    return word;
  }

  /**
   * Returns the current number of words that have been processed.
   *
   * @return the number of calls made to nextWord so far
   */
  public int getCurrentWordCount() {
    return wordCount;
  }

  /**
   * Replaces the current word in the document with newWord and positions the
   * tokenizer on the first word after the replacement text, so that the next
   * call to nextWord() returns that word. For a StyledDocument the character
   * attributes found at the start of the replaced word are applied to the
   * inserted text. Does nothing if there is no current word (position -1).
   *
   * @param newWord the replacement text
   * @throws RuntimeException if the document rejects the edit; the
   *         BadLocationException is attached as the cause
   */
  public void replaceWord(String newWord) {
    AttributeSet attr = null;
    if (currentWordPos != -1) {
      try {
        if (document instanceof StyledDocument) {
          attr = ((StyledDocument) document).getCharacterElement(currentWordPos).getAttributes();
        }
        document.remove(currentWordPos, currentWordEnd - currentWordPos);
        document.insertString(currentWordPos, newWord, attr);
        // Need to reset the segment so it reflects the edited document
        loadText();
      } catch (BadLocationException ex) {
        // Same type and message as before, but keep the original exception
        // as the cause so the failing offset and stack trace are not lost.
        throw new RuntimeException(ex.getMessage(), ex);
      }
      // Position after the newly replaced word(s)
      first = true;
      currentWordPos = getNextWordStart(text, currentWordPos + newWord.length());
      if (currentWordPos != -1) {
        currentWordEnd = getNextWordEnd(text, currentWordPos);
        nextWordPos = getNextWordStart(text, currentWordEnd);
        sentenceIterator.setText(text);
        sentenceIterator.following(currentWordPos);
      } else {
        moreTokens = false;
      }
    }
  }

  /**
   * Returns the current text that is being tokenized (includes any changes
   * that have been made through replaceWord).
   *
   * @return the text snapshot held by the tokenizer
   */
  public String getContext() {
    return text.toString();
  }

  /**
   * Returns true if the current word is at the start of a sentence.
   *
   * @return true if nextWord flagged the word as a sentence start, if the
   *         word begins within the first two characters of the text, or if
   *         the two characters before the word are a full stop once trimmed
   */
  public boolean isNewSentence() {
    // BreakIterator doesn't work when the first word in a sentence is not capitalised,
    // but we need to check for capitalisation
    if (startsSentence || currentWordPos < 2) {
      return true;
    }

    String textBefore = null;
    try {
      textBefore = document.getText(currentWordPos - 2, 2);
    } catch (BadLocationException ex) {
      return false;
    }
    return textBefore != null && ".".equals(textBefore.trim());
  }
}

Other Jazzy examples (source code examples)

Here is a short list of links related to this Jazzy DocumentWordTokenizer.java source code file:

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.

... this post is sponsored by my books ...
#1 New Release!	FP Best Seller

Jazzy example source code file (DocumentWordTokenizer.java)

Java - Jazzy tags/keywords

The Jazzy DocumentWordTokenizer.java source code

Other Jazzy examples (source code examples)

new blog posts