
Lucene example source code file (CJKTokenizer.java)

This example Lucene source code file (CJKTokenizer.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

cjktokenizer, double_token_type, io, ioexception, max_word_len, override, reader, single_token_type, tokenizer, typeattribute, word_type

The Lucene CJKTokenizer.java source code

package org.apache.lucene.analysis.cjk;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
 * <p>  
 * Tokens are formed from every two adjacent CJK characters, with overlap (bigram matching).
 * </p>
 * <p>
 * Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
 * </p>
 * Additionally, the following is applied to Latin text (such as English):
 * <ul>
 * <li>Text is converted to lowercase.
 * <li>Numeric digits, '+', '#', and '_' are tokenized as letters.
 * <li>Full-width forms are converted to half-width forms.
 * </ul>
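 * <p>
 * For example, the full-width input "Ｃ＋＋" is tokenized as "c++".
 * </p>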
 * For more information on Chinese, Japanese, and Korean text segmentation, see
 * <a href="http://www.google.com/search?q=word+chinese+segment">this Google
 * search</a>.
 *
 */
public final class CJKTokenizer extends Tokenizer {
    //~ Static fields/initializers ---------------------------------------------
    /** Word token type */
    static final int WORD_TYPE = 0;
  
    /** Single byte token type */
    static final int SINGLE_TOKEN_TYPE = 1;

    /** Double byte token type */
    static final int DOUBLE_TOKEN_TYPE = 2;
  
    /** Names for token types */
    static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
  
    /** Max word length */
    private static final int MAX_WORD_LEN = 255;

    /** I/O buffer size */
    private static final int IO_BUFFER_SIZE = 256;

    //~ Instance fields --------------------------------------------------------

    /** word offset, used to indicate which character (in the input) is being parsed */
    private int offset = 0;

    /** the index used only for ioBuffer */
    private int bufferIndex = 0;

    /** data length */
    private int dataLen = 0;

    /**
     * character buffer, stores the characters used to compose
     * the returned token
     */
    private final char[] buffer = new char[MAX_WORD_LEN];

    /**
     * I/O buffer, used to store the content of the input (the Reader
     * member of Tokenizer)
     */
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    /** word type: single=>ASCII  double=>non-ASCII word=>default */
    private int tokenType = WORD_TYPE;

    /**
     * flag: true when the previous character is a double-byte character that has
     * already been emitted as part of a token. For input "C1C2C3C4" the tokenizer
     * emits "C1C2" (C1 is tokenized), then "C2C3" (C2 is tokenized), then "C3C4"
     * (C3 is tokenized), yielding "C1C2 C2C3 C3C4".
     */
    private boolean preIsTokened = false;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    
    //~ Constructors -----------------------------------------------------------

    /**
     * Construct a token stream processing the given input.
     *
     * @param in I/O reader
     */
    public CJKTokenizer(Reader in) {
      super(in);
    }

    /** Construct a token stream processing the given input, using the given AttributeSource. */
    public CJKTokenizer(AttributeSource source, Reader in) {
      super(source, in);
    }

    /** Construct a token stream processing the given input, using the given AttributeFactory. */
    public CJKTokenizer(AttributeFactory factory, Reader in) {
      super(factory, in);
    }
    
    //~ Methods ----------------------------------------------------------------

    /**
     * Returns true for the next token in the stream, or false at EOS.
     * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
     * for detail.
     *
     * @return false for end of stream, true otherwise
     *
     * @throws java.io.IOException if a read error occurs on the input Reader
     *
     */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();

        while(true) { // loop until we find a non-empty token

          /** how many characters have been stored in the buffer */
          int length = 0;

          /** the position used to create Token */
          int start = offset;

          while (true) { // loop until we've found a full token
            /** current character */
            char c;

            /** unicode block of current character for detail */
            Character.UnicodeBlock ub;

            offset++;

            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }

            if (dataLen == -1) {
                if (length > 0) {
                    if (preIsTokened == true) {
                        length = 0;
                        preIsTokened = false;
                    }
                    else{
                      offset--;
                    }

                    break;
                } else {
                    offset--;
                    return false;
                }
            } else {
                //get current character
                c = ioBuffer[bufferIndex++];

                //get the UnicodeBlock of the current character
                ub = Character.UnicodeBlock.of(c);
            }

            // if the current character is Basic Latin or a half-width/full-width form
            if ((ub == Character.UnicodeBlock.BASIC_LATIN)
                    || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
               ) {
                if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
                  int i = (int) c;
                  if (i >= 65281 && i <= 65374) {
                    // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
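                    // e.g. full-width 'Ａ' (U+FF21 = 65313): 65313 - 65248 = 65, which is ASCII 'A'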
                    i = i - 65248;
                    c = (char) i;
                  }
                }

                // if the current character is a letter, a digit, or one of '_', '+', '#'
                if (Character.isLetterOrDigit(c)
                        || ((c == '_') || (c == '+') || (c == '#'))
                   ) {
                    if (length == 0) {
                        // "javaC1C2C3C4linux" <br>
                        //      ^--: the current character begin to token the ASCII
                        // letter
                        start = offset - 1;
                    } else if (tokenType == DOUBLE_TOKEN_TYPE) {
                        // "javaC1C2C3C4linux" <br>
                        //              ^--: the previous non-ASCII
                        // : the current character
                        offset--;
                        bufferIndex--;

                        if (preIsTokened == true) {
                            // only one non-ASCII character has been stored in the buffer
                            length = 0;
                            preIsTokened = false;
                            break;
                        } else {
                            break;
                        }
                    }

                    // store the LowerCase(c) in the buffer
                    buffer[length++] = Character.toLowerCase(c);
                    tokenType = SINGLE_TOKEN_TYPE;

                    // break the procedure if buffer overflowed!
                    if (length == MAX_WORD_LEN) {
                        break;
                    }
                } else if (length > 0) {
                    if (preIsTokened == true) {
                        length = 0;
                        preIsTokened = false;
                    } else {
                        break;
                    }
                }
            } else {
                // non-ASCII letter, e.g. "C1C2C3C4"
                if (Character.isLetter(c)) {
                    if (length == 0) {
                        start = offset - 1;
                        buffer[length++] = c;
                        tokenType = DOUBLE_TOKEN_TYPE;
                    } else {
                      if (tokenType == SINGLE_TOKEN_TYPE) {
                            offset--;
                            bufferIndex--;

                            //return the previous ASCII characters
                            break;
                        } else {
                            buffer[length++] = c;
                            tokenType = DOUBLE_TOKEN_TYPE;

                            if (length == 2) {
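                                // two CJK characters buffered: emit them as a bigram
                                // and back up one character so the next bigram overlaps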
                                offset--;
                                bufferIndex--;
                                preIsTokened = true;

                                break;
                            }
                        }
                    }
                } else if (length > 0) {
                    if (preIsTokened == true) {
                        // empty the buffer
                        length = 0;
                        preIsTokened = false;
                    } else {
                        break;
                    }
                }
            }
        }
      
        if (length > 0) {
          termAtt.copyBuffer(buffer, 0, length);
          offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
          typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
          return true;
        } else if (dataLen == -1) {
          offset--;
          return false;
        }

        // Cycle back and try for the next token (don't
        // return an empty string)
      }
    }
    
    @Override
    public final void end() {
      // set final offset
      final int finalOffset = correctOffset(offset);
      this.offsetAtt.setOffset(finalOffset, finalOffset);
    }
    
    @Override
    public void reset() throws IOException {
      super.reset();
      offset = bufferIndex = dataLen = 0;
      preIsTokened = false;
      tokenType = WORD_TYPE;
    }
    
    @Override
    public void reset(Reader reader) throws IOException {
      super.reset(reader);
      reset();
    }
}
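
A CJKTokenizer usage example

To see the overlapping bigrams in practice, here is a minimal usage sketch. It assumes the Lucene 3.x-era TokenStream API that this class is written against; the demo class name, the input string, and the printed output format are illustrative only and are not part of the Lucene source.

import java.io.StringReader;

import org.apache.lucene.analysis.cjk.CJKTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class CJKTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // "java" is kept as a single lowercased ASCII token; the CJK characters
        // that follow are emitted as overlapping bigrams.
        CJKTokenizer tokenizer = new CJKTokenizer(new StringReader("java 你好世界"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);

        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            // expected: java (single), 你好 (double), 好世 (double), 世界 (double)
            System.out.println(term.toString() + " (" + type.type() + ")");
        }
        tokenizer.end();
        tokenizer.close();
    }
}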

