alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (AbstractDictionary.java)

This example Lucene source code file (AbstractDictionary.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

a1, abstractdictionary, char_num_in_file, char_num_in_file, code, gb2312, gb2312, gb2312_char_num, gb2312_first_char, io, string, string, there, unsupportedencodingexception, unsupportedencodingexception

The Lucene AbstractDictionary.java source code

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

import java.io.UnsupportedEncodingException;

/**
 * <p>
 * SmartChineseAnalyzer abstract dictionary implementation.
 * </p>
 * <p>
 * Contains methods for dealing with GB2312 encoding.
 * </p>
 * @lucene.experimental
 */
abstract class AbstractDictionary {
  /**
   * First Chinese Character in GB2312 (15 * 94)
   * Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation.
   */
  public static final int GB2312_FIRST_CHAR = 1410;

  /**
   * Last Chinese Character in GB2312 (87 * 94). 
   * Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned.
   */
  public static final int GB2312_CHAR_NUM = 87 * 94;

  /**
   * Dictionary data contains 6768 Chinese characters with frequency statistics.
   */
  public static final int CHAR_NUM_IN_FILE = 6768;

  // =====================================================
  // code +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F
  // B0A0 啊 阿 埃 挨 哎 唉 哀 皑 癌 蔼 矮 艾 碍 爱 隘
  // B0B0 鞍 氨 安 俺 按 暗 岸 胺 案 肮 昂 盎 凹 敖 熬 翱
  // B0C0 袄 傲 奥 懊 澳 芭 捌 扒 叭 吧 笆 八 疤 巴 拔 跋
  // B0D0 靶 把 耙 坝 霸 罢 爸 白 柏 百 摆 佰 败 拜 稗 斑
  // B0E0 班 搬 扳 般 颁 板 版 扮 拌 伴 瓣 半 办 绊 邦 帮
  // B0F0 梆 榜 膀 绑 棒 磅 蚌 镑 傍 谤 苞 胞 包 褒 剥
  // =====================================================
  //
  // GB2312 character set:
  // 01 94 Symbols
  // 02 72 Numbers
  // 03 94 Latin
  // 04 83 Kana
  // 05 86 Katakana
  // 06 48 Greek
  // 07 66 Cyrillic
  // 08 63 Phonetic Symbols
  // 09 76 Drawing Symbols
  // 10-15 Unassigned
  // 16-55 3755 Plane 1, in pinyin order
  // 56-87 3008 Plane 2, in radical/stroke order
  // 88-94 Unassigned
  // ======================================================

  /**
   * <p>
   * Transcode from GB2312 ID to Unicode
   * </p>
   * <p>
   * GB2312 is divided into a 94 * 94 grid, containing 7445 characters consisting of 6763 Chinese characters and 682 symbols.
   * Some regions are unassigned (reserved).
   * </p>
   * 
   * @param ccid GB2312 id
   * @return unicode String
   */
  public String getCCByGB2312Id(int ccid) {
    if (ccid < 0 || ccid > WordDictionary.GB2312_CHAR_NUM)
      return "";
    int cc1 = ccid / 94 + 161;
    int cc2 = ccid % 94 + 161;
    byte[] buffer = new byte[2];
    buffer[0] = (byte) cc1;
    buffer[1] = (byte) cc2;
    try {
      String cchar = new String(buffer, "GB2312");
      return cchar;
    } catch (UnsupportedEncodingException e) {
      return "";
    }
  }

  /**
   * Transcode from Unicode to GB2312
   * 
   * @param ch input character in Unicode, or character in Basic Latin range.
   * @return position in GB2312
   */
  public short getGB2312Id(char ch) {
    try {
      byte[] buffer = Character.toString(ch).getBytes("GB2312");
      if (buffer.length != 2) {
        // Should be a two-byte character
        return -1;
      }
      int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161
      int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol. 
      											// Therefore, each code page only has 16*6-2=94 characters.
      return (short) (b0 * 94 + b1);
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
    }
    return -1;
  }

  /**
   * 32-bit FNV Hash Function
   * 
   * @param c input character
   * @return hashcode
   */
  public long hash1(char c) {
    final long p = 1099511628211L;
    long hash = 0xcbf29ce484222325L;
    hash = (hash ^ (c & 0x00FF)) * p;
    hash = (hash ^ (c >> 8)) * p;
    hash += hash << 13;
    hash ^= hash >> 7;
    hash += hash << 3;
    hash ^= hash >> 17;
    hash += hash << 5;
    return hash;
  }

  /**
   * 32-bit FNV Hash Function
   * 
   * @param carray character array
   * @return hashcode
   */
  public long hash1(char carray[]) {
    final long p = 1099511628211L;
    long hash = 0xcbf29ce484222325L;
    for (int i = 0; i < carray.length; i++) {
      char d = carray[i];
      hash = (hash ^ (d & 0x00FF)) * p;
      hash = (hash ^ (d >> 8)) * p;
    }

    // hash += hash << 13;
    // hash ^= hash >> 7;
    // hash += hash << 3;
    // hash ^= hash >> 17;
    // hash += hash << 5;
    return hash;
  }

  /**
   * djb2 hash algorithm,this algorithm (k=33) was first reported by dan
   * bernstein many years ago in comp.lang.c. another version of this algorithm
   * (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
   * the magic of number 33 (why it works better than many other constants,
   * prime or not) has never been adequately explained.
   * 
   * @param c character
   * @return hashcode
   */
  public int hash2(char c) {
    int hash = 5381;

    /* hash 33 + c */
    hash = ((hash << 5) + hash) + c & 0x00FF;
    hash = ((hash << 5) + hash) + c >> 8;

    return hash;
  }

  /**
   * djb2 hash algorithm,this algorithm (k=33) was first reported by dan
   * bernstein many years ago in comp.lang.c. another version of this algorithm
   * (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
   * the magic of number 33 (why it works better than many other constants,
   * prime or not) has never been adequately explained.
   * 
   * @param carray character array
   * @return hashcode
   */
  public int hash2(char carray[]) {
    int hash = 5381;

    /* hash 33 + c */
    for (int i = 0; i < carray.length; i++) {
      char d = carray[i];
      hash = ((hash << 5) + hash) + d & 0x00FF;
      hash = ((hash << 5) + hash) + d >> 8;
    }

    return hash;
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene AbstractDictionary.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.