alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (WordDictionary.java)

This example Lucene source code file (WordDictionary.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

classnotfoundexception, file, filenotfoundexception, gb2312_first_char, io, ioexception, ioexception, nio, objectoutputstream, prime_index_length, prime_index_length, randomaccessfile, string, string, worddictionary, worddictionary

The Lucene WordDictionary.java source code

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
import org.apache.lucene.analysis.cn.smart.Utility;

/**
 * SmartChineseAnalyzer Word Dictionary
 * @lucene.experimental
 */
class WordDictionary extends AbstractDictionary {

  private WordDictionary() {
  }

  private static WordDictionary singleInstance;

  /**
   * Large prime number for hash function
   */
  public static final int PRIME_INDEX_LENGTH = 12071;

  /**
   * wordIndexTable guarantees to hash all Chinese characters in Unicode into 
   * PRIME_INDEX_LENGTH array. There will be conflict, but in reality this 
   * program only handles the 6768 characters found in GB2312 plus some 
   * ASCII characters. Therefore in order to guarantee better precision, it is
   * necessary to retain the original symbol in the charIndexTable.
   */
  private short[] wordIndexTable;

  private char[] charIndexTable;

  /**
   * To avoid taking too much space, the data structure needed to store the 
   * lexicon requires two multidimensional arrays to store word and frequency.
   * Each word is placed in a char[]. Each char represents a Chinese char or 
   * other symbol.  Each frequency is put into an int. These two arrays 
   * correspond to each other one-to-one. Therefore, one can use 
   * wordItem_charArrayTable[i][j] to look up word from lexicon, and 
   * wordItem_frequencyTable[i][j] to look up the corresponding frequency. 
   */
  private char[][][] wordItem_charArrayTable;

  private int[][] wordItem_frequencyTable;

  // static Logger log = Logger.getLogger(WordDictionary.class);

  /**
   * Get the singleton dictionary instance.
   * @return singleton
   */
  public synchronized static WordDictionary getInstance() {
    if (singleInstance == null) {
      singleInstance = new WordDictionary();
      try {
        singleInstance.load();
      } catch (IOException e) {
        String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
        singleInstance.load(wordDictRoot);
      } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
      }
    }
    return singleInstance;
  }

  /**
   * Attempt to load dictionary from provided directory, first trying coredict.mem, failing back on coredict.dct
   * 
   * @param dctFileRoot path to dictionary directory
   */
  public void load(String dctFileRoot) {
    String dctFilePath = dctFileRoot + "/coredict.dct";
    File serialObj = new File(dctFileRoot + "/coredict.mem");

    if (serialObj.exists() && loadFromObj(serialObj)) {

    } else {
      try {
        wordIndexTable = new short[PRIME_INDEX_LENGTH];
        charIndexTable = new char[PRIME_INDEX_LENGTH];
        for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
          charIndexTable[i] = 0;
          wordIndexTable[i] = -1;
        }
        wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
        wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
        // int total =
        loadMainDataFromFile(dctFilePath);
        expandDelimiterData();
        mergeSameWords();
        sortEachItems();
        // log.info("load dictionary: " + dctFilePath + " total:" + total);
      } catch (IOException e) {
        throw new RuntimeException(e.getMessage());
      }

      saveToObj(serialObj);
    }

  }

  /**
   * Load coredict.mem internally from the jar file.
   * 
   * @throws ClassNotFoundException
   * @throws IOException
   */
  public void load() throws IOException, ClassNotFoundException {
    InputStream input = this.getClass().getResourceAsStream("coredict.mem");
    loadFromObjectInputStream(input);
  }

  private boolean loadFromObj(File serialObj) {
    try {
      loadFromObjectInputStream(new FileInputStream(serialObj));
      return true;
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    }
    return false;
  }

  private void loadFromObjectInputStream(InputStream serialObjectInputStream)
      throws IOException, ClassNotFoundException {
    ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
    wordIndexTable = (short[]) input.readObject();
    charIndexTable = (char[]) input.readObject();
    wordItem_charArrayTable = (char[][][]) input.readObject();
    wordItem_frequencyTable = (int[][]) input.readObject();
    // log.info("load core dict from serialization.");
    input.close();
  }

  private void saveToObj(File serialObj) {
    try {
      ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
          serialObj));
      output.writeObject(wordIndexTable);
      output.writeObject(charIndexTable);
      output.writeObject(wordItem_charArrayTable);
      output.writeObject(wordItem_frequencyTable);
      output.close();
      // log.info("serialize core dict.");
    } catch (Exception e) {
      // log.warn(e.getMessage());
    }
  }

  /**
   * Load the datafile into this WordDictionary
   * 
   * @param dctFilePath path to word dictionary (coredict.dct)
   * @return number of words read
   * @throws FileNotFoundException
   * @throws IOException
   * @throws UnsupportedEncodingException
   */
  private int loadMainDataFromFile(String dctFilePath)
      throws FileNotFoundException, IOException, UnsupportedEncodingException {
    int i, cnt, length, total = 0;
    // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
    // The 3756th is used (as a header) to store information.
    int[] buffer = new int[3];
    byte[] intBuffer = new byte[4];
    String tmpword;
    RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");

    // GB2312 characters 0 - 6768
    for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
      // if (i == 5231)
      // System.out.println(i);

      dctFile.read(intBuffer);
      // the dictionary was developed for C, and byte order must be converted to work with Java
      cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
      if (cnt <= 0) {
        wordItem_charArrayTable[i] = null;
        wordItem_frequencyTable[i] = null;
        continue;
      }
      wordItem_charArrayTable[i] = new char[cnt][];
      wordItem_frequencyTable[i] = new int[cnt];
      total += cnt;
      int j = 0;
      while (j < cnt) {
        // wordItemTable[i][j] = new WordItem();
        dctFile.read(intBuffer);
        buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// frequency
        dctFile.read(intBuffer);
        buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// length
        dctFile.read(intBuffer);
        buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt();// handle

        // wordItemTable[i][j].frequency = buffer[0];
        wordItem_frequencyTable[i][j] = buffer[0];

        length = buffer[1];
        if (length > 0) {
          byte[] lchBuffer = new byte[length];
          dctFile.read(lchBuffer);
          tmpword = new String(lchBuffer, "GB2312");
          // indexTable[i].wordItems[j].word = tmpword;
          // wordItemTable[i][j].charArray = tmpword.toCharArray();
          wordItem_charArrayTable[i][j] = tmpword.toCharArray();
        } else {
          // wordItemTable[i][j].charArray = null;
          wordItem_charArrayTable[i][j] = null;
        }
        // System.out.println(indexTable[i].wordItems[j]);
        j++;
      }

      String str = getCCByGB2312Id(i);
      setTableIndex(str.charAt(0), i);
    }
    dctFile.close();
    return total;
  }

  /**
   * The original lexicon puts all information with punctuation into a 
   * chart (from 1 to 3755). Here it then gets expanded, separately being
   * placed into the chart that has the corresponding symbol.
   */
  private void expandDelimiterData() {
    int i;
    int cnt;
    // Punctuation then treating index 3755 as 1, 
    // distribute the original punctuation corresponding dictionary into 
    int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
    i = 0;
    while (i < wordItem_charArrayTable[delimiterIndex].length) {
      char c = wordItem_charArrayTable[delimiterIndex][i][0];
      int j = getGB2312Id(c);// the id value of the punctuation
      if (wordItem_charArrayTable[j] == null) {

        int k = i;
        // Starting from i, count the number of the following worditem symbol from j
        while (k < wordItem_charArrayTable[delimiterIndex].length
            && wordItem_charArrayTable[delimiterIndex][k][0] == c) {
          k++;
        }
        // c is the punctuation character, j is the id value of c
        // k-1 represents the index of the last punctuation character
        cnt = k - i;
        if (cnt != 0) {
          wordItem_charArrayTable[j] = new char[cnt][];
          wordItem_frequencyTable[j] = new int[cnt];
        }

        // Assign value for each wordItem.
        for (k = 0; k < cnt; k++, i++) {
          // wordItemTable[j][k] = new WordItem();
          wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
          wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
          System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
              wordItem_charArrayTable[j][k], 0,
              wordItem_charArrayTable[j][k].length);
        }
        setTableIndex(c, j);
      }
    }
    // Delete the original corresponding symbol array.
    wordItem_charArrayTable[delimiterIndex] = null;
    wordItem_frequencyTable[delimiterIndex] = null;
  }

  /*
   * since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS)
   */
  private void mergeSameWords() {
    int i;
    for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
      if (wordItem_charArrayTable[i] == null)
        continue;
      int len = 1;
      for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
        if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
            wordItem_charArrayTable[i][j - 1], 0) != 0)
          len++;

      }
      if (len < wordItem_charArrayTable[i].length) {
        char[][] tempArray = new char[len][];
        int[] tempFreq = new int[len];
        int k = 0;
        tempArray[0] = wordItem_charArrayTable[i][0];
        tempFreq[0] = wordItem_frequencyTable[i][0];
        for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
          if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
              tempArray[k], 0) != 0) {
            k++;
            // temp[k] = wordItemTable[i][j];
            tempArray[k] = wordItem_charArrayTable[i][j];
            tempFreq[k] = wordItem_frequencyTable[i][j];
          } else {
            // temp[k].frequency += wordItemTable[i][j].frequency;
            tempFreq[k] += wordItem_frequencyTable[i][j];
          }
        }
        // wordItemTable[i] = temp;
        wordItem_charArrayTable[i] = tempArray;
        wordItem_frequencyTable[i] = tempFreq;
      }
    }
  }

  private void sortEachItems() {
    char[] tmpArray;
    int tmpFreq;
    for (int i = 0; i < wordItem_charArrayTable.length; i++) {
      if (wordItem_charArrayTable[i] != null
          && wordItem_charArrayTable[i].length > 1) {
        for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
          for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
            if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
                wordItem_charArrayTable[i][j2], 0) > 0) {
              tmpArray = wordItem_charArrayTable[i][j];
              tmpFreq = wordItem_frequencyTable[i][j];
              wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
              wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
              wordItem_charArrayTable[i][j2] = tmpArray;
              wordItem_frequencyTable[i][j2] = tmpFreq;
            }
          }
        }
      }
    }
  }

  /*
   * Calculate character c's position in hash table, 
   * then initialize the value of that position in the address table.
   */
  private boolean setTableIndex(char c, int j) {
    int index = getAvaliableTableIndex(c);
    if (index != -1) {
      charIndexTable[index] = c;
      wordIndexTable[index] = (short) j;
      return true;
    } else
      return false;
  }

  private short getAvaliableTableIndex(char c) {
    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_INDEX_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_INDEX_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    while (charIndexTable[index] != 0 && charIndexTable[index] != c
        && i < PRIME_INDEX_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
      i++;
    }
    // System.out.println(i - 1);

    if (i < PRIME_INDEX_LENGTH
        && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
      return (short) index;
    } else
      return -1;
  }

  private short getWordItemTableIndex(char c) {
    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_INDEX_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_INDEX_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    while (charIndexTable[index] != 0 && charIndexTable[index] != c
        && i < PRIME_INDEX_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
      i++;
    }

    if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
      return (short) index;
    } else
      return -1;
  }

  /**
   * Look up the text string corresponding with the word char array, 
   * and return the position of the word list.
   * 
   * @param knownHashIndex already figure out position of the first word 
   *   symbol charArray[0] in hash table. If not calculated yet, can be 
   *   replaced with function int findInTable(char[] charArray).
   * @param charArray look up the char array corresponding with the word.
   * @return word location in word array.  If not found, then return -1.
   */
  private int findInTable(short knownHashIndex, char[] charArray) {
    if (charArray == null || charArray.length == 0)
      return -1;

    char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
    int start = 0, end = items.length - 1;
    int mid = (start + end) / 2, cmpResult;

    // Binary search for the index of idArray
    while (start <= end) {
      cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);

      if (cmpResult == 0)
        return mid;// find it
      else if (cmpResult < 0)
        start = mid + 1;
      else if (cmpResult > 0)
        end = mid - 1;

      mid = (start + end) / 2;
    }
    return -1;
  }

  /**
   * Find the first word in the dictionary that starts with the supplied prefix
   * 
   * @see #getPrefixMatch(char[], int)
   * @param charArray input prefix
   * @return index of word, or -1 if not found
   */
  public int getPrefixMatch(char[] charArray) {
    return getPrefixMatch(charArray, 0);
  }

  /**
   * Find the nth word in the dictionary that starts with the supplied prefix
   * 
   * @see #getPrefixMatch(char[])
   * @param charArray input prefix
   * @param knownStart relative position in the dictionary to start
   * @return index of word, or -1 if not found
   */
  public int getPrefixMatch(char[] charArray, int knownStart) {
    short index = getWordItemTableIndex(charArray[0]);
    if (index == -1)
      return -1;
    char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
    int start = knownStart, end = items.length - 1;

    int mid = (start + end) / 2, cmpResult;

    // Binary search for the index of idArray
    while (start <= end) {
      cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
      if (cmpResult == 0) {
        // Get the first item which match the current word
        while (mid >= 0
            && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
          mid--;
        mid++;
        return mid;// Find the first word that uses charArray as prefix.
      } else if (cmpResult < 0)
        end = mid - 1;
      else
        start = mid + 1;
      mid = (start + end) / 2;
    }
    return -1;
  }

  /**
   * Get the frequency of a word from the dictionary
   * 
   * @param charArray input word
   * @return word frequency, or zero if the word is not found
   */
  public int getFrequency(char[] charArray) {
    short hashIndex = getWordItemTableIndex(charArray[0]);
    if (hashIndex == -1)
      return 0;
    int itemIndex = findInTable(hashIndex, charArray);
    if (itemIndex != -1)
      return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
    return 0;

  }

  /**
   * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
   * 
   * @param charArray input word
   * @param itemIndex item index for table charArray[0]
   * @return true if the entry exists
   */
  public boolean isEqual(char[] charArray, int itemIndex) {
    short hashIndex = getWordItemTableIndex(charArray[0]);
    return Utility.compareArray(charArray, 1,
        wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
  }

}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene WordDictionary.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.