alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (BigramDictionary.java)

This example Lucene source code file (BigramDictionary.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

bigramdictionary, bigramdictionary, classnotfoundexception, file, filenotfoundexception, gb2312_first_char, io, ioexception, ioexception, nio, objectoutputstream, prime_bigram_length, prime_bigram_length, randomaccessfile, string, string

The Lucene BigramDictionary.java source code

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;

/**
 * SmartChineseAnalyzer Bigram dictionary.
 * <p>
 * Word-pair frequencies are kept in two parallel, fixed-size arrays that form
 * an open-addressing hash table with double hashing: {@link #bigramHashTable}
 * holds the 64-bit hash of each stored entry (0 marks an empty slot) and
 * {@link #frequencyTable} holds the frequency accumulated for that slot.
 * <p>
 * The singleton is populated either from a serialized copy of the two arrays
 * (<code>bigramdict.mem</code>) or, failing that, from the raw dictionary file
 * (<code>bigramdict.dct</code>).
 *
 * @lucene.experimental
 */
class BigramDictionary extends AbstractDictionary {

  private BigramDictionary() {
  }

  public static final char WORD_SEGMENT_CHAR = '@';

  private static BigramDictionary singleInstance;

  /** Number of slots in the hash table. */
  public static final int PRIME_BIGRAM_LENGTH = 402137;

  /*
   * The word associations are stored as FNV1 hashcodes, which have a small
   * probability of collision, but save memory. A value of 0 marks an unused slot.
   */
  private long[] bigramHashTable;

  /* frequencyTable[i] is the frequency of the entry whose hash is stored in bigramHashTable[i]. */
  private int[] frequencyTable;

  // static Logger log = Logger.getLogger(BigramDictionary.class);

  /**
   * Returns the shared dictionary, loading it on first use.
   * <p>
   * The serialized dictionary bundled as a class resource is tried first; if
   * that fails with an {@link IOException} the dictionary is loaded from
   * {@link AnalyzerProfile#ANALYSIS_DATA_DIR} instead.
   *
   * @return the fully loaded singleton instance
   * @throws RuntimeException if the dictionary cannot be loaded from either location
   */
  public static synchronized BigramDictionary getInstance() {
    if (singleInstance == null) {
      // Build into a local and publish only after loading succeeded, so that a
      // failed load never leaves a half-initialized singleton behind.
      BigramDictionary instance = new BigramDictionary();
      try {
        instance.load();
      } catch (IOException e) {
        String dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
        instance.load(dictRoot);
      } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
      }
      singleInstance = instance;
    }
    return singleInstance;
  }

  /**
   * Tries to load both tables from a serialized dictionary file.
   *
   * @param serialObj the serialized dictionary file
   * @return <code>true</code> if both tables were read, <code>false</code> if
   *         the file could not be opened or deserialized
   */
  private boolean loadFromObj(File serialObj) {
    try {
      loadFromInputStream(new FileInputStream(serialObj));
      return true;
    } catch (IOException e) {
      // also covers FileNotFoundException; the caller rebuilds from the .dct file
      e.printStackTrace();
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    }
    return false;
  }

  /**
   * Deserializes the hash table followed by the frequency table from the given
   * stream. The stream is always closed, whether or not reading succeeds, and
   * the tables of this instance are replaced only if both were read.
   *
   * @param serialObjectInputStream stream positioned at the start of the serialized data
   * @throws IOException if the stream cannot be read
   * @throws ClassNotFoundException if a serialized class cannot be resolved
   */
  private void loadFromInputStream(InputStream serialObjectInputStream)
      throws IOException, ClassNotFoundException {
    try {
      ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
      long[] hashes = (long[]) input.readObject();
      int[] frequencies = (int[]) input.readObject();
      // assign together so a failure above cannot leave the tables out of sync
      bigramHashTable = hashes;
      frequencyTable = frequencies;
      // log.info("load bigram dict from serialization.");
    } finally {
      // closing the underlying stream also covers the case where the
      // ObjectInputStream constructor itself failed
      serialObjectInputStream.close();
    }
  }

  /**
   * Serializes both tables to the given file. This is a best-effort cache
   * write: any failure is ignored, and the dictionary is simply rebuilt from
   * the .dct file next time.
   *
   * @param serialObj the file to write the serialized tables to
   */
  private void saveToObj(File serialObj) {
    try {
      FileOutputStream fileOut = new FileOutputStream(serialObj);
      try {
        ObjectOutputStream output = new ObjectOutputStream(fileOut);
        output.writeObject(bigramHashTable);
        output.writeObject(frequencyTable);
        output.flush();
        // log.info("serialize bigram dict.");
      } finally {
        fileOut.close();
      }
    } catch (Exception ignored) {
      // deliberately ignored: failing to write the cache is not fatal
      // log.warn(e.getMessage());
    }
  }

  /**
   * Loads the serialized dictionary bundled as the class resource
   * <code>bigramdict.mem</code>.
   *
   * @throws IOException if the resource is missing or cannot be read
   * @throws ClassNotFoundException if a serialized class cannot be resolved
   */
  private void load() throws IOException, ClassNotFoundException {
    InputStream input = this.getClass().getResourceAsStream("bigramdict.mem");
    if (input == null) {
      // getResourceAsStream signals a missing resource with null; report it as
      // an IOException so that getInstance() can fall back to the data directory
      throw new FileNotFoundException("resource bigramdict.mem not found");
    }
    loadFromInputStream(input);
  }

  /**
   * Loads the dictionary from a data directory. The serialized form
   * (<code>bigramdict.mem</code>) is used when it exists and can be read;
   * otherwise the tables are rebuilt from <code>bigramdict.dct</code> and the
   * serialized form is written for the next run.
   *
   * @param dictRoot directory containing the dictionary files
   * @throws RuntimeException if the .dct file cannot be read
   */
  private void load(String dictRoot) {
    String bigramDictPath = dictRoot + "/bigramdict.dct";

    File serialObj = new File(dictRoot + "/bigramdict.mem");

    if (serialObj.exists() && loadFromObj(serialObj)) {
      return;
    }

    try {
      // Freshly allocated arrays are zero-filled, and 0 is the "empty slot" marker.
      // It is possible for a value to hash to 0, but the probability is extremely low.
      bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
      frequencyTable = new int[PRIME_BIGRAM_LENGTH];
      loadFromFile(bigramDictPath);
    } catch (IOException e) {
      // keep the original message but preserve the cause for diagnosis
      throw new RuntimeException(e.getMessage(), e);
    }
    saveToObj(serialObj);
  }

  /**
   * Load the datafile into this BigramDictionary.
   * <p>
   * The hash and frequency tables must already be allocated when this method
   * is called; entries are added to the existing tables.
   * 
   * @param dctFilePath path to the Bigramdictionary (bigramdict.dct)
   * @throws FileNotFoundException if the file cannot be opened
   * @throws IOException if the file cannot be read or ends prematurely
   * @throws UnsupportedEncodingException if the GB2312 charset is not available
   */
  public void loadFromFile(String dctFilePath) throws FileNotFoundException,
      IOException, UnsupportedEncodingException {

    // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.  
    // The 3756th is used (as a header) to store information.
    byte[] intBuffer = new byte[4];
    RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
    try {
      // GB2312 characters 0 - 6768
      for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
        String currentStr = getCCByGB2312Id(i);

        // number of entries recorded for this character
        int cnt = readLittleEndianInt(dctFile, intBuffer);
        if (cnt <= 0) {
          continue;
        }
        for (int j = 0; j < cnt; j++) {
          int frequency = readLittleEndianInt(dctFile, intBuffer);
          int length = readLittleEndianInt(dctFile, intBuffer);
          dctFile.readFully(intBuffer); // "handle" field: present in the file but unused

          if (length > 0) {
            byte[] lchBuffer = new byte[length];
            dctFile.readFully(lchBuffer);
            String tmpword = new String(lchBuffer, "GB2312");
            // entries of slot 3755 (the header slot mentioned above) are stored
            // as-is; all others are prefixed with the character of their slot
            if (i != 3755 + GB2312_FIRST_CHAR) {
              tmpword = currentStr + tmpword;
            }
            char[] carray = tmpword.toCharArray();
            long hashId = hash1(carray);
            int index = getAvaliableIndex(hashId, carray);
            if (index != -1) {
              if (bigramHashTable[index] == 0) {
                bigramHashTable[index] = hashId;
              }
              frequencyTable[index] += frequency;
            }
          }
        }
      }
    } finally {
      dctFile.close();
    }
    // log.info("load dictionary done! " + dctFilePath);
  }

  /**
   * Reads exactly four bytes and decodes them as a little-endian int. The
   * dictionary was developed for C, and byte order must be converted to work
   * with Java.
   *
   * @param file the file to read from
   * @param scratch a 4-byte buffer that is overwritten
   * @return the decoded value
   * @throws IOException if fewer than four bytes remain or reading fails
   */
  private static int readLittleEndianInt(RandomAccessFile file, byte[] scratch)
      throws IOException {
    file.readFully(scratch);
    return ByteBuffer.wrap(scratch).order(ByteOrder.LITTLE_ENDIAN).getInt();
  }

  /**
   * Walks the double-hashing probe sequence for the given entry until it
   * reaches a slot that is either empty or already holds <code>hashId</code>.
   *
   * @param hashId primary hash of the entry, as stored in the table
   * @param carray characters of the entry, used for the secondary hash
   * @return the index of the slot found, or -1 if the probe limit was reached
   */
  private int probe(long hashId, char carray[]) {
    int start = (int) (hashId % PRIME_BIGRAM_LENGTH);
    int step = hash2(carray) % PRIME_BIGRAM_LENGTH;
    // Java's % keeps the sign of the dividend; shift into [0, PRIME_BIGRAM_LENGTH)
    if (start < 0) {
      start = PRIME_BIGRAM_LENGTH + start;
    }
    if (step < 0) {
      step = PRIME_BIGRAM_LENGTH + step;
    }
    int index = start;
    int i = 1;
    while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
        && i < PRIME_BIGRAM_LENGTH) {
      // i * step can exceed Integer.MAX_VALUE on long probe chains, so the
      // product is computed in long to keep the index non-negative
      index = (int) ((start + (long) i * step) % PRIME_BIGRAM_LENGTH);
      i++;
    }
    // When the limit was not reached, the loop can only have stopped on an
    // empty or matching slot.
    return i < PRIME_BIGRAM_LENGTH ? index : -1;
  }

  /*
   * find the slot in which an entry should be stored: either an empty slot or
   * the slot that already holds the same hash. Returns -1 if there is none.
   */
  private int getAvaliableIndex(long hashId, char carray[]) {
    return probe(hashId, carray);
  }

  /*
   * lookup the index into the frequency array. Returns -1 if the entry is not
   * in the table.
   */
  private int getBigramItemIndex(char carray[]) {
    long hashId = hash1(carray);
    int index = probe(hashId, carray);
    if (index != -1 && bigramHashTable[index] == hashId) {
      return index;
    }
    return -1;
  }

  /**
   * Returns the frequency stored for the given entry.
   *
   * @param carray characters of the entry to look up
   * @return the stored frequency, or 0 if the entry is not in the dictionary
   */
  public int getFrequency(char[] carray) {
    int index = getBigramItemIndex(carray);
    if (index != -1) {
      return frequencyTable[index];
    }
    return 0;
  }

}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene BigramDictionary.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.