alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (SynonymMap.java)

This example Lucene source code file (SynonymMap.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

arraylist, arraylist, debug, eof, hashmap, hashmap, integer, io, ioexception, map, map, nio, string, string, stringbuilder, treeset, util

The Lucene SynonymMap.java source code

package org.apache.lucene.wordnet;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

/**
 * Loads the <a target="_blank" 
 * href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file 
 * into a thread-safe main-memory hash map that can be used for fast
 * high-frequency lookups of synonyms for any given (lowercase) word string.
 * <p>
 * There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).
 * There does not necessarily hold: A -> B, B -> C then A -> C.
 * <p>
 * Loading typically takes some 1.5 secs, so should be done only once per
 * (server) program execution, using a singleton pattern. Once loaded, a
 * synonym lookup via {@link #getSynonyms(String)}takes constant time O(1).
 * A loaded default synonym map consumes about 10 MB main memory.
 * An instance is immutable, hence thread-safe.
 * <p>
 * This implementation borrows some ideas from the Lucene Syns2Index demo that 
 * Dave Spencer originally contributed to Lucene. Dave's approach
 * involved a persistent Lucene index which is suitable for occasional
 * lookups or very large synonym tables, but considered unsuitable for 
 * high-frequency lookups of medium size synonym tables.
 * <p>
 * Example Usage:
 * <pre class="prettyprint">
 * String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
 * SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
 * for (int i = 0; i < words.length; i++) {
 *     String[] synonyms = map.getSynonyms(words[i]);
 *     System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
 * }
 * </pre>
 * <b/>
 * Example output:
 * <pre class="prettyprint">
 * hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
 * woods:[forest, wood]
 * forest:[afforest, timber, timberland, wood, woodland, woods]
 * wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
 * xxxx:[]
 * </pre>
 *
 * <p>
 * <b>See also:
* <a target="_blank" * href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb * man page </a>
* <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site
*/ public class SynonymMap { /** the index data; Map<String word, String[] synonyms> */ private final HashMap<String,String[]> table; private static final String[] EMPTY = new String[0]; private static final boolean DEBUG = false; /** * Constructs an instance, loading WordNet synonym data from the given input * stream. Finally closes the stream. The words in the stream must be in * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.). * * @param input * the stream to read from (null indicates an empty synonym map) * @throws IOException * if an error occured while reading the stream. */ public SynonymMap(InputStream input) throws IOException { this.table = input == null ? new HashMap<String,String[]>(0) : read(toByteArray(input)); } /** * Returns the synonym set for the given word, sorted ascending. * * @param word * the word to lookup (must be in lowercase). * @return the synonyms; a set of zero or more words, sorted ascending, each * word containing lowercase characters that satisfy * <code>Character.isLetter(). */ public String[] getSynonyms(String word) { String[] synonyms = table.get(word); if (synonyms == null) return EMPTY; String[] copy = new String[synonyms.length]; // copy for guaranteed immutability System.arraycopy(synonyms, 0, copy, 0, synonyms.length); return copy; } /** * Returns a String representation of the index data for debugging purposes. * * @return a String representation */ @Override public String toString() { StringBuilder buf = new StringBuilder(); Iterator<String> iter = new TreeMap(table).keySet().iterator(); int count = 0; int f0 = 0; int f1 = 0; int f2 = 0; int f3 = 0; while (iter.hasNext()) { String word = iter.next(); buf.append(word + ":"); String[] synonyms = getSynonyms(word); buf.append(Arrays.asList(synonyms)); buf.append("\n"); count += synonyms.length; if (synonyms.length == 0) f0++; if (synonyms.length == 1) f1++; if (synonyms.length == 2) f2++; if (synonyms.length == 3) f3++; } buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3); return buf.toString(); } /** * Analyzes/transforms the given word on input stream loading. This default implementation simply * lowercases the word. Override this method with a custom stemming * algorithm or similar, if desired. * * @param word * the word to analyze * @return the same word, or a different word (or null to indicate that the * word should be ignored) */ protected String analyze(String word) { return word.toLowerCase(); } protected boolean isValid(String str) { for (int i=str.length(); --i >= 0; ) { if (!Character.isLetter(str.charAt(i))) return false; } return true; } private HashMap<String,String[]> read(byte[] data) { int WORDS = (int) (76401 / 0.7); // presizing int GROUPS = (int) (88022 / 0.7); // presizing HashMap<String,ArrayList word2Groups = new HashMap>(WORDS); // Map HashMap<Integer,ArrayList group2Words = new HashMap>(GROUPS); // Map HashMap<String,String> internedWords = new HashMap(WORDS);// Map Charset charset = Charset.forName("UTF-8"); int lastNum = -1; Integer lastGroup = null; int len = data.length; int i=0; while (i < len) { // until EOF /* Part A: Parse a line */ // scan to beginning of group while (i < len && data[i] != '(') i++; if (i >= len) break; // EOF i++; // parse group int num = 0; while (i < len && data[i] != ',') { num = 10*num + (data[i] - 48); i++; } i++; // if (DEBUG) System.err.println("num="+ num); // scan to beginning of word while (i < len && data[i] != '\'') i++; i++; // scan to end of word int start = i; do { while (i < len && data[i] != '\'') i++; i++; } while (i < len && data[i] != ','); // word must end with "'," if (i >= len) break; // EOF String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString(); // String word = new String(data, 0, start, i-start-1); // ASCII /* * Part B: ignore phrases (with spaces and hyphens) and * non-alphabetic words, and let user customize word (e.g. do some * stemming) */ if (!isValid(word)) continue; // ignore word = analyze(word); if (word == null || word.length() == 0) continue; // ignore /* Part C: Add (group,word) to tables */ // ensure compact string representation, minimizing memory overhead String w = internedWords.get(word); if (w == null) { word = new String(word); // ensure compact string internedWords.put(word, word); } else { word = w; } Integer group = lastGroup; if (num != lastNum) { group = Integer.valueOf(num); lastGroup = group; lastNum = num; } // add word --> group ArrayList<Integer> groups = word2Groups.get(word); if (groups == null) { groups = new ArrayList<Integer>(1); word2Groups.put(word, groups); } groups.add(group); // add group --> word ArrayList<String> words = group2Words.get(group); if (words == null) { words = new ArrayList<String>(1); group2Words.put(group, words); } words.add(word); } /* Part D: compute index data structure */ HashMap<String,String[]> word2Syns = createIndex(word2Groups, group2Words); /* Part E: minimize memory consumption by a factor 3 (or so) */ // if (true) return word2Syns; word2Groups = null; // help gc //TODO: word2Groups.clear(); would be more appropriate ? group2Words = null; // help gc //TODO: group2Words.clear(); would be more appropriate ? return optimize(word2Syns, internedWords); } private HashMap<String,String[]> createIndex(Map> word2Groups, Map> group2Words) { HashMap<String,String[]> word2Syns = new HashMap(); for (final Map.Entry<String,ArrayList entry : word2Groups.entrySet()) { // for each word ArrayList<Integer> group = entry.getValue(); String word = entry.getKey(); // HashSet synonyms = new HashSet(); TreeSet<String> synonyms = new TreeSet(); for (int i=group.size(); --i >= 0; ) { // for each groupID of word ArrayList<String> words = group2Words.get(group.get(i)); for (int j=words.size(); --j >= 0; ) { // add all words String synonym = words.get(j); // note that w and word are interned if (synonym != word) { // a word is implicitly it's own synonym synonyms.add(synonym); } } } int size = synonyms.size(); if (size > 0) { String[] syns = new String[size]; if (size == 1) syns[0] = synonyms.first(); else synonyms.toArray(syns); // if (syns.length > 1) Arrays.sort(syns); // if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns)); word2Syns.put(word, syns); } } return word2Syns; } private HashMap<String,String[]> optimize(HashMap word2Syns, HashMap internedWords) { if (DEBUG) { System.err.println("before gc"); for (int i=0; i < 10; i++) System.gc(); System.err.println("after gc"); } // collect entries int len = 0; int size = word2Syns.size(); String[][] allSynonyms = new String[size][]; String[] words = new String[size]; Iterator<Map.Entry iter = word2Syns.entrySet().iterator(); for (int j=0; j < size; j++) { Map.Entry<String,String[]> entry = iter.next(); allSynonyms[j] = entry.getValue(); words[j] = entry.getKey(); len += words[j].length(); } // assemble large string containing all words StringBuilder buf = new StringBuilder(len); for (int j=0; j < size; j++) buf.append(words[j]); String allWords = new String(buf.toString()); // ensure compact string across JDK versions buf = null; // intern words at app level via memory-overlaid substrings for (int p=0, j=0; j < size; j++) { String word = words[j]; internedWords.put(word, allWords.substring(p, p + word.length())); p += word.length(); } // replace words with interned words for (int j=0; j < size; j++) { String[] syns = allSynonyms[j]; for (int k=syns.length; --k >= 0; ) { syns[k] = internedWords.get(syns[k]); } word2Syns.remove(words[j]); word2Syns.put(internedWords.get(words[j]), syns); } if (DEBUG) { words = null; allSynonyms = null; internedWords = null; allWords = null; System.err.println("before gc"); for (int i=0; i < 10; i++) System.gc(); System.err.println("after gc"); } return word2Syns; } // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux private static byte[] toByteArray(InputStream input) throws IOException { try { // safe and fast even if input.available() behaves weird or buggy int len = Math.max(256, input.available()); byte[] buffer = new byte[len]; byte[] output = new byte[len]; len = 0; int n; while ((n = input.read(buffer)) >= 0) { if (len + n > output.length) { // grow capacity byte tmp[] = new byte[Math.max(output.length << 1, len + n)]; System.arraycopy(output, 0, tmp, 0, len); System.arraycopy(buffer, 0, tmp, len, n); buffer = output; // use larger buffer for future larger bulk reads output = tmp; } else { System.arraycopy(buffer, 0, output, len, n); } len += n; } if (len == output.length) return output; buffer = null; // help gc buffer = new byte[len]; System.arraycopy(output, 0, buffer, 0, len); return buffer; } finally { input.close(); } } }

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene SynonymMap.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.