|
Lucene example source code file (SynonymMap.java)
This example Lucene source code file (SynonymMap.java) is included in the DevDaily.com
"Java Source Code
Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.
The Lucene SynonymMap.java source code
package org.apache.lucene.wordnet;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
/**
* Loads the <a target="_blank"
* href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file
* into a thread-safe main-memory hash map that can be used for fast
* high-frequency lookups of synonyms for any given (lowercase) word string.
* <p>
* There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).
* There does not necessarily hold: A -> B, B -> C then A -> C.
* <p>
* Loading typically takes some 1.5 secs, so should be done only once per
* (server) program execution, using a singleton pattern. Once loaded, a
* synonym lookup via {@link #getSynonyms(String)}takes constant time O(1).
* A loaded default synonym map consumes about 10 MB main memory.
* An instance is immutable, hence thread-safe.
* <p>
* This implementation borrows some ideas from the Lucene Syns2Index demo that
* Dave Spencer originally contributed to Lucene. Dave's approach
* involved a persistent Lucene index which is suitable for occasional
* lookups or very large synonym tables, but considered unsuitable for
* high-frequency lookups of medium size synonym tables.
* <p>
* Example Usage:
* <pre class="prettyprint">
* String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
* SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
* for (int i = 0; i < words.length; i++) {
* String[] synonyms = map.getSynonyms(words[i]);
* System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
* }
* </pre>
* <b/>
* Example output:
* <pre class="prettyprint">
* hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
* woods:[forest, wood]
* forest:[afforest, timber, timberland, wood, woodland, woods]
* wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
* xxxx:[]
* </pre>
*
* <p>
* <b>See also:
* <a target="_blank"
* href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb
* man page </a>
* <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site
*/
public class SynonymMap {
/** the index data; Map<String word, String[] synonyms> */
private final HashMap<String,String[]> table;
private static final String[] EMPTY = new String[0];
private static final boolean DEBUG = false;
/**
* Constructs an instance, loading WordNet synonym data from the given input
* stream. Finally closes the stream. The words in the stream must be in
* UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
*
* @param input
* the stream to read from (null indicates an empty synonym map)
* @throws IOException
* if an error occured while reading the stream.
*/
public SynonymMap(InputStream input) throws IOException {
this.table = input == null ? new HashMap<String,String[]>(0) : read(toByteArray(input));
}
/**
* Returns the synonym set for the given word, sorted ascending.
*
* @param word
* the word to lookup (must be in lowercase).
* @return the synonyms; a set of zero or more words, sorted ascending, each
* word containing lowercase characters that satisfy
* <code>Character.isLetter().
*/
public String[] getSynonyms(String word) {
String[] synonyms = table.get(word);
if (synonyms == null) return EMPTY;
String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
return copy;
}
/**
* Returns a String representation of the index data for debugging purposes.
*
* @return a String representation
*/
@Override
public String toString() {
StringBuilder buf = new StringBuilder();
Iterator<String> iter = new TreeMap(table).keySet().iterator();
int count = 0;
int f0 = 0;
int f1 = 0;
int f2 = 0;
int f3 = 0;
while (iter.hasNext()) {
String word = iter.next();
buf.append(word + ":");
String[] synonyms = getSynonyms(word);
buf.append(Arrays.asList(synonyms));
buf.append("\n");
count += synonyms.length;
if (synonyms.length == 0) f0++;
if (synonyms.length == 1) f1++;
if (synonyms.length == 2) f2++;
if (synonyms.length == 3) f3++;
}
buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
return buf.toString();
}
/**
* Analyzes/transforms the given word on input stream loading. This default implementation simply
* lowercases the word. Override this method with a custom stemming
* algorithm or similar, if desired.
*
* @param word
* the word to analyze
* @return the same word, or a different word (or null to indicate that the
* word should be ignored)
*/
protected String analyze(String word) {
return word.toLowerCase();
}
protected boolean isValid(String str) {
for (int i=str.length(); --i >= 0; ) {
if (!Character.isLetter(str.charAt(i))) return false;
}
return true;
}
private HashMap<String,String[]> read(byte[] data) {
int WORDS = (int) (76401 / 0.7); // presizing
int GROUPS = (int) (88022 / 0.7); // presizing
HashMap<String,ArrayList word2Groups = new HashMap>(WORDS); // Map
HashMap<Integer,ArrayList group2Words = new HashMap>(GROUPS); // Map
HashMap<String,String> internedWords = new HashMap(WORDS);// Map
Charset charset = Charset.forName("UTF-8");
int lastNum = -1;
Integer lastGroup = null;
int len = data.length;
int i=0;
while (i < len) { // until EOF
/* Part A: Parse a line */
// scan to beginning of group
while (i < len && data[i] != '(') i++;
if (i >= len) break; // EOF
i++;
// parse group
int num = 0;
while (i < len && data[i] != ',') {
num = 10*num + (data[i] - 48);
i++;
}
i++;
// if (DEBUG) System.err.println("num="+ num);
// scan to beginning of word
while (i < len && data[i] != '\'') i++;
i++;
// scan to end of word
int start = i;
do {
while (i < len && data[i] != '\'') i++;
i++;
} while (i < len && data[i] != ','); // word must end with "',"
if (i >= len) break; // EOF
String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
// String word = new String(data, 0, start, i-start-1); // ASCII
/*
* Part B: ignore phrases (with spaces and hyphens) and
* non-alphabetic words, and let user customize word (e.g. do some
* stemming)
*/
if (!isValid(word)) continue; // ignore
word = analyze(word);
if (word == null || word.length() == 0) continue; // ignore
/* Part C: Add (group,word) to tables */
// ensure compact string representation, minimizing memory overhead
String w = internedWords.get(word);
if (w == null) {
word = new String(word); // ensure compact string
internedWords.put(word, word);
} else {
word = w;
}
Integer group = lastGroup;
if (num != lastNum) {
group = Integer.valueOf(num);
lastGroup = group;
lastNum = num;
}
// add word --> group
ArrayList<Integer> groups = word2Groups.get(word);
if (groups == null) {
groups = new ArrayList<Integer>(1);
word2Groups.put(word, groups);
}
groups.add(group);
// add group --> word
ArrayList<String> words = group2Words.get(group);
if (words == null) {
words = new ArrayList<String>(1);
group2Words.put(group, words);
}
words.add(word);
}
/* Part D: compute index data structure */
HashMap<String,String[]> word2Syns = createIndex(word2Groups, group2Words);
/* Part E: minimize memory consumption by a factor 3 (or so) */
// if (true) return word2Syns;
word2Groups = null; // help gc
//TODO: word2Groups.clear(); would be more appropriate ?
group2Words = null; // help gc
//TODO: group2Words.clear(); would be more appropriate ?
return optimize(word2Syns, internedWords);
}
private HashMap<String,String[]> createIndex(Map> word2Groups, Map> group2Words) {
HashMap<String,String[]> word2Syns = new HashMap();
for (final Map.Entry<String,ArrayList entry : word2Groups.entrySet()) { // for each word
ArrayList<Integer> group = entry.getValue();
String word = entry.getKey();
// HashSet synonyms = new HashSet();
TreeSet<String> synonyms = new TreeSet();
for (int i=group.size(); --i >= 0; ) { // for each groupID of word
ArrayList<String> words = group2Words.get(group.get(i));
for (int j=words.size(); --j >= 0; ) { // add all words
String synonym = words.get(j); // note that w and word are interned
if (synonym != word) { // a word is implicitly it's own synonym
synonyms.add(synonym);
}
}
}
int size = synonyms.size();
if (size > 0) {
String[] syns = new String[size];
if (size == 1)
syns[0] = synonyms.first();
else
synonyms.toArray(syns);
// if (syns.length > 1) Arrays.sort(syns);
// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
word2Syns.put(word, syns);
}
}
return word2Syns;
}
private HashMap<String,String[]> optimize(HashMap word2Syns, HashMap internedWords) {
if (DEBUG) {
System.err.println("before gc");
for (int i=0; i < 10; i++) System.gc();
System.err.println("after gc");
}
// collect entries
int len = 0;
int size = word2Syns.size();
String[][] allSynonyms = new String[size][];
String[] words = new String[size];
Iterator<Map.Entry iter = word2Syns.entrySet().iterator();
for (int j=0; j < size; j++) {
Map.Entry<String,String[]> entry = iter.next();
allSynonyms[j] = entry.getValue();
words[j] = entry.getKey();
len += words[j].length();
}
// assemble large string containing all words
StringBuilder buf = new StringBuilder(len);
for (int j=0; j < size; j++) buf.append(words[j]);
String allWords = new String(buf.toString()); // ensure compact string across JDK versions
buf = null;
// intern words at app level via memory-overlaid substrings
for (int p=0, j=0; j < size; j++) {
String word = words[j];
internedWords.put(word, allWords.substring(p, p + word.length()));
p += word.length();
}
// replace words with interned words
for (int j=0; j < size; j++) {
String[] syns = allSynonyms[j];
for (int k=syns.length; --k >= 0; ) {
syns[k] = internedWords.get(syns[k]);
}
word2Syns.remove(words[j]);
word2Syns.put(internedWords.get(words[j]), syns);
}
if (DEBUG) {
words = null;
allSynonyms = null;
internedWords = null;
allWords = null;
System.err.println("before gc");
for (int i=0; i < 10; i++) System.gc();
System.err.println("after gc");
}
return word2Syns;
}
// the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
private static byte[] toByteArray(InputStream input) throws IOException {
try {
// safe and fast even if input.available() behaves weird or buggy
int len = Math.max(256, input.available());
byte[] buffer = new byte[len];
byte[] output = new byte[len];
len = 0;
int n;
while ((n = input.read(buffer)) >= 0) {
if (len + n > output.length) { // grow capacity
byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
System.arraycopy(output, 0, tmp, 0, len);
System.arraycopy(buffer, 0, tmp, len, n);
buffer = output; // use larger buffer for future larger bulk reads
output = tmp;
} else {
System.arraycopy(buffer, 0, output, len, n);
}
len += n;
}
if (len == output.length) return output;
buffer = null; // help gc
buffer = new byte[len];
System.arraycopy(output, 0, buffer, 0, len);
return buffer;
} finally {
input.close();
}
}
}
Other Lucene examples (source code examples)
Here is a short list of links related to this Lucene SynonymMap.java source code file:
|