This example Lucene source code file (FSTLookup.java) is included in the DevDaily.com
"Java Source Code
Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.
package org.apache.lucene.search.suggest.fst;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.NoOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
/**
* Finite state automata based implementation of {@link Lookup} query
* suggestion/ autocomplete interface.
*
* <h2>Implementation details
*
* <p>The construction step in {@link #build(TermFreqIterator)} works as follows:
* <ul>
* <li>A set of input terms (String) and weights (float) is given.
* <li>The range of weights is determined and then all weights are discretized into a fixed set
* of values ({@link #buckets}).
* Note that this means that minor changes in weights may be lost during automaton construction.
* In general, this is not a big problem because the "priorities" of completions can be split
* into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
* If you need exact, fine-grained weights, use {@link TSTLookup} instead.<li>
* <li>All terms in the input are preprended with a synthetic pseudo-character being the weight
* of that term. For example a term <code>abc with a discretized weight equal '1' would
* become <code>1abc.
* <li>The terms are sorted by their raw value of utf16 character values (including the synthetic
* term in front).</li>
* <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
* arcs labeled with all possible weights. We cache all these arcs, highest-weight first.</li>
* </ul>
*
* <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
* <ul>
* <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
* starting with the highest one, we descend along the path of the input key. If the key is not
* a prefix of a sequence in the automaton (path ends prematurely), we exit immediately.
* No completions.
* <li>Otherwise, we have found an internal automaton node that ends the key. The entire
* subautomaton (all paths) starting from this node form the key's completions.</b> We start
* the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
* suggestion to the list of results (the weight of this suggestion is constant and equal to the
* root path we started from). The tricky part is that because automaton edges are sorted and
* we scan depth-first, we can terminate the entire procedure as soon as we collect enough
* suggestions the user requested.
* <li>In case the number of suggestions collected in the step above is still insufficient,
* we proceed to the next (smaller) weight leaving the root node and repeat the same
* algorithm again.
* </li>
* </ul>
*
* <h2>Runtime behavior and performance characteristic
*
* <p>The algorithm described above is optimized for finding suggestions to short prefixes
* in a top-weights-first order. This is probably the most common use case: it allows
* presenting suggestions early and sorts them by the global frequency (and then alphabetically).
*
* <p>If there is an exact match in the automaton, it is returned first on the results
* list (even with by-weight sorting).
*
* <p>Note that the maximum lookup time for any prefix
* is the time of descending to the subtree, plus traversal of the subtree up to the number
* of requested suggestions (because they are already presorted by weight on the root level
* and alphabetically at any node level).
*
* <p>To order alphabetically only (no ordering by priorities), use identical term weights
* for all terms. Alphabetical suggestions are returned even if non-constant weights are
* used, but the algorithm for doing this is suboptimal.
*
* <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
* nothing else.
*/
public class FSTLookup extends Lookup {
public FSTLookup() {
this(10, true);
}
public FSTLookup(int buckets, boolean exactMatchFirst) {
this.buckets = buckets;
this.exactMatchFirst = exactMatchFirst;
}
/** A structure for a single entry (for sorting/ preprocessing). */
private static class Entry {
char [] term;
float weight;
public Entry(char [] term, float freq) {
this.term = term;
this.weight = freq;
}
}
/** Serialized automaton file name (storage). */
public static final String FILENAME = "fst.dat";
/** An empty result. */
private static final List<LookupResult> EMPTY_RESULT = Collections.emptyList();
/**
* The number of separate buckets for weights (discretization). The more buckets,
* the more fine-grained term weights (priorities) can be assigned. The speed of lookup
* will not decrease for prefixes which have highly-weighted completions (because these
* are filled-in first), but will decrease significantly for low-weighted terms (but
* these should be infrequent, so it is all right).
*
* <p>The number of buckets must be within [1, 255] range.
*/
private final int buckets;
/**
* If <code>true, exact suggestions are returned first, even if they are prefixes
* of other strings in the automaton (possibly with larger weights).
*/
private final boolean exactMatchFirst;
/**
* Finite state automaton encoding all the lookup terms. See class
* notes for details.
*/
private FST<Object> automaton;
/**
* An array of arcs leaving the root automaton state and encoding weights of all
* completions in their sub-trees.
*/
private Arc<Object> [] rootArcs;
/* */
@Override
public void build(TermFreqIterator tfit) throws IOException {
// Buffer the input because we will need it twice: for calculating
// weights distribution and for the actual automata building.
List<Entry> entries = new ArrayList();
while (tfit.hasNext()) {
String term = tfit.next();
char [] termChars = new char [term.length() + 1]; // add padding for weight.
for (int i = 0; i < term.length(); i++)
termChars[i + 1] = term.charAt(i);
entries.add(new Entry(termChars, tfit.freq()));
}
// Distribute weights into at most N buckets. This is a form of discretization to
// limit the number of possible weights so that they can be efficiently encoded in the
// automaton.
//
// It is assumed the distribution of weights is _linear_ so proportional division
// of [min, max] range will be enough here. Other approaches could be to sort
// weights and divide into proportional ranges.
if (entries.size() > 0) {
redistributeWeightsProportionalMinMax(entries, buckets);
encodeWeightPrefix(entries);
}
// Build the automaton (includes input sorting) and cache root arcs in order from the highest,
// to the lowest weight.
this.automaton = buildAutomaton(entries);
cacheRootArcs();
}
/**
* Cache the root node's output arcs starting with completions with the highest weights.
*/
@SuppressWarnings("unchecked")
private void cacheRootArcs() throws IOException {
if (automaton != null) {
List<Arc