alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (TestFSTs.java)

This example Lucene source code file (TestFSTs.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

bytesref, bytesref, fst, fsttester, intsref, intsref, io, ioexception, no_output, positiveintoutputs, string, string, t, util, verbose, verbose

The Lucene TestFSTs.java source code

package org.apache.lucene.util.fst;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.*;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.fst.FST.Arc;

public class TestFSTs extends LuceneTestCase {

  private MockDirectoryWrapper dir;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    dir = newDirectory();
    dir.setPreventDoubleWrite(false);
  }

  @Override
  public void tearDown() throws Exception {
    dir.close();
    super.tearDown();
  }

  private static BytesRef toBytesRef(IntsRef ir) {
    BytesRef br = new BytesRef(ir.length);
    for(int i=0;i<ir.length;i++) {
      int x = ir.ints[ir.offset+i];
      assert x >= 0 && x <= 255;
      br.bytes[i] = (byte) x;
    }
    br.length = ir.length;
    return br;
  }

  private static IntsRef toIntsRef(String s, int inputMode) {
    return toIntsRef(s, inputMode, new IntsRef(10));
  }

  private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
    if (inputMode == 0) {
      // utf8
      return toIntsRef(new BytesRef(s), ir);
    } else {
      // utf32
      return toIntsRefUTF32(s, ir);
    }
  }

  private static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
    final int charLength = s.length();
    int charIdx = 0;
    int intIdx = 0;
    while(charIdx < charLength) {
      if (intIdx == ir.ints.length) {
        ir.grow(intIdx+1);
      }
      final int utf32 = s.codePointAt(charIdx);
      ir.ints[intIdx] = utf32;
      charIdx += Character.charCount(utf32);
      intIdx++;
    }
    ir.length = intIdx;
    return ir;
  }

  private static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
    if (br.length > ir.ints.length) {
      ir.grow(br.length);
    }
    for(int i=0;i<br.length;i++) {
      ir.ints[i] = br.bytes[br.offset+i]&0xFF;
    }
    ir.length = br.length;
    return ir;
  }

  public void testBasicFSA() throws IOException {
    String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
    String[] strings2 = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"};
    IntsRef[] terms = new IntsRef[strings.length];
    IntsRef[] terms2 = new IntsRef[strings2.length];
    for(int inputMode=0;inputMode<2;inputMode++) {
      if (VERBOSE) {
        System.out.println("TEST: inputMode=" + inputModeToString(inputMode));
      }

      for(int idx=0;idx<strings.length;idx++) {
        terms[idx] = toIntsRef(strings[idx], inputMode);
      }
      for(int idx=0;idx<strings2.length;idx++) {
        terms2[idx] = toIntsRef(strings2[idx], inputMode);
      }
      Arrays.sort(terms2);

      doTest(inputMode, terms);
    
      // Test pre-determined FST sizes to make sure we haven't lost minimality (at least on this trivial set of terms):

      // FSA
      {
        final Outputs<Object> outputs = NoOutputs.getSingleton();
        final Object NO_OUTPUT = outputs.getNoOutput();      
        final List<FSTTester.InputOutput pairs = new ArrayList>(terms2.length);
        for(IntsRef term : terms2) {
          pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
        }
        FST<Object> fst = new FSTTester(random, dir, inputMode, pairs, outputs).doTest(0, 0);
        assertNotNull(fst);
        assertEquals(22, fst.getNodeCount());
        assertEquals(27, fst.getArcCount());
      }

      // FST ord pos int
      {
        final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
        final List<FSTTester.InputOutput pairs = new ArrayList>(terms2.length);
        for(int idx=0;idx<terms2.length;idx++) {
          pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
        }
        final FST<Long> fst = new FSTTester(random, dir, inputMode, pairs, outputs).doTest(0, 0);
        assertNotNull(fst);
        assertEquals(22, fst.getNodeCount());
        assertEquals(27, fst.getArcCount());
      }

      // FST byte sequence ord
      {
        final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
        final BytesRef NO_OUTPUT = outputs.getNoOutput();      
        final List<FSTTester.InputOutput pairs = new ArrayList>(terms2.length);
        for(int idx=0;idx<terms2.length;idx++) {
          final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
          pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output));
        }
        final FST<BytesRef> fst = new FSTTester(random, dir, inputMode, pairs, outputs).doTest(0, 0);
        assertNotNull(fst);
        assertEquals(24, fst.getNodeCount());
        assertEquals(30, fst.getArcCount());
      }
    }
  }

  private static String simpleRandomString(Random r) {
    final int end = r.nextInt(10);
    if (end == 0) {
      // allow 0 length
      return "";
    }
    final char[] buffer = new char[end];
    for (int i = 0; i < end; i++) {
      buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
    }
    return new String(buffer, 0, end);
  }

  // given set of terms, test the different outputs for them
  private void doTest(int inputMode, IntsRef[] terms) throws IOException {
    Arrays.sort(terms);

    // NoOutputs (simple FSA)
    {
      final Outputs<Object> outputs = NoOutputs.getSingleton();
      final Object NO_OUTPUT = outputs.getNoOutput();      
      final List<FSTTester.InputOutput pairs = new ArrayList>(terms.length);
      for(IntsRef term : terms) {
        pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
      }
      new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
    }

    // PositiveIntOutput (ord)
    {
      final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
      final List<FSTTester.InputOutput pairs = new ArrayList>(terms.length);
      for(int idx=0;idx<terms.length;idx++) {
        pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx)));
      }
      new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
    }

    // PositiveIntOutput (random monotonically increasing positive number)
    {
      final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
      final List<FSTTester.InputOutput pairs = new ArrayList>(terms.length);
      long lastOutput = 0;
      for(int idx=0;idx<terms.length;idx++) {
        final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
        lastOutput = value;
        pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value)));
      }
      new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
    }

    // PositiveIntOutput (random positive number)
    {
      final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
      final List<FSTTester.InputOutput pairs = new ArrayList>(terms.length);
      for(int idx=0;idx<terms.length;idx++) {
        pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE));
      }
      new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
    }

    // Pair<ord, (random monotonically increasing positive number>
    {
      final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(random.nextBoolean());
      final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(random.nextBoolean());
      final PairOutputs<Long,Long> outputs = new PairOutputs(o1, o2);
      final List<FSTTester.InputOutput> pairs = new ArrayList>>(terms.length);
      long lastOutput = 0;
      for(int idx=0;idx<terms.length;idx++) {
        final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
        lastOutput = value;
        pairs.add(new FSTTester.InputOutput<PairOutputs.Pair(terms[idx],
                                                                         outputs.get(o1.get(idx),
                                                                                     o2.get(value))));
      }
      new FSTTester<PairOutputs.Pair(random, dir, inputMode, pairs, outputs).doTest();
    }

    // Sequence-of-bytes
    {
      final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
      final BytesRef NO_OUTPUT = outputs.getNoOutput();      
      final List<FSTTester.InputOutput pairs = new ArrayList>(terms.length);
      for(int idx=0;idx<terms.length;idx++) {
        final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
        pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output));
      }
      new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest();
    }

    // Sequence-of-ints
    {
      final IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
      final List<FSTTester.InputOutput pairs = new ArrayList>(terms.length);
      for(int idx=0;idx<terms.length;idx++) {
        final String s = Integer.toString(idx);
        final IntsRef output = new IntsRef(s.length());
        output.length = s.length();
        for(int idx2=0;idx2<output.length;idx2++) {
          output.ints[idx2] = s.charAt(idx2);
        }
        pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output));
      }
      new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs).doTest();
    }

    // Up to two positive ints, shared, generally but not
    // monotonically increasing
    {
      if (VERBOSE) {
        System.out.println("TEST: now test UpToTwoPositiveIntOutputs");
      }
      final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true);
      final List<FSTTester.InputOutput pairs = new ArrayList>(terms.length);
      long lastOutput = 0;
      for(int idx=0;idx<terms.length;idx++) {
        // Sometimes go backwards
        long value = lastOutput + _TestUtil.nextInt(random, -100, 1000);
        while(value < 0) {
          value = lastOutput + _TestUtil.nextInt(random, -100, 1000);
        }
        final Object output;
        if (random.nextInt(5) == 3) {
          long value2 = lastOutput + _TestUtil.nextInt(random, -100, 1000);
          while(value2 < 0) {
            value2 = lastOutput + _TestUtil.nextInt(random, -100, 1000);
          }
          output = outputs.get(value, value2);
        } else {
          output = outputs.get(value);
        }
        pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
      }
      new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
    }
  }

  private static class FSTTester<T> {

    final Random random;
    final List<InputOutput pairs;
    final int inputMode;
    final Outputs<T> outputs;
    final Directory dir;

    public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput pairs, Outputs outputs) {
      this.random = random;
      this.dir = dir;
      this.inputMode = inputMode;
      this.pairs = pairs;
      this.outputs = outputs;
    }

    private static class InputOutput<T> implements Comparable> {
      public final IntsRef input;
      public final T output;

      public InputOutput(IntsRef input, T output) {
        this.input = input;
        this.output = output;
      }

      public int compareTo(InputOutput<T> other) {
        if (other instanceof InputOutput) {
          return input.compareTo((other).input);
        } else {
          throw new IllegalArgumentException();
        }
      }
    }

    public void doTest() throws IOException {
      // no pruning
      doTest(0, 0);

      if (!(outputs instanceof UpToTwoPositiveIntOutputs)) {
        // simple pruning
        doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0);
        
        // leafy pruning
        doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()));
      }
    }

    // runs the term, returning the output, or null if term
    // isn't accepted.  if prefixLength is non-null it must be
    // length 1 int array; prefixLength[0] is set to the length
    // of the term prefix that matches
    private T run(FST<T> fst, IntsRef term, int[] prefixLength) throws IOException {
      assert prefixLength == null || prefixLength.length == 1;
      final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc());
      final T NO_OUTPUT = fst.outputs.getNoOutput();
      T output = NO_OUTPUT;

      for(int i=0;i<=term.length;i++) {
        final int label;
        if (i == term.length) {
          label = FST.END_LABEL;
        } else {
          label = term.ints[term.offset+i];
        }
        //System.out.println("   loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
        if (fst.findTargetArc(label, arc, arc) == null) {
          if (prefixLength != null) {
            prefixLength[0] = i;
            return output;
          } else {
            return null;
          }
        }
        output = fst.outputs.add(output, arc.output);
      }

      if (prefixLength != null) {
        prefixLength[0] = term.length;
      }

      return output;
    }

    private T randomAcceptedWord(FST<T> fst, IntsRef in) throws IOException {
      FST.Arc<T> arc = fst.getFirstArc(new FST.Arc());

      final List<FST.Arc arcs = new ArrayList>();
      in.length = 0;
      in.offset = 0;
      final T NO_OUTPUT = fst.outputs.getNoOutput();
      T output = NO_OUTPUT;

      while(true) {
        // read all arcs:
        fst.readFirstTargetArc(arc, arc);
        arcs.add(new FST.Arc<T>().copyFrom(arc));
        while(!arc.isLast()) {
          fst.readNextArc(arc);
          arcs.add(new FST.Arc<T>().copyFrom(arc));
        }
      
        // pick one
        arc = arcs.get(random.nextInt(arcs.size()));
        arcs.clear();

        // accumulate output
        output = fst.outputs.add(output, arc.output);

        // append label
        if (arc.label == FST.END_LABEL) {
          break;
        }

        if (in.ints.length == in.length) {
          in.grow(1+in.length);
        }
        in.ints[in.length++] = arc.label;
      }

      return output;
    }


    FST<T> doTest(int prune1, int prune2) throws IOException {
      if (VERBOSE) {
        System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2);
      }

      final Builder<T> builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
                                                prune1, prune2,
                                                prune1==0 && prune2==0, outputs);

      for(InputOutput<T> pair : pairs) {
        if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
          final UpToTwoPositiveIntOutputs _outputs = (UpToTwoPositiveIntOutputs) outputs;
          final UpToTwoPositiveIntOutputs.TwoLongs twoLongs = (UpToTwoPositiveIntOutputs.TwoLongs) pair.output;
          @SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder) builder;
          builderObject.add(pair.input, _outputs.get(twoLongs.first));
          builderObject.add(pair.input, _outputs.get(twoLongs.second));
        } else {
          builder.add(pair.input, pair.output);
        }
      }
      FST<T> fst = builder.finish();

      if (random.nextBoolean() && fst != null) {
        IndexOutput out = dir.createOutput("fst.bin");
        fst.save(out);
        out.close();
        IndexInput in = dir.openInput("fst.bin");
        try {
          fst = new FST<T>(in, outputs);
        } finally {
          in.close();
          dir.deleteFile("fst.bin");
        }
      }

      if (VERBOSE && pairs.size() <= 20 && fst != null) {
        Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
        Util.toDot(fst, w, false, false);
        w.close();
        System.out.println("SAVED out.dot");
      }

      if (VERBOSE) {
        if (fst == null) {
          System.out.println("  fst has 0 nodes (fully pruned)");
        } else {
          System.out.println("  fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs");
        }
      }

      if (prune1 == 0 && prune2 == 0) {
        verifyUnPruned(inputMode, fst);
      } else {
        verifyPruned(inputMode, fst, prune1, prune2);
      }

      return fst;
    }

    // FST is complete
    private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {

      if (pairs.size() == 0) {
        assertNull(fst);
        return;
      }

      if (VERBOSE) {
        System.out.println("TEST: now verify " + pairs.size() + " terms");
        for(InputOutput<T> pair : pairs) {
          assertNotNull(pair);
          assertNotNull(pair.input);
          assertNotNull(pair.output);
          System.out.println("  " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
        }
      }

      assertNotNull(fst);

      // visit valid paris in order -- make sure all words
      // are accepted, and FSTEnum's next() steps through
      // them correctly
      if (VERBOSE) {
        System.out.println("TEST: check valid terms/next()");
      }
      {
        IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum(fst);
        for(InputOutput<T> pair : pairs) {
          IntsRef term = pair.input;
          if (VERBOSE) {
            System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
          }
          Object output = run(fst, term, null);

          assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
          assertEquals(pair.output, output);

          // verify enum's next
          IntsRefFSTEnum.InputOutput<T> t = fstEnum.next();
          assertNotNull(t);
          assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input);
          assertEquals(pair.output, t.output);
        }
        assertNull(fstEnum.next());
      }

      final Map<IntsRef,T> termsMap = new HashMap();
      for(InputOutput<T> pair : pairs) {
        termsMap.put(pair.input, pair.output);
      }

      // find random matching word and make sure it's valid
      if (VERBOSE) {
        System.out.println("TEST: verify random accepted terms");
      }
      final IntsRef scratch = new IntsRef(10);
      int num = atLeast(500);
      for(int iter=0;iter<num;iter++) {
        T output = randomAcceptedWord(fst, scratch);
        assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
        assertEquals(termsMap.get(scratch), output);
      }
    
      // test IntsRefFSTEnum.seek:
      if (VERBOSE) {
        System.out.println("TEST: verify seek");
      }
      IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum(fst);
      num = atLeast(100);
      for(int iter=0;iter<num;iter++) {
        if (VERBOSE) {
          System.out.println("TEST: iter=" + iter);
        }
        if (random.nextBoolean()) {
          // seek to term that doesn't exist:
          while(true) {
            final IntsRef term = toIntsRef(getRandomString(), inputMode);
            int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
            if (pos < 0) {
              pos = -(pos+1);
              // ok doesn't exist
              //System.out.println("  seek " + inputToString(inputMode, term));
              final IntsRefFSTEnum.InputOutput<T> seekResult;
              if (random.nextBoolean()) {
                if (VERBOSE) {
                  System.out.println("  do non-exist seekFloor term=" + inputToString(inputMode, term));
                }
                seekResult = fstEnum.seekFloor(term);
                pos--;
              } else {
                if (VERBOSE) {
                  System.out.println("  do non-exist seekCeil term=" + inputToString(inputMode, term));
                }
                seekResult = fstEnum.seekCeil(term);
              }

              if (pos != -1 && pos < pairs.size()) {
                //System.out.println("    got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output));
                assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult);
                if (VERBOSE) {
                  System.out.println("    got " + inputToString(inputMode, seekResult.input));
                }
                assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input);
                assertEquals(pairs.get(pos).output, seekResult.output);
              } else {
                // seeked before start or beyond end
                //System.out.println("seek=" + seekTerm);
                assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult);
                if (VERBOSE) {
                  System.out.println("    got null");
                }
              }

              break;
            }
          }
        } else {
          // seek to term that does exist:
          InputOutput<T> pair = pairs.get(random.nextInt(pairs.size()));
          final IntsRefFSTEnum.InputOutput<T> seekResult;
          if (random.nextBoolean()) {
            if (VERBOSE) {
              System.out.println("  do exists seekFloor " + inputToString(inputMode, pair.input));
            }
            seekResult = fstEnum.seekFloor(pair.input);
          } else {
            if (VERBOSE) {
              System.out.println("  do exists seekCeil " + inputToString(inputMode, pair.input));
            }
            seekResult = fstEnum.seekCeil(pair.input);
          }
          assertNotNull(seekResult);
          assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input);
          assertEquals(pair.output, seekResult.output);
        }
      }

      if (VERBOSE) {
        System.out.println("TEST: mixed next/seek");
      }

      // test mixed next/seek
      num = atLeast(100);
      for(int iter=0;iter<num;iter++) {
        if (VERBOSE) {
          System.out.println("TEST: iter " + iter);
        }
        // reset:
        fstEnum = new IntsRefFSTEnum<T>(fst);
        int upto = -1;
        while(true) {
          boolean isDone = false;
          if (upto == pairs.size()-1 || random.nextBoolean()) {
            // next
            upto++;
            if (VERBOSE) {
              System.out.println("  do next");
            }
            isDone = fstEnum.next() == null;
          } else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) {
            int attempt = 0;
            for(;attempt<10;attempt++) {
              IntsRef term = toIntsRef(getRandomString(), inputMode);
              if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) {
                int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
                assert pos < 0;
                upto = -(pos+1);

                if (random.nextBoolean()) {
                  upto--;
                  assertTrue(upto != -1);
                  if (VERBOSE) {
                    System.out.println("  do non-exist seekFloor(" + inputToString(inputMode, term) + ")");
                  }
                  isDone = fstEnum.seekFloor(term) == null;
                } else {
                  if (VERBOSE) {
                    System.out.println("  do non-exist seekCeil(" + inputToString(inputMode, term) + ")");
                  }
                  isDone = fstEnum.seekCeil(term) == null;
                }

                break;
              }
            }
            if (attempt == 10) {
              continue;
            }
            
          } else {
            final int inc = random.nextInt(pairs.size() - upto - 1);
            upto += inc;
            if (upto == -1) {
              upto = 0;
            }

            if (random.nextBoolean()) {
              if (VERBOSE) {
                System.out.println("  do advanceCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")");
              }
              isDone = fstEnum.seekCeil(pairs.get(upto).input) == null;
            } else {
              if (VERBOSE) {
                System.out.println("  do advanceFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")");
              }
              isDone = fstEnum.seekFloor(pairs.get(upto).input) == null;
            }
          }
          if (VERBOSE) {
            if (!isDone) {
              System.out.println("    got " + inputToString(inputMode, fstEnum.current().input));
            } else {
              System.out.println("    got null");
            }
          }

          if (upto == pairs.size()) {
            assertTrue(isDone);
            break;
          } else {
            assertFalse(isDone);
            assertEquals(pairs.get(upto).input, fstEnum.current().input);
            assertEquals(pairs.get(upto).output, fstEnum.current().output);

            /*
            if (upto < pairs.size()-1) {
              int tryCount = 0;
              while(tryCount < 10) {
                final IntsRef t = toIntsRef(getRandomString(), inputMode);
                if (pairs.get(upto).input.compareTo(t) < 0) {
                  final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
                  if (VERBOSE) {
                    System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
                  }
                  assertEquals(expected, fstEnum.beforeNext(t));
                  break;
                }
                tryCount++;
              }
            }
            */
          }
        }
      }
    }

    private static class CountMinOutput<T> {
      int count;
      T output;
      T finalOutput;
      boolean isLeaf = true;
      boolean isFinal;
    }

    // FST is pruned
    private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {

      if (VERBOSE) {
        System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
        for(InputOutput<T> pair : pairs) {
          System.out.println("  " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
        }
      }

      // To validate the FST, we brute-force compute all prefixes
      // in the terms, matched to their "common" outputs, prune that
      // set according to the prune thresholds, then assert the FST
      // matches that same set.

      // NOTE: Crazy RAM intensive!!

      //System.out.println("TEST: tally prefixes");

      // build all prefixes
      final Map<IntsRef,CountMinOutput prefixes = new HashMap>();
      final IntsRef scratch = new IntsRef(10);
      for(InputOutput<T> pair: pairs) {
        scratch.copy(pair.input);
        for(int idx=0;idx<=pair.input.length;idx++) {
          scratch.length = idx;
          CountMinOutput<T> cmo = prefixes.get(scratch);
          if (cmo == null) {
            cmo = new CountMinOutput<T>();
            cmo.count = 1;
            cmo.output = pair.output;
            prefixes.put(new IntsRef(scratch), cmo);
          } else {
            cmo.count++;
            cmo.output = outputs.common(cmo.output, pair.output);
          }
          if (idx == pair.input.length) {
            cmo.isFinal = true;
            cmo.finalOutput = cmo.output;
          }
        }
      }

      if (VERBOSE) {
        System.out.println("TEST: now prune");
      }

      // prune 'em
      final Iterator<Map.Entry> it = prefixes.entrySet().iterator();
      while(it.hasNext()) {
        Map.Entry<IntsRef,CountMinOutput ent = it.next();
        final IntsRef prefix = ent.getKey();
        final CountMinOutput<T> cmo = ent.getValue();
        if (VERBOSE) {
          System.out.println("  term=" + inputToString(inputMode, prefix) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
        }
        final boolean keep;
        if (prune1 > 0) {
          keep = cmo.count >= prune1;
        } else {
          assert prune2 > 0;
          if (prune2 > 1 && cmo.count >= prune2) {
            keep = true;
          } else if (prefix.length > 0) {
            // consult our parent
            scratch.length = prefix.length-1;
            System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length);
            final CountMinOutput<T> cmo2 = prefixes.get(scratch);
            //System.out.println("    parent count = " + (cmo2 == null ? -1 : cmo2.count));
            keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
          } else if (cmo.count >= prune2) {
            keep = true;
          } else {
            keep = false;
          }
        }

        if (!keep) {
          it.remove();
          //System.out.println("    remove");
        } else {
          // clear isLeaf for all ancestors
          //System.out.println("    keep");
          scratch.copy(prefix);
          scratch.length--;
          while(scratch.length >= 0) {
            final CountMinOutput<T> cmo2 = prefixes.get(scratch);
            if (cmo2 != null) {
              //System.out.println("    clear isLeaf " + inputToString(inputMode, scratch));
              cmo2.isLeaf = false;
            }
            scratch.length--;
          }
        }
      }

      //System.out.println("TEST: after prune");
      /*
        for(Map.Entry<BytesRef,CountMinOutput> ent : prefixes.entrySet()) {
        System.out.println("  " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
        if (ent.getValue().isFinal) {
        System.out.println("    finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
        }
        }
      */

      if (prefixes.size() <= 1) {
        assertNull(fst);
        return;
      }

      assertNotNull(fst);

      // make sure FST only enums valid prefixes
      if (VERBOSE) {
        System.out.println("TEST: check pruned enum");
      }
      IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum(fst);
      IntsRefFSTEnum.InputOutput<T> current;
      while((current = fstEnum.next()) != null) {
        if (VERBOSE) {
          System.out.println("  fstEnum.next term=" + inputToString(inputMode, current.input) + " output=" + outputs.outputToString(current.output));
        }
        final CountMinOutput cmo = prefixes.get(current.input);
        assertNotNull(cmo);
        assertTrue(cmo.isLeaf || cmo.isFinal);
        //if (cmo.isFinal && !cmo.isLeaf) {
        if (cmo.isFinal) {
          assertEquals(cmo.finalOutput, current.output);
        } else {
          assertEquals(cmo.output, current.output);
        }
      }

      // make sure all non-pruned prefixes are present in the FST
      if (VERBOSE) {
        System.out.println("TEST: verify all prefixes");
      }
      final int[] stopNode = new int[1];
      for(Map.Entry<IntsRef,CountMinOutput ent : prefixes.entrySet()) {
        if (ent.getKey().length > 0) {
          final CountMinOutput<T> cmo = ent.getValue();
          final T output = run(fst, ent.getKey(), stopNode);
          if (VERBOSE) {
            System.out.println("TEST: verify term=" + inputToString(inputMode, ent.getKey()) + " output=" + outputs.outputToString(cmo.output));
          }
          // if (cmo.isFinal && !cmo.isLeaf) {
          if (cmo.isFinal) {
            assertEquals(cmo.finalOutput, output);
          } else {
            assertEquals(cmo.output, output);
          }
          assertEquals(ent.getKey().length, stopNode[0]);
        }
      }
    }
  }

  public void testRandomWords() throws IOException {
    testRandomWords(1000, atLeast(2));
    //testRandomWords(20, 100);
  }

  private String inputModeToString(int mode) {
    if (mode == 0) {
      return "utf8";
    } else {
      return "utf32";
    }
  }

  private void testRandomWords(int maxNumWords, int numIter) throws IOException {
    for(int iter=0;iter<numIter;iter++) {
      if (VERBOSE) {
        System.out.println("\nTEST: iter " + iter);
      }
      for(int inputMode=0;inputMode<2;inputMode++) {
        final int numWords = random.nextInt(maxNumWords+1);
        Set<IntsRef> termsSet = new HashSet();
        IntsRef[] terms = new IntsRef[numWords];
        while(termsSet.size() < numWords) {
          final String term = getRandomString();
          termsSet.add(toIntsRef(term, inputMode));
        }
        doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()]));
      }
    }
  }

  static String getRandomString() {
    final String term;
    if (random.nextBoolean()) {
      term = _TestUtil.randomRealisticUnicodeString(random);
    } else {
      // we want to mix in limited-alphabet symbols so
      // we get more sharing of the nodes given how few
      // terms we are testing...
      term = simpleRandomString(random);
    }
    return term;
  }

  @Nightly
  public void testBigSet() throws IOException {
    testRandomWords(_TestUtil.nextInt(random, 50000, 60000), atLeast(1));
  }

  private static String inputToString(int inputMode, IntsRef term) {
    if (inputMode == 0) {
      // utf8
      return toBytesRef(term).utf8ToString() + " " + term;
    } else {
      // utf32
      return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term;
    }
  }

  private static IntsRef toIntsRef(String s) {
    final int charCount = s.length();
    IntsRef ir = new IntsRef(charCount);
    for(int charIDX=0;charIDX<charCount;charIDX++) {
      ir.ints[charIDX] = s.charAt(charIDX);
    }
    ir.length = charCount;
    return ir;
  }

  private static String toString(IntsRef ints) {
    char[] chars = new char[ints.length];
    for(int charIDX=0;charIDX<ints.length;charIDX++) {
      final int ch = ints.ints[ints.offset+charIDX];
      assertTrue(ch >= 0 && ch < 65536);
      chars[charIDX] = (char) ch;
    }
    return new String(chars);
  }

  // Build FST for all unique terms in the test line docs
  // file, up until a time limit
  public void testRealTerms() throws Exception {

    /*
    if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) {
      // no
      CodecProvider.getDefault().setDefaultFieldCodec("Standard");
    }
    */

    final LineFileDocs docs = new LineFileDocs(random);
    final int RUN_TIME_MSEC = atLeast(500);
    final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
    final File tempDir = _TestUtil.getTempDir("fstlines");
    final MockDirectoryWrapper dir = new MockDirectoryWrapper(random, FSDirectory.open(tempDir));
    final IndexWriter writer = new IndexWriter(dir, conf);
    writer.setInfoStream(VERBOSE ? System.out : null);
    final long stopTime = System.currentTimeMillis() + RUN_TIME_MSEC;
    Document doc;
    int docCount = 0;
    while((doc = docs.nextDoc()) != null && System.currentTimeMillis() < stopTime) {
      writer.addDocument(doc);
      docCount++;
    }
    IndexReader r = IndexReader.open(writer, true);
    writer.close();
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
    Builder<Long> builder = new Builder(FST.INPUT_TYPE.BYTE2, 0, 0, true, outputs);

    boolean storeOrd = false;
    if (VERBOSE) {
      if (storeOrd) {
        System.out.println("FST stores ord");
      } else {
        System.out.println("FST stores docFreq");
      }
    }
    TermEnum termEnum = r.terms(new Term("body", ""));
    if (VERBOSE) {
      System.out.println("TEST: got termEnum=" + termEnum);
    }
    int ord = 0;
    while(true) {
      final Term term = termEnum.term();
      if (term == null || !"body".equals(term.field())) {
        break;
      }

      // No ord in 3.x:
      /*
      if (ord == 0) {
        try {
          termsEnum.ord();
        } catch (UnsupportedOperationException uoe) {
          if (VERBOSE) {
            System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
          }
          storeOrd = false;
        }
      }
      */
      final int output;
      if (storeOrd) {
        output = ord;
      } else {
        output = termEnum.docFreq();
      }
      //System.out.println("ADD: " + term.text() + " ch[0]=" + (term.text().length() == 0 ? -1 : term.text().charAt(0)));
      builder.add(toIntsRef(term.text()), outputs.get(output));
      ord++;
      if (ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
        System.out.println(ord + " terms...");
      }
      termEnum.next();
    }
    final FST<Long> fst = builder.finish();
    if (VERBOSE) {
      System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes");
    }

    if (ord > 0) {
      // Now confirm BytesRefFSTEnum and TermEnum act the
      // same:
      final IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum(fst);
      int num = atLeast(1000);
      for(int iter=0;iter<num;iter++) {
        final String randomTerm = getRandomString();

        if (VERBOSE) {
          System.out.println("TEST: seek " + randomTerm + " ch[0]=" + (randomTerm.length() == 0 ? -1 : randomTerm.charAt(0)));
        }

        termEnum = r.terms(new Term("body", randomTerm));
        final IntsRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(toIntsRef(randomTerm));

        if (termEnum.term() == null || !"body".equals(termEnum.term().field())) {
          assertNull("got " + (fstSeekResult == null ? "null" : toString(fstSeekResult.input) + " but expected null"), fstSeekResult);
        } else {
          assertSame(termEnum, fstEnum, storeOrd);
          for(int nextIter=0;nextIter<10;nextIter++) {
            if (VERBOSE) {
              System.out.println("TEST: next");
              //if (storeOrd) {
              //System.out.println("  ord=" + termEnum.ord());
              //}
            }
            termEnum.next();
            if (termEnum.term() != null && "body".equals(termEnum.term().field())) {
              if (VERBOSE) {
                System.out.println("  term=" + termEnum.term());
              }
              assertNotNull(fstEnum.next());
              assertSame(termEnum, fstEnum, storeOrd);
            } else {
              if (VERBOSE) {
                System.out.println("  end!");
              }
              IntsRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
              if (nextResult != null) {
                System.out.println("expected null but got: input=" + toString(nextResult.input) + " output=" + outputs.outputToString(nextResult.output));
                fail();
              }
              break;
            }
          }
        }
      }
    }

    r.close();
    dir.close();
  }

  private void assertSame(TermEnum termEnum, IntsRefFSTEnum fstEnum, boolean storeOrd) throws Exception {
    if (termEnum.term() == null || !"body".equals(termEnum.term().field())) {
      if (fstEnum.current() != null) {
        fail("fstEnum.current().input=" + toString(fstEnum.current().input));
      }
    } else {
      assertNotNull(fstEnum.current());
      assertEquals(termEnum.term() + " != " + toString(fstEnum.current().input), termEnum.term().text(), toString(fstEnum.current().input));
      if (storeOrd) {
        // fst stored the ord
        // No ord in 3.x
        // assertEquals(termEnum.ord(), ((Long) fstEnum.current().output).longValue());
      } else {
        // fst stored the docFreq
        assertEquals(termEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue()));
      }
    }
  }

  private static abstract class VisitTerms<T> {
    private final String dirOut;
    private final String wordsFileIn;
    private int inputMode;
    private final Outputs<T> outputs;
    private final Builder<T> builder;

    public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs) {
      this.dirOut = dirOut;
      this.wordsFileIn = wordsFileIn;
      this.inputMode = inputMode;
      this.outputs = outputs;
      
      builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, outputs);
    }

    protected abstract T getOutput(IntsRef input, int ord) throws IOException;

    public void run(int limit, boolean verify) throws IOException {
      BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
      try {
        final IntsRef intsRef = new IntsRef(10);
        long tStart = System.currentTimeMillis();
        int ord = 0;
        while(true) {
          String w = is.readLine();
          if (w == null) {
            break;
          }
          toIntsRef(w, inputMode, intsRef);
          builder.add(intsRef,
                      getOutput(intsRef, ord));

          ord++;
          if (ord % 500000 == 0) {
            System.out.println(
                String.format(Locale.ENGLISH, 
                    "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord));
          }
          if (ord >= limit) {
            break;
          }
        }

        assert builder.getTermCount() == ord;
        final FST<T> fst = builder.finish();
        if (fst == null) {
          System.out.println("FST was fully pruned!");
          System.exit(0);
        }

        if (dirOut == null)
          return;

        System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
        if (fst.getNodeCount() < 100) {
          Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
          Util.toDot(fst, w, false, false);
          w.close();
          System.out.println("Wrote FST to out.dot");
        }

        Directory dir = FSDirectory.open(new File(dirOut));
        IndexOutput out = dir.createOutput("fst.bin");
        fst.save(out);
        out.close();

        System.out.println("Saved FST to fst.bin.");

        if (!verify) {
          return;
        }

        System.out.println("\nNow verify...");

        is.close();
        is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);

        ord = 0;
        tStart = System.currentTimeMillis();
        while(true) {
          String w = is.readLine();
          if (w == null) {
            break;
          }
          toIntsRef(w, inputMode, intsRef);
          T expected = getOutput(intsRef, ord);
          T actual = Util.get(fst, intsRef);
          if (actual == null) {
            throw new RuntimeException("unexpected null output on input=" + w);
          }
          if (!actual.equals(expected)) {
            throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
          }

          ord++;
          if (ord % 500000 == 0) {
            System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
          }
          if (ord >= limit) {
            break;
          }
        }

        double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
        System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");

      } finally {
        is.close();
      }
    }
  }

  // java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out
  public static void main(String[] args) throws IOException {
    int prune = 0;
    int limit = Integer.MAX_VALUE;
    int inputMode = 0;                             // utf8
    boolean storeOrds = false;
    boolean storeDocFreqs = false;
    boolean verify = true;
    
    String wordsFileIn = null;
    String dirOut = null;

    int idx = 0;
    while (idx < args.length) {
      if (args[idx].equals("-prune")) {
        prune = Integer.valueOf(args[1 + idx]);
        idx++;
      } else if (args[idx].equals("-limit")) {
        limit = Integer.valueOf(args[1 + idx]);
        idx++;
      } else if (args[idx].equals("-utf8")) {
        inputMode = 0;
      } else if (args[idx].equals("-utf32")) {
        inputMode = 1;
      } else if (args[idx].equals("-docFreq")) {
        storeDocFreqs = true;
      } else if (args[idx].equals("-ords")) {
        storeOrds = true;
      } else if (args[idx].equals("-noverify")) {
        verify = false;
      } else if (args[idx].startsWith("-")) {
        System.err.println("Unrecognized option: " + args[idx]);
        System.exit(-1);
      } else {
        if (wordsFileIn == null) {
          wordsFileIn = args[idx];
        } else if (dirOut == null) {
          dirOut = args[idx];
        } else {
          System.err.println("Too many arguments, expected: input [output]");
          System.exit(-1);
        }
      }
      idx++;
    }
    
    if (wordsFileIn == null) {
      System.err.println("No input file.");
      System.exit(-1);
    }

    // ord benefits from share, docFreqs don't:

    if (storeOrds && storeDocFreqs) {
      // Store both ord & docFreq:
      final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true);
      final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false);
      final PairOutputs<Long,Long> outputs = new PairOutputs(o1, o2);
      new VisitTerms<PairOutputs.Pair(dirOut, wordsFileIn, inputMode, prune, outputs) {
        Random rand;
        @Override
        public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
          if (ord == 0) {
            rand = new Random(17);
          }
          return new PairOutputs.Pair<Long,Long>(o1.get(ord),
                                                 o2.get(_TestUtil.nextInt(rand, 1, 5000)));
        }
      }.run(limit, verify);
    } else if (storeOrds) {
      // Store only ords
      final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
      new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
        @Override
        public Long getOutput(IntsRef input, int ord) {
          return outputs.get(ord);
        }
      }.run(limit, verify);
    } else if (storeDocFreqs) {
      // Store only docFreq
      final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
      new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
        Random rand;
        @Override
        public Long getOutput(IntsRef input, int ord) {
          if (ord == 0) {
            rand = new Random(17);
          }
          return outputs.get(_TestUtil.nextInt(rand, 1, 5000));
        }
      }.run(limit, verify);
    } else {
      // Store nothing
      final NoOutputs outputs = NoOutputs.getSingleton();
      final Object NO_OUTPUT = outputs.getNoOutput();
      new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs) {
        @Override
        public Object getOutput(IntsRef input, int ord) {
          return NO_OUTPUT;
        }
      }.run(limit, verify);
    }
  }

  public void testSingleString() throws Exception {
    final Outputs<Object> outputs = NoOutputs.getSingleton();
    final Builder<Object> b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
    b.add(new BytesRef("foobar"), outputs.getNoOutput());
    final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum(b.finish());
    assertNull(fstEnum.seekFloor(new BytesRef("foo")));
    assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
  }

  public void testSimple() throws Exception {

    // Get outputs -- passing true means FST will share
    // (delta code) the outputs.  This should result in
    // smaller FST if the outputs grow monotonically.  But
    // if numbers are "random", false should give smaller
    // final size:
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);

    // Build an FST mapping BytesRef -> Long
    final Builder<Long> builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);

    final BytesRef a = new BytesRef("a");
    final BytesRef b = new BytesRef("b");
    final BytesRef c = new BytesRef("c");

    builder.add(a, outputs.get(17));
    builder.add(b, outputs.get(42));
    builder.add(c, outputs.get(13824324872317238L));

    final FST<Long> fst = builder.finish();

    assertEquals(13824324872317238L, (long) Util.get(fst, c));
    assertEquals(42, (long) Util.get(fst, b));
    assertEquals(17, (long) Util.get(fst, a));

    BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum(fst);
    BytesRefFSTEnum.InputOutput<Long> seekResult;
    seekResult = fstEnum.seekFloor(a);
    assertNotNull(seekResult);
    assertEquals(17, (long) seekResult.output);

    // goes to a
    seekResult = fstEnum.seekFloor(new BytesRef("aa"));
    assertNotNull(seekResult);
    assertEquals(17, (long) seekResult.output);

    // goes to b
    seekResult = fstEnum.seekCeil(new BytesRef("aa"));
    assertNotNull(seekResult);
    assertEquals(b, seekResult.input);
    assertEquals(42, (long) seekResult.output);
  }

  /**
   * Test state expansion (array format) on close-to-root states. Creates
   * synthetic input that has one expanded state on each level.
   * 
   * @see "https://issues.apache.org/jira/browse/LUCENE-2933" 
   */
  public void testExpandedCloseToRoot() throws Exception {
    class SyntheticData {
      FST<Object> compile(String[] lines) throws IOException {
        final NoOutputs outputs = NoOutputs.getSingleton();
        final Object nothing = outputs.getNoOutput();
        final Builder<Object> b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);

        int line = 0;
        final BytesRef term = new BytesRef();
        while (line < lines.length) {
          String w = lines[line++];
          if (w == null) {
            break;
          }
          term.copy(w);
          b.add(term, nothing);
        }
        
        return b.finish();
      }
      
      void generate(ArrayList<String> out, StringBuilder b, char from, char to,
          int depth) {
        if (depth == 0 || from == to) {
          String seq = b.toString() + "_" + out.size() + "_end";
          out.add(seq);
        } else {
          for (char c = from; c <= to; c++) {
            b.append(c);
            generate(out, b, from, c == to ? to : from, depth - 1);
            b.deleteCharAt(b.length() - 1);
          }
        }
      }

      public int verifyStateAndBelow(FST<Object> fst, Arc arc, int depth) 
        throws IOException {
        if (fst.targetHasArcs(arc)) {
          int childCount = 0;
          for (arc = fst.readFirstTargetArc(arc, arc);; 
               arc = fst.readNextArc(arc), childCount++)
          {
            boolean expanded = fst.isExpandedTarget(arc);
            int children = verifyStateAndBelow(fst, new FST.Arc<Object>().copyFrom(arc), depth + 1);

            assertEquals(
                expanded,
                (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && 
                    children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
                 children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
            if (arc.isLast()) break;
          }

          return childCount;
        }
        return 0;
      }
    }

    // Sanity check.
    assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
    assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);

    SyntheticData s = new SyntheticData();

    ArrayList<String> out = new ArrayList();
    StringBuilder b = new StringBuilder();
    s.generate(out, b, 'a', 'i', 10);
    String[] input = out.toArray(new String[out.size()]);
    Arrays.sort(input);
    FST<Object> fst = s.compile(input);
    FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc());
    s.verifyStateAndBelow(fst, arc, 1);
  }

  // Make sure raw FST can differentiate between final vs
  // non-final end nodes
  public void testNonFinalStopNodes() throws Exception {
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
    final Long nothing = outputs.getNoOutput();
    final Builder<Long> b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);

    final FST<Long> fst = new FST(FST.INPUT_TYPE.BYTE1, outputs);

    final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode(b, 0);

    // Add final stop node
    {
      final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode(b, 0);
      node.isFinal = true;
      rootNode.addArc('a', node);
      final Builder.CompiledNode frozen = new Builder.CompiledNode();
      frozen.address = fst.addNode(node);
      rootNode.arcs[0].nextFinalOutput = outputs.get(17);
      rootNode.arcs[0].isFinal = true;
      rootNode.arcs[0].output = nothing;
      rootNode.arcs[0].target = frozen;
    }

    // Add non-final stop node
    {
      final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode(b, 0);
      rootNode.addArc('b', node);
      final Builder.CompiledNode frozen = new Builder.CompiledNode();
      frozen.address = fst.addNode(node);
      rootNode.arcs[1].nextFinalOutput = nothing;
      rootNode.arcs[1].output = outputs.get(42);
      rootNode.arcs[1].target = frozen;
    }

    fst.finish(fst.addNode(rootNode));
    
    checkStopNodes(fst, outputs);

    // Make sure it still works after save/load:
    Directory dir = newDirectory();
    IndexOutput out = dir.createOutput("fst");
    fst.save(out);
    out.close();

    IndexInput in = dir.openInput("fst");
    final FST<Long> fst2 = new FST(in, outputs);
    checkStopNodes(fst2, outputs);
    in.close();
    dir.close();
  }

  private void checkStopNodes(FST<Long> fst, PositiveIntOutputs outputs) throws Exception {
    final Long nothing = outputs.getNoOutput();
    FST.Arc<Long> startArc = fst.getFirstArc(new FST.Arc());
    assertEquals(nothing, startArc.output);
    assertEquals(nothing, startArc.nextFinalOutput);

    FST.Arc<Long> arc = fst.readFirstTargetArc(startArc, new FST.Arc());
    assertEquals('a', arc.label);
    assertEquals(17, arc.nextFinalOutput.longValue());
    assertTrue(arc.isFinal());

    arc = fst.readNextArc(arc);
    assertEquals('b', arc.label);
    assertFalse(arc.isFinal());
    assertEquals(42, arc.output.longValue());
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene TestFSTs.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.