alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (FSTEnum.java)

This example Lucene source code file (FSTEnum.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

fst, fstenum, io, ioexception, ioexception, no_output, no_output, object, suppresswarnings, suppresswarnings, t, t

The Lucene FSTEnum.java source code

package org.apache.lucene.util.fst;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

import java.io.IOException;

/** Can next() and advance() through the terms in an FST
 *
  * @lucene.experimental
*/

abstract class FSTEnum<T> {
  protected final FST<T> fst;

  @SuppressWarnings("unchecked") protected FST.Arc<T>[] arcs = new FST.Arc[10];
  // outputs are cumulative
  @SuppressWarnings("unchecked") protected T[] output = (T[]) new Object[10];

  protected final T NO_OUTPUT;
  protected final FST.Arc<T> scratchArc = new FST.Arc();

  protected int upto;
  protected int targetLength;

  /** doFloor controls the behavior of advance: if it's true
   *  doFloor is true, advance positions to the biggest
   *  term before target.  */
  protected FSTEnum(FST<T> fst) {
    this.fst = fst;
    NO_OUTPUT = fst.outputs.getNoOutput();
    fst.getFirstArc(getArc(0));
    output[0] = NO_OUTPUT;
  }

  protected abstract int getTargetLabel();
  protected abstract int getCurrentLabel();

  protected abstract void setCurrentLabel(int label);
  protected abstract void grow();

  /** Rewinds enum state to match the shared prefix between
   *  current term and target term */
  protected final void rewindPrefix() throws IOException {
    if (upto == 0) {
      //System.out.println("  init");
      upto = 1;
      fst.readFirstTargetArc(getArc(0), getArc(1));
      return;
    }
    //System.out.println("  rewind upto=" + upto + " vs targetLength=" + targetLength);

    final int currentLimit = upto;
    upto = 1;
    while (upto < currentLimit && upto <= targetLength+1) {
      final int cmp = getCurrentLabel() - getTargetLabel();
      if (cmp < 0) {
        // seek forward
        break;
      } else if (cmp > 0) {
        // seek backwards -- reset this arc to the first arc
        final FST.Arc<T> arc = getArc(upto);
        fst.readFirstTargetArc(getArc(upto-1), arc);
        //System.out.println("    seek first arc");
        break;
      }
      upto++;
    }
  }

  protected void doNext() throws IOException {
    //System.out.println("FE: next upto=" + upto);
    if (upto == 0) {
      //System.out.println("  init");
      upto = 1;
      fst.readFirstTargetArc(getArc(0), getArc(1));
    } else {
      // pop
      //System.out.println("  check pop curArc target=" + arcs[upto].target + " label=" + arcs[upto].label + " isLast?=" + arcs[upto].isLast());
      while (arcs[upto].isLast()) {
        upto--;
        if (upto == 0) {
          //System.out.println("  eof");
          return;
        }
      }
      fst.readNextArc(arcs[upto]);
    }

    pushFirst();
  }

  // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND /
  // SEEK_END)?  saves the eq check above?

  /** Seeks to smallest term that's >= target. */
  protected void doSeekCeil() throws IOException {

    //System.out.println("    advance len=" + target.length + " curlen=" + current.length);

    // TODO: possibly caller could/should provide common
    // prefix length?  ie this work may be redundant if
    // caller is in fact intersecting against its own
    // automaton

    //System.out.println("FE.seekCeil upto=" + upto);

    // Save time by starting at the end of the shared prefix
    // b/w our current term & the target:
    rewindPrefix();
    //System.out.println("  after rewind upto=" + upto);

    FST.Arc<T> arc = getArc(upto);
    int targetLabel = getTargetLabel();
    //System.out.println("  init targetLabel=" + targetLabel);

    // Now scan forward, matching the new suffix of the target
    while(true) {

      //System.out.println("  cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") vs targetLabel=" + targetLabel);

      if (arc.bytesPerArc != 0 && arc.label != -1) {

        // Arcs are fixed array -- use binary search to find
        // the target.

        final FST<T>.BytesReader in = fst.getBytesReader(0);
        int low = arc.arcIdx;
        int high = arc.numArcs-1;
        int mid = 0;
        //System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel);
        boolean found = false;
        while (low <= high) {
          mid = (low + high) >>> 1;
          in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
          final int midLabel = fst.readLabel(in);
          final int cmp = midLabel - targetLabel;
          //System.out.println("  cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
          if (cmp < 0)
            low = mid + 1;
          else if (cmp > 0)
            high = mid - 1;
          else {
            found = true;
            break;
          }
        }

        // NOTE: this code is dup'd w/ the code below (in
        // the outer else clause):
        if (found) {
          // Match
          arc.arcIdx = mid-1;
          fst.readNextRealArc(arc);
          assert arc.arcIdx == mid;
          assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
          output[upto] = fst.outputs.add(output[upto-1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto));
          targetLabel = getTargetLabel();
          continue;
        } else if (low == arc.numArcs) {
          // Dead end
          arc.arcIdx = arc.numArcs-2;
          fst.readNextRealArc(arc);
          assert arc.isLast();
          // Dead end (target is after the last arc);
          // rollback to last fork then push
          upto--;
          while(true) {
            if (upto == 0) {
              return;
            }
            final FST.Arc<T> prevArc = getArc(upto);
            //System.out.println("  rollback upto=" + upto + " arc.label=" + prevArc.label + " isLast?=" + prevArc.isLast());
            if (!prevArc.isLast()) {
              fst.readNextArc(prevArc);
              pushFirst();
              return;
            }
            upto--;
          }
        } else {
          arc.arcIdx = (low > high ? low : high)-1;
          fst.readNextRealArc(arc);
          assert arc.label > targetLabel;
          pushFirst();
          return;
        }
      } else {
        // Arcs are not array'd -- must do linear scan:
        if (arc.label == targetLabel) {
          // recurse
          output[upto] = fst.outputs.add(output[upto-1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto));
          targetLabel = getTargetLabel();
        } else if (arc.label > targetLabel) {
          pushFirst();
          return;
        } else if (arc.isLast()) {
          // Dead end (target is after the last arc);
          // rollback to last fork then push
          upto--;
          while(true) {
            if (upto == 0) {
              return;
            }
            final FST.Arc<T> prevArc = getArc(upto);
            //System.out.println("  rollback upto=" + upto + " arc.label=" + prevArc.label + " isLast?=" + prevArc.isLast());
            if (!prevArc.isLast()) {
              fst.readNextArc(prevArc);
              pushFirst();
              return;
            }
            upto--;
          }
        } else {
          // keep scanning
          //System.out.println("    next scan");
          fst.readNextArc(arc);
        }
      }
    }
  }

  // TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND /
  // SEEK_END)?  saves the eq check above?
  /** Seeks to largest term that's <= target. */
  protected void doSeekFloor() throws IOException {

    // TODO: possibly caller could/should provide common
    // prefix length?  ie this work may be redundant if
    // caller is in fact intersecting against its own
    // automaton
    //System.out.println("FE: seek floor upto=" + upto);

    // Save CPU by starting at the end of the shared prefix
    // b/w our current term & the target:
    rewindPrefix();

    //System.out.println("FE: after rewind upto=" + upto);

    FST.Arc<T> arc = getArc(upto);
    int targetLabel = getTargetLabel();

    //System.out.println("FE: init targetLabel=" + targetLabel);

    // Now scan forward, matching the new suffix of the target
    while(true) {
      //System.out.println("  cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast());

      if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
        // Arcs are fixed array -- use binary search to find
        // the target.

        final FST<T>.BytesReader in = fst.getBytesReader(0);
        int low = arc.arcIdx;
        int high = arc.numArcs-1;
        int mid = 0;
        //System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel);
        boolean found = false;
        while (low <= high) {
          mid = (low + high) >>> 1;
          in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
          final int midLabel = fst.readLabel(in);
          final int cmp = midLabel - targetLabel;
          //System.out.println("  cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
          if (cmp < 0)
            low = mid + 1;
          else if (cmp > 0)
            high = mid - 1;
          else {
            found = true;
            break;
          }
        }

        // NOTE: this code is dup'd w/ the code below (in
        // the outer else clause):
        if (found) {
          // Match -- recurse
          //System.out.println("  match!  arcIdx=" + mid);
          arc.arcIdx = mid-1;
          fst.readNextRealArc(arc);
          assert arc.arcIdx == mid;
          assert arc.label == targetLabel: "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
          output[upto] = fst.outputs.add(output[upto-1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto));
          targetLabel = getTargetLabel();
          continue;
        } else if (high == -1) {
          //System.out.println("  before first");
          // Very first arc is after our target
          // TODO: if each arc could somehow read the arc just
          // before, we can save this re-scan.  The ceil case
          // doesn't need this because it reads the next arc
          // instead:
          while(true) {
            // First, walk backwards until we find a first arc
            // that's before our target label:
            fst.readFirstTargetArc(getArc(upto-1), arc);
            if (arc.label < targetLabel) {
              // Then, scan forwards to the arc just before
              // the targetLabel:
              while(!arc.isLast() && fst.readNextArcLabel(arc) < targetLabel) {
                fst.readNextArc(arc);
              }
              pushLast();
              return;
            }
            upto--;
            if (upto == 0) {
              return;
            }
            targetLabel = getTargetLabel();
            arc = getArc(upto);
          }
        } else {
          // There is a floor arc:
          arc.arcIdx = (low > high ? high : low)-1;
          //System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1));
          fst.readNextRealArc(arc);
          assert arc.isLast() || fst.readNextArcLabel(arc) > targetLabel;
          assert arc.label < targetLabel;
          pushLast();
          return;
        }        
      } else {

        if (arc.label == targetLabel) {
          // Match -- recurse
          output[upto] = fst.outputs.add(output[upto-1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto));
          targetLabel = getTargetLabel();
        } else if (arc.label > targetLabel) {
          // TODO: if each arc could somehow read the arc just
          // before, we can save this re-scan.  The ceil case
          // doesn't need this because it reads the next arc
          // instead:
          while(true) {
            // First, walk backwards until we find a first arc
            // that's before our target label:
            fst.readFirstTargetArc(getArc(upto-1), arc);
            if (arc.label < targetLabel) {
              // Then, scan forwards to the arc just before
              // the targetLabel:
              while(!arc.isLast() && fst.readNextArcLabel(arc) < targetLabel) {
                fst.readNextArc(arc);
              }
              pushLast();
              return;
            }
            upto--;
            if (upto == 0) {
              return;
            }
            targetLabel = getTargetLabel();
            arc = getArc(upto);
          }
        } else if (!arc.isLast()) {
          //System.out.println("  check next label=" + fst.readNextArcLabel(arc) + " (" + (char) fst.readNextArcLabel(arc) + ")");
          if (fst.readNextArcLabel(arc) > targetLabel) {
            pushLast();
            return;
          } else {
            // keep scanning
            fst.readNextArc(arc);
          }
        } else {
          pushLast();
          return;
        }
      }
    }
  }

  private void incr() {
    upto++;
    grow();
    if (arcs.length <= upto) {
      @SuppressWarnings("unchecked") final FST.Arc<T>[] newArcs =
        new FST.Arc[ArrayUtil.oversize(1+upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(arcs, 0, newArcs, 0, arcs.length);
      arcs = newArcs;
    }
    if (output.length <= upto) {
      @SuppressWarnings("unchecked") final T[] newOutput =
        (T[]) new Object[ArrayUtil.oversize(1+upto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(output, 0, newOutput, 0, output.length);
      output = newOutput;
    }
  }

  // Appends current arc, and then recurses from its target,
  // appending first arc all the way to the final node
  private void pushFirst() throws IOException {

    FST.Arc<T> arc = arcs[upto];
    assert arc != null;

    while (true) {
      output[upto] = fst.outputs.add(output[upto-1], arc.output);
      if (arc.label == FST.END_LABEL) {
        // Final node
        break;
      }
      //System.out.println("  pushFirst label=" + (char) arc.label + " upto=" + upto + " output=" + fst.outputs.outputToString(output[upto]));
      setCurrentLabel(arc.label);
      incr();
      
      final FST.Arc<T> nextArc = getArc(upto);
      fst.readFirstTargetArc(arc, nextArc);
      arc = nextArc;
    }
  }

  // Recurses from current arc, appending last arc all the
  // way to the first final node
  private void pushLast() throws IOException {

    FST.Arc<T> arc = arcs[upto];
    assert arc != null;

    while (true) {
      setCurrentLabel(arc.label);
      output[upto] = fst.outputs.add(output[upto-1], arc.output);
      if (arc.label == FST.END_LABEL) {
        // Final node
        break;
      }
      incr();

      arc = fst.readLastTargetArc(arc, getArc(upto));
    }
  }

  private FST.Arc<T> getArc(int idx) {
    if (arcs[idx] == null) {
      arcs[idx] = new FST.Arc<T>();
    }
    return arcs[idx];
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene FSTEnum.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.