alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (SortedVIntList.java)

This example Lucene source code file (SortedVIntList.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

bit_shift, bit_shift, docidsetiterator, input, io, ioexception, max_bytes_per_int, no_more_docs, override, sortedvintlist, sortedvintlist, sortedvintlistbuilder, sortedvintlistbuilder, util, vb1, vb1

The Lucene SortedVIntList.java source code

package org.apache.lucene.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.BitSet;

import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;

/**
 * Stores and iterate on sorted integers in compressed form in RAM. <br>
 * The code for compressing the differences between ascending integers was
 * borrowed from {@link org.apache.lucene.store.IndexInput} and
 * {@link org.apache.lucene.store.IndexOutput}.
 * <p>
 * <b>NOTE: this class assumes the stored integers are doc Ids (hence why it
 * extends {@link DocIdSet}). Therefore its {@link #iterator()} assumes {@link
 * DocIdSetIterator#NO_MORE_DOCS} can be used as sentinel. If you intent to use
 * this value, then make sure it's not used during search flow.
 */
public class SortedVIntList extends DocIdSet {
  /** When a BitSet has fewer than 1 in BITS2VINTLIST_SIZE bits set,
   * a SortedVIntList representing the index numbers of the set bits
   * will be smaller than that BitSet.
   */
  final static int BITS2VINTLIST_SIZE = 8;

  private int size;
  private byte[] bytes;
  private int lastBytePos;
    
  /**
   *  Create a SortedVIntList from all elements of an array of integers.
   *
   * @param  sortedInts  A sorted array of non negative integers.
   */
  public SortedVIntList(int... sortedInts) {
    this(sortedInts, sortedInts.length);
  }

  /**
   * Create a SortedVIntList from an array of integers.
   * @param  sortedInts  An array of sorted non negative integers.
   * @param  inputSize   The number of integers to be used from the array.
   */
  public SortedVIntList(int[] sortedInts, int inputSize) {
    SortedVIntListBuilder builder = new SortedVIntListBuilder();
    for (int i = 0; i < inputSize; i++) {
      builder.addInt(sortedInts[i]);
    }
    builder.done();
  }

  /**
   * Create a SortedVIntList from a BitSet.
   * @param  bits  A bit set representing a set of integers.
   */
  public SortedVIntList(BitSet bits) {
    SortedVIntListBuilder builder = new SortedVIntListBuilder();
    int nextInt = bits.nextSetBit(0);
    while (nextInt != -1) {
      builder.addInt(nextInt);
      nextInt = bits.nextSetBit(nextInt + 1);
    }
    builder.done();
  }

  /**
   * Create a SortedVIntList from an OpenBitSet.
   * @param  bits  A bit set representing a set of integers.
   */
  public SortedVIntList(OpenBitSet bits) {
    SortedVIntListBuilder builder = new SortedVIntListBuilder();
    int nextInt = bits.nextSetBit(0);
    while (nextInt != -1) {
      builder.addInt(nextInt);
      nextInt = bits.nextSetBit(nextInt + 1);
    }
    builder.done();
  }

  /**
   * Create a SortedVIntList.
   * @param  docIdSetIterator  An iterator providing document numbers as a set of integers.
   *                  This DocIdSetIterator is iterated completely when this constructor
   *                  is called and it must provide the integers in non
   *                  decreasing order.
   */
  public SortedVIntList(DocIdSetIterator docIdSetIterator) throws IOException {
    SortedVIntListBuilder builder = new SortedVIntListBuilder();
    int doc;
    while ((doc = docIdSetIterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      builder.addInt(doc);
    }
    builder.done();
  }


  private class SortedVIntListBuilder {
    private int lastInt = 0;
    
    SortedVIntListBuilder() {
      initBytes();
      lastInt = 0;
    }

    void addInt(int nextInt) {
      int diff = nextInt - lastInt;
      if (diff < 0) {
        throw new IllegalArgumentException(
            "Input not sorted or first element negative.");
      }
  
      if ((lastBytePos + MAX_BYTES_PER_INT) > bytes.length) {
        // Biggest possible int does not fit.
        resizeBytes(ArrayUtil.oversize(lastBytePos + MAX_BYTES_PER_INT, 1));
      }
  
      // See org.apache.lucene.store.IndexOutput.writeVInt()
      while ((diff & ~VB1) != 0) { // The high bit of the next byte needs to be set.
        bytes[lastBytePos++] = (byte) ((diff & VB1) | ~VB1);
        diff >>>= BIT_SHIFT;
      }
      bytes[lastBytePos++] = (byte) diff; // Last byte, high bit not set.
      size++;
      lastInt = nextInt;
    }
    
    void done() {
      resizeBytes(lastBytePos);
    }
  }


  private void initBytes() {
    size = 0;
    bytes = new byte[128]; // initial byte size
    lastBytePos = 0;
  }

  private void resizeBytes(int newSize) {
    if (newSize != bytes.length) {
      byte[] newBytes = new byte[newSize];
      System.arraycopy(bytes, 0, newBytes, 0, lastBytePos);
      bytes = newBytes;
    }
  }

  private static final int VB1 = 0x7F;
  private static final int BIT_SHIFT = 7;
  private final int MAX_BYTES_PER_INT = (31 / BIT_SHIFT) + 1;

  /**
   * @return    The total number of sorted integers.
   */
  public int size() {
    return size;
  }

  /**
   * @return The size of the byte array storing the compressed sorted integers.
   */
  public int getByteSize() {
    return bytes.length;
  }

  /** This DocIdSet implementation is cacheable. */
  @Override
  public boolean isCacheable() {
    return true;
  }

  /**
   * @return    An iterator over the sorted integers.
   */
  @Override
  public DocIdSetIterator iterator() {
    return new DocIdSetIterator() {
      int bytePos = 0;
      int lastInt = 0;
      int doc = -1;
      
      private void advance() {
        // See org.apache.lucene.store.IndexInput.readVInt()
        byte b = bytes[bytePos++];
        lastInt += b & VB1;
        for (int s = BIT_SHIFT; (b & ~VB1) != 0; s += BIT_SHIFT) {
          b = bytes[bytePos++];
          lastInt += (b & VB1) << s;
        }
      }
      
      @Override
      public int docID() {
        return doc;
      }
      
      @Override
      public int nextDoc() {
        if (bytePos >= lastBytePos) {
          doc = NO_MORE_DOCS;
        } else {
          advance();
          doc = lastInt;
        }
        return doc;
      }
      
      @Override
      public int advance(int target) {
        while (bytePos < lastBytePos) {
          advance();
          if (lastInt >= target) {
            return doc = lastInt;
          }
        }
        return doc = NO_MORE_DOCS;
      }
      
    };
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene SortedVIntList.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.