
Lucene example source code file (TermsHashPerField.java)

This example Lucene source code file (TermsHashPerField.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

fieldinfo, fieldinvertstate, inverteddocconsumerperfield, io, ioexception, override, parallelpostingsarray, string, termshashconsumerperfield, termshashperfield, termshashperthread, util

The Lucene TermsHashPerField.java source code

package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SorterTemplate;

final class TermsHashPerField extends InvertedDocConsumerPerField {
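  // Builds the in-memory postings hash for a single field: term text is
  // interned into the char pool, per-term metadata lives in a
  // ParallelPostingsArray, and per-term byte streams are written into
  // the byte pool through pointers held in the int pool.  A second
  // TermsHash (nextPerField) may be chained behind this one, e.g. for
  // term vectors.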

  final TermsHashConsumerPerField consumer;

  final TermsHashPerField nextPerField;
  final TermsHashPerThread perThread;
  final DocumentsWriter.DocState docState;
  final FieldInvertState fieldState;
  CharTermAttribute termAtt;
  
  // Copied from our perThread
  final CharBlockPool charPool;
  final IntBlockPool intPool;
  final ByteBlockPool bytePool;

  final int streamCount;
  final int numPostingInt;

  final FieldInfo fieldInfo;

  boolean postingsCompacted;
  int numPostings;
  private int postingsHashSize = 4;
  private int postingsHashHalfSize = postingsHashSize/2;
  private int postingsHashMask = postingsHashSize-1;
  private int[] postingsHash;
 
  ParallelPostingsArray postingsArray;
  
  public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
    this.perThread = perThread;
    intPool = perThread.intPool;
    charPool = perThread.charPool;
    bytePool = perThread.bytePool;
    docState = perThread.docState;

    postingsHash = new int[postingsHashSize];
    Arrays.fill(postingsHash, -1);
    bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);

    fieldState = docInverterPerField.fieldState;
    this.consumer = perThread.consumer.addField(this, fieldInfo);
    initPostingsArray();

    streamCount = consumer.getStreamCount();
    numPostingInt = 2*streamCount;
    this.fieldInfo = fieldInfo;
    if (nextPerThread != null)
      nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
    else
      nextPerField = null;
  }

  private void initPostingsArray() {
    postingsArray = consumer.createPostingsArray(2);
    bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
  }

  // sugar: just forwards to DocumentsWriter
  private void bytesUsed(long size) {
    if (perThread.termsHash.trackAllocations) {
      perThread.termsHash.docWriter.bytesUsed(size);
    }
  }
  
  void shrinkHash(int targetSize) {
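    // Shrink the hash back to its minimum size after a flush and free
    // the postings array (targetSize is currently ignored).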
    assert postingsCompacted || numPostings == 0;

    final int newSize = 4;
    if (newSize != postingsHash.length) {
      final long previousSize = postingsHash.length;
      postingsHash = new int[newSize];
      bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
      Arrays.fill(postingsHash, -1);
      postingsHashSize = newSize;
      postingsHashHalfSize = newSize/2;
      postingsHashMask = newSize-1;
    }

    // Fully free the postings array on each flush:
    if (postingsArray != null) {
      bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size);
      postingsArray = null;
    }
  }

  public void reset() {
    if (!postingsCompacted)
      compactPostings();
    assert numPostings <= postingsHash.length;
    if (numPostings > 0) {
      Arrays.fill(postingsHash, 0, numPostings, -1);
      numPostings = 0;
    }
    postingsCompacted = false;
    if (nextPerField != null)
      nextPerField.reset();
  }

  @Override
  synchronized public void abort() {
    reset();
    if (nextPerField != null)
      nextPerField.abort();
  }
  
  private final void growParallelPostingsArray() {
    int oldSize = postingsArray.size;
    this.postingsArray = this.postingsArray.grow();
    bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
  }

  public void initReader(ByteSliceReader reader, int termID, int stream) {
    assert stream < streamCount;
    int intStart = postingsArray.intStarts[termID];
    final int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
    final int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;
    reader.init(bytePool,
                postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
                ints[upto+stream]);
  }

  private void compactPostings() {
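    // Move all live termIDs to the front of postingsHash so the first
    // numPostings slots are contiguous (used before sorting and reset);
    // the table can no longer be probed until reset() clears it.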
    int upto = 0;
    for(int i=0;i<postingsHashSize;i++) {
      if (postingsHash[i] != -1) {
        if (upto < i) {
          postingsHash[upto] = postingsHash[i];
          postingsHash[i] = -1;
        }
        upto++;
      }
    }

    assert upto == numPostings: "upto=" + upto + " numPostings=" + numPostings;
    postingsCompacted = true;
  }

  /** Collapse the hash table & sort in-place. */
  public int[] sortPostings() {
    compactPostings();
    final int[] postingsHash = this.postingsHash;
    new SorterTemplate() {
      @Override
      protected void swap(int i, int j) {
        final int o = postingsHash[i];
        postingsHash[i] = postingsHash[j];
        postingsHash[j] = o;
      }
      
      @Override
      protected int compare(int i, int j) {
        final int term1 = postingsHash[i], term2 = postingsHash[j];
        if (term1 == term2)
          return 0;
        final int textStart1 = postingsArray.textStarts[term1],
          textStart2 = postingsArray.textStarts[term2];
        final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK;
        final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK;
        return comparePostings(text1, pos1, text2, pos2);
      }

      @Override
      protected void setPivot(int i) {
        pivotTerm = postingsHash[i];
        final int textStart = postingsArray.textStarts[pivotTerm];
        pivotBuf = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        pivotBufPos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
      }
  
      @Override
      protected int comparePivot(int j) {
        final int term = postingsHash[j];
        if (pivotTerm == term)
          return 0;
        final int textStart = postingsArray.textStarts[term];
        final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
        return comparePostings(pivotBuf, pivotBufPos, text, pos);
      }
      
      private int pivotTerm, pivotBufPos;
      private char[] pivotBuf;

      /** Compares term text for two Posting instances and
       *  returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
      private int comparePostings(final char[] text1, int pos1, final char[] text2, int pos2) {
        assert text1 != text2 || pos1 != pos2;

        while(true) {
          final char c1 = text1[pos1++];
          final char c2 = text2[pos2++];
          if (c1 != c2) {
            if (0xffff == c2)
              return 1;
            else if (0xffff == c1)
              return -1;
            else
              return c1-c2;
          } else
            // This method should never compare equal postings
            // unless p1==p2
            assert c1 != 0xffff;
        }
      }
    }.quickSort(0, numPostings-1);
    return postingsHash;
  }

  /** Test whether the text for current RawPostingList p equals
   *  current tokenText. */
  private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) {
    final int textStart = postingsArray.textStarts[termID];
    
    final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
    assert text != null;
    int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;

    int tokenPos = 0;
    for(;tokenPos<tokenTextLen;pos++,tokenPos++)
      if (tokenText[tokenPos] != text[pos])
        return false;
    return 0xffff == text[pos];
  }
  
  private boolean doCall;
  private boolean doNextCall;

  @Override
  void start(Fieldable f) {
    termAtt = fieldState.attributeSource.addAttribute(CharTermAttribute.class);
    consumer.start(f);
    if (nextPerField != null) {
      nextPerField.start(f);
    }
  }
  
  @Override
  boolean start(Fieldable[] fields, int count) throws IOException {
    doCall = consumer.start(fields, count);
    if (postingsArray == null) {
      initPostingsArray();
    }

    if (nextPerField != null)
      doNextCall = nextPerField.start(fields, count);
    return doCall || doNextCall;
  }

  // Secondary entry point (for 2nd & subsequent TermsHash):
  // the token text has already been "interned" into textStart,
  // so we hash by textStart
  public void add(int textStart) throws IOException {
    int code = textStart;

    int hashPos = code & postingsHashMask;

    assert !postingsCompacted;

    // Locate RawPostingList in hash
    int termID = postingsHash[hashPos];

    if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
      // Conflict: keep searching different locations in
      // the hash table.
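      // The increment is forced odd so it is coprime with the
      // power-of-two table size, guaranteeing the probe sequence
      // can reach every slot.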
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        hashPos = code & postingsHashMask;
        termID = postingsHash[hashPos];
      } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
    }

    if (termID == -1) {

      // First time we are seeing this token since we last
      // flushed the hash.

      // New posting
      termID = numPostings++;
      if (termID >= postingsArray.size) {
        growParallelPostingsArray();
      }

      assert termID >= 0;

      postingsArray.textStarts[termID] = textStart;
          
      assert postingsHash[hashPos] == -1;
      postingsHash[hashPos] = termID;

      if (numPostings == postingsHashHalfSize)
        rehashPostings(2*postingsHashSize);

      // Init stream slices
      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        intPool.nextBuffer();

      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
        bytePool.nextBuffer();

      intUptos = intPool.buffer;
      intUptoStart = intPool.intUpto;
      intPool.intUpto += streamCount;

      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

      for(int i=0;i<streamCount;i++) {
        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
      }
      postingsArray.byteStarts[termID] = intUptos[intUptoStart];

      consumer.newTerm(termID);

    } else {
      int intStart = postingsArray.intStarts[termID];
      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
      consumer.addTerm(termID);
    }
  }

  // Primary entry point (for first TermsHash)
  @Override
  void add() throws IOException {

    assert !postingsCompacted;

    // We are first in the chain so we must "intern" the
    // term text into textStart address

    // Get the text of this term.
    final char[] tokenText = termAtt.buffer();
    final int tokenTextLen = termAtt.length();

    // Compute hashcode & replace any invalid UTF16 sequences
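    // (walking the term backwards).  Unpaired surrogates and 0xffff are
    // replaced with U+FFFD, since 0xffff is reserved as the end-of-term
    // sentinel in the char pool; valid surrogate pairs are left intact.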
    int downto = tokenTextLen;
    int code = 0;
    while (downto > 0) {
      char ch = tokenText[--downto];

      if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
        if (0 == downto) {
          // Unpaired
          ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
        } else {
          final char ch2 = tokenText[downto-1];
          if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
            // OK: high followed by low.  This is a valid
            // surrogate pair.
            code = ((code*31) + ch)*31+ch2;
            downto--;
            continue;
          } else {
            // Unpaired
            ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
          }            
        }
      } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
                                                          ch == 0xffff)) {
        // Unpaired or 0xffff
        ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
      }

      code = (code*31) + ch;
    }

    int hashPos = code & postingsHashMask;

    // Locate RawPostingList in hash
    int termID = postingsHash[hashPos];

    if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) {
      // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        hashPos = code & postingsHashMask;
        termID = postingsHash[hashPos];
      } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen));
    }

    if (termID == -1) {

      // First time we are seeing this token since we last
      // flushed the hash.
      final int textLen1 = 1+tokenTextLen;
      if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
        if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
          // Just skip this term, to remain as robust as
          // possible during indexing.  A TokenFilter
          // can be inserted into the analyzer chain if
          // other behavior is wanted (pruning the term
          // to a prefix, throwing an exception, etc).

          if (docState.maxTermPrefix == null)
            docState.maxTermPrefix = new String(tokenText, 0, 30);

          consumer.skippingLongTerm();
          return;
        }
        charPool.nextBuffer();
      }

      // New posting
      termID = numPostings++;
      if (termID >= postingsArray.size) {
        growParallelPostingsArray();
      }

      assert termID != -1;

      final char[] text = charPool.buffer;
      final int textUpto = charPool.charUpto;
      postingsArray.textStarts[termID] = textUpto + charPool.charOffset;
      charPool.charUpto += textLen1;
      System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
      text[textUpto+tokenTextLen] = 0xffff;
          
      assert postingsHash[hashPos] == -1;
      postingsHash[hashPos] = termID;

      if (numPostings == postingsHashHalfSize) {
        rehashPostings(2*postingsHashSize);
        bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
      }

      // Init stream slices
      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        intPool.nextBuffer();

      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
        bytePool.nextBuffer();

      intUptos = intPool.buffer;
      intUptoStart = intPool.intUpto;
      intPool.intUpto += streamCount;

      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

      for(int i=0;i<streamCount;i++) {
        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
      }
      postingsArray.byteStarts[termID] = intUptos[intUptoStart];
      
      consumer.newTerm(termID);

    } else {
      final int intStart = postingsArray.intStarts[termID];
      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
      consumer.addTerm(termID);
    }

    if (doNextCall)
      nextPerField.add(postingsArray.textStarts[termID]);
  }

  int[] intUptos;
  int intUptoStart;

  void writeByte(int stream, byte b) {
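    // Append one byte to this stream's current slice in the byte pool.
    // Pool bytes are zero-filled except for the non-zero end-of-slice
    // marker, so a non-zero byte at the write position means the slice
    // is full and allocSlice() must chain a new, larger slice.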
    int upto = intUptos[intUptoStart+stream];
    byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
    assert bytes != null;
    int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
    if (bytes[offset] != 0) {
      // End of slice; allocate a new one
      offset = bytePool.allocSlice(bytes, offset);
      bytes = bytePool.buffer;
      intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
    }
    bytes[offset] = b;
    (intUptos[intUptoStart+stream])++;
  }

  public void writeBytes(int stream, byte[] b, int offset, int len) {
    // TODO: optimize
    final int end = offset + len;
    for(int i=offset;i<end;i++)
      writeByte(stream, b[i]);
  }

  void writeVInt(int stream, int i) {
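    // Write i as a variable-length int: 7 bits per byte, low-order bits
    // first, with the high bit set on every byte except the last.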
    assert stream < streamCount;
    while ((i & ~0x7F) != 0) {
      writeByte(stream, (byte)((i & 0x7f) | 0x80));
      i >>>= 7;
    }
    writeByte(stream, (byte) i);
  }

  @Override
  void finish() throws IOException {
    try {
      consumer.finish();
    } finally {
      if (nextPerField != null) {
        nextPerField.finish();
      }
    }
  }

  /** Called when postings hash is too small (> 50%
   *  occupied) or too large (< 20% occupied). */
  void rehashPostings(final int newSize) {

    final int newMask = newSize-1;

    int[] newHash = new int[newSize];
    Arrays.fill(newHash, -1);
    for(int i=0;i<postingsHashSize;i++) {
      int termID = postingsHash[i];
      if (termID != -1) {
        int code;
        if (perThread.primary) {
          final int textStart = postingsArray.textStarts[termID];
          final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
          final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
          int pos = start;
          while(text[pos] != 0xffff)
            pos++;
          code = 0;
          while (pos > start)
            code = (code*31) + text[--pos];
        } else
          code = postingsArray.textStarts[termID];

        int hashPos = code & newMask;
        assert hashPos >= 0;
        if (newHash[hashPos] != -1) {
          final int inc = ((code>>8)+code)|1;
          do {
            code += inc;
            hashPos = code & newMask;
          } while (newHash[hashPos] != -1);
        }
        newHash[hashPos] = termID;
      }
    }

    postingsHashMask = newMask;
    postingsHash = newHash;

    postingsHashSize = newSize;
    postingsHashHalfSize = newSize >> 1;
  }
}

