
Lucene example source code file (TermInfosReader.java)

This Lucene source code file (TermInfosReader.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

cloneableterm, closeablethreadlocal, io, ioexception, override, segmenttermenum, string, term, terminfo, terminfoandord, threadresources

The Lucene TermInfosReader.java source code

package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.store.Directory;
import org.apache.lucene.util.DoubleBarrelLRUCache;
import org.apache.lucene.util.CloseableThreadLocal;

/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
 * Directory.  Pairs are accessed either by Term or by ordinal position in the
 * set.  */

final class TermInfosReader {
  private final Directory directory;
  private final String segment;
  private final FieldInfos fieldInfos;

  private final CloseableThreadLocal<ThreadResources> threadResources = new CloseableThreadLocal<ThreadResources>();
  private final SegmentTermEnum origEnum;
  private final long size;

  private final Term[] indexTerms;
  private final TermInfo[] indexInfos;
  private final long[] indexPointers;
  
  private final int totalIndexInterval;

  private final static int DEFAULT_CACHE_SIZE = 1024;

  // Just adds term's ord to TermInfo
  private final static class TermInfoAndOrd extends TermInfo {
    final long termOrd;
    public TermInfoAndOrd(TermInfo ti, long termOrd) {
      super(ti);
      assert termOrd >= 0;
      this.termOrd = termOrd;
    }
  }

  private static class CloneableTerm extends DoubleBarrelLRUCache.CloneableKey {
    private final Term term;

    public CloneableTerm(Term t) {
      this.term = new Term(t.field(), t.text());
    }

    @Override
    public Object clone() {
      return new CloneableTerm(term);
    }

    @Override
    public boolean equals(Object _other) {
      CloneableTerm other = (CloneableTerm) _other;
      return term.equals(other.term);
    }

    @Override
    public int hashCode() {
      return term.hashCode();
    }
  }

  private final DoubleBarrelLRUCache<CloneableTerm,TermInfoAndOrd> termsCache = new DoubleBarrelLRUCache<CloneableTerm,TermInfoAndOrd>(DEFAULT_CACHE_SIZE);
  
  /**
   * Per-thread resources managed by ThreadLocal
   */
  private static final class ThreadResources {
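    // each thread gets its own clone of origEnum (see getThreadResources),
    // so sequential lookups can reuse the enum's position without locking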
    SegmentTermEnum termEnum;
  }
  
  TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize, int indexDivisor)
       throws CorruptIndexException, IOException {
    boolean success = false;

    if (indexDivisor < 1 && indexDivisor != -1) {
      throw new IllegalArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + indexDivisor);
    }

    try {
      directory = dir;
      segment = seg;
      fieldInfos = fis;

      origEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_EXTENSION),
          readBufferSize), fieldInfos, false);
      size = origEnum.size;


      if (indexDivisor != -1) {
        // Load terms index
        totalIndexInterval = origEnum.indexInterval * indexDivisor;
        final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_INDEX_EXTENSION),
                                                                                  readBufferSize), fieldInfos, true);

        try {
          int indexSize = 1+((int)indexEnum.size-1)/indexDivisor;  // number of index entries kept after dividing by indexDivisor

          indexTerms = new Term[indexSize];
          indexInfos = new TermInfo[indexSize];
          indexPointers = new long[indexSize];
        
          for (int i = 0; indexEnum.next(); i++) {
            indexTerms[i] = indexEnum.term();
            indexInfos[i] = indexEnum.termInfo();
            indexPointers[i] = indexEnum.indexPointer;
        
            // skip indexDivisor - 1 entries so that only every
            // indexDivisor'th index term is loaded into memory
            for (int j = 1; j < indexDivisor; j++) {
              if (!indexEnum.next()) {
                break;
              }
            }
          }
        } finally {
          indexEnum.close();
        }
      } else {
        // Do not load terms index:
        totalIndexInterval = -1;
        indexTerms = null;
        indexInfos = null;
        indexPointers = null;
      }
      success = true;
    } finally {
      // With lock-less commits, it's entirely possible (and
      // fine) to hit a FileNotFound exception above. In
      // this case, we want to explicitly close any subset
      // of things that were opened so that we don't have to
      // wait for a GC to do so.
      if (!success) {
        close();
      }
    }
  }

  public int getSkipInterval() {
    return origEnum.skipInterval;
  }
  
  public int getMaxSkipLevels() {
    return origEnum.maxSkipLevels;
  }

  final void close() throws IOException {
    if (origEnum != null)
      origEnum.close();
    threadResources.close();
  }

  /** Returns the number of term/value pairs in the set. */
  final long size() {
    return size;
  }

  private ThreadResources getThreadResources() {
    ThreadResources resources = threadResources.get();
    if (resources == null) {
      resources = new ThreadResources();
      resources.termEnum = terms();
      threadResources.set(resources);
    }
    return resources;
  }


  /** Returns the offset of the greatest index entry which is less than or equal to term.*/
  private final int getIndexOffset(Term term) {
    int lo = 0;                               // binary search indexTerms[]
    int hi = indexTerms.length - 1;

    while (hi >= lo) {
      int mid = (lo + hi) >>> 1;
      int delta = term.compareTo(indexTerms[mid]);
      if (delta < 0)
        hi = mid - 1;
      else if (delta > 0)
        lo = mid + 1;
      else
        return mid;
    }
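    // no exact match: hi is now the offset of the greatest index
    // entry less than term, or -1 if term sorts before every entry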
    return hi;
  }

  private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
    enumerator.seek(indexPointers[indexOffset],
                   ((long) indexOffset * totalIndexInterval) - 1,
                   indexTerms[indexOffset], indexInfos[indexOffset]);
  }

  /** Returns the TermInfo for a Term in the set, or null. */
  TermInfo get(Term term) throws IOException {
    return get(term, false);
  }
  
  /** Returns the TermInfo for a Term in the set, or null. */
  private TermInfo get(Term term, boolean mustSeekEnum) throws IOException {
    if (size == 0) return null;

    ensureIndexIsRead();

    final CloneableTerm cacheKey = new CloneableTerm(term);

    TermInfoAndOrd tiOrd = termsCache.get(cacheKey);
    ThreadResources resources = getThreadResources();
    
    if (!mustSeekEnum && tiOrd != null) {
      return tiOrd;
    }
    
    // optimize sequential access: first try scanning cached enum w/o seeking
    SegmentTermEnum enumerator = resources.termEnum;
    if (enumerator.term() != null                 // term is at or past current
        && ((enumerator.prev() != null && term.compareTo(enumerator.prev()) > 0)
            || term.compareTo(enumerator.term()) >= 0)) {
      int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
      if (indexTerms.length == enumOffset         // but before end of block
          || term.compareTo(indexTerms[enumOffset]) < 0) {
        // no need to seek

        final TermInfo ti;

        int numScans = enumerator.scanTo(term);
        if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
          ti = enumerator.termInfo();
          if (numScans > 1) {
            // we only want to put this TermInfo into the cache if
            // scanEnum skipped more than one dictionary entry.
            // This prevents RangeQueries or WildcardQueries from
            // wiping out the cache when they iterate over large
            // numbers of terms in order
            if (tiOrd == null) {
              termsCache.put(cacheKey, new TermInfoAndOrd(ti, enumerator.position));
            } else {
              assert sameTermInfo(ti, tiOrd, enumerator);
              assert (int) enumerator.position == tiOrd.termOrd;
            }
          }
        } else {
          ti = null;
        }

        return ti;
      }  
    }

    // random-access: must seek
    final int indexPos;
    if (tiOrd != null) {
      indexPos = (int) (tiOrd.termOrd / totalIndexInterval);
    } else {
      // Must do binary search:
      indexPos = getIndexOffset(term);
    }

    seekEnum(enumerator, indexPos);
    enumerator.scanTo(term);
    final TermInfo ti;
    if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
      ti = enumerator.termInfo();
      if (tiOrd == null) {
        // LUCENE-3183: it's possible, if term is Term("",
        // ""), for the STE to be incorrectly un-positioned
        // after scan-to; work around this by not caching in
        // this case:
        if (enumerator.position >= 0) {
          termsCache.put(cacheKey, new TermInfoAndOrd(ti, enumerator.position));
        }
      } else {
        assert sameTermInfo(ti, tiOrd, enumerator);
        assert enumerator.position == tiOrd.termOrd;
      }
    } else {
      ti = null;
    }
    return ti;
  }

  // called only from asserts
  private final boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) {
    if (ti1.docFreq != ti2.docFreq) {
      return false;
    }
    if (ti1.freqPointer != ti2.freqPointer) {
      return false;
    }
    if (ti1.proxPointer != ti2.proxPointer) {
      return false;
    }
    // skipOffset is only valid when docFreq >= skipInterval:
    if (ti1.docFreq >= enumerator.skipInterval &&
        ti1.skipOffset != ti2.skipOffset) {
      return false;
    }
    return true;
  }

  private void ensureIndexIsRead() {
    if (indexTerms == null) {
      throw new IllegalStateException("terms index was not loaded when this reader was created");
    }
  }

  /** Returns the position of a Term in the set or -1. */
  final long getPosition(Term term) throws IOException {
    if (size == 0) return -1;

    ensureIndexIsRead();
    int indexOffset = getIndexOffset(term);
    
    SegmentTermEnum enumerator = getThreadResources().termEnum;
    seekEnum(enumerator, indexOffset);

    while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {}

    if (term.compareTo(enumerator.term()) == 0)
      return enumerator.position;
    else
      return -1;
  }

  /** Returns an enumeration of all the Terms and TermInfos in the set. */
  public SegmentTermEnum terms() {
    return (SegmentTermEnum)origEnum.clone();
  }

  /** Returns an enumeration of terms starting at or after the named term. */
  public SegmentTermEnum terms(Term term) throws IOException {
    get(term, true);
    return (SegmentTermEnum)getThreadResources().termEnum.clone();
  }
}
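
A simplified lookup example (not Lucene code)

To see how the lookup in get() and getPosition() works, here is a minimal, self-contained sketch of the same strategy: keep every Nth key of a sorted dictionary in an in-memory index, binary-search that index for the greatest entry less than or equal to the target (exactly what getIndexOffset does above), then "seek" to that point and scan forward. This is illustrative code only, not part of Lucene: the class and method names are invented, plain Strings stand in for Term/TermInfo, and an in-memory list stands in for the on-disk term dictionary.

import java.util.ArrayList;
import java.util.List;

public class SampledIndexLookup {

  private final List<String> allKeys;   // full sorted dictionary (on disk in Lucene)
  private final String[] indexKeys;     // every interval'th key, held in memory
  private final int[] indexPositions;   // position of each sampled key in allKeys
  private final int interval;

  public SampledIndexLookup(List<String> sortedKeys, int interval) {
    this.allKeys = sortedKeys;
    this.interval = interval;
    // same sizing arithmetic as the TermInfosReader constructor
    int indexSize = 1 + (sortedKeys.size() - 1) / interval;
    indexKeys = new String[indexSize];
    indexPositions = new int[indexSize];
    for (int i = 0; i < indexSize; i++) {
      indexKeys[i] = sortedKeys.get(i * interval);
      indexPositions[i] = i * interval;
    }
  }

  // Binary search for the offset of the greatest index entry <= key,
  // or -1 if key sorts before every entry (mirrors getIndexOffset).
  private int getIndexOffset(String key) {
    int lo = 0;
    int hi = indexKeys.length - 1;
    while (hi >= lo) {
      int mid = (lo + hi) >>> 1;
      int delta = key.compareTo(indexKeys[mid]);
      if (delta < 0)
        hi = mid - 1;
      else if (delta > 0)
        lo = mid + 1;
      else
        return mid;
    }
    return hi;
  }

  // Returns the position of key in the dictionary, or -1 if absent
  // (the same contract as TermInfosReader.getPosition).
  public int getPosition(String key) {
    int offset = getIndexOffset(key);
    if (offset < 0) return -1;
    // "Seek" to the sampled entry, then scan at most interval entries:
    // the floor property guarantees key sorts before the next sample.
    int start = indexPositions[offset];
    for (int pos = start; pos < allKeys.size() && pos < start + interval; pos++) {
      int cmp = key.compareTo(allKeys.get(pos));
      if (cmp == 0) return pos;   // found it
      if (cmp < 0) break;         // scanned past where key would be
    }
    return -1;
  }

  public static void main(String[] args) {
    List<String> keys = new ArrayList<String>();
    for (char c = 'a'; c <= 'z'; c++) keys.add(String.valueOf(c));
    SampledIndexLookup lookup = new SampledIndexLookup(keys, 4);
    System.out.println(lookup.getPosition("q"));   // prints 16
    System.out.println(lookup.getPosition("qq"));  // prints -1
  }
}

The real TermInfosReader layers two optimizations on top of this basic scheme: a DoubleBarrelLRUCache of recently looked-up terms, and a per-thread SegmentTermEnum whose current position lets in-order lookups skip both the binary search and the seek entirely.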

