Java example - TermVectorsReader.java

What this is

This file is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" ^TM.
The source code

package org.apache.lucene.index;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.InputStream;

import java.io.IOException;

/** TODO: relax synchro!
 */
class TermVectorsReader {
  private FieldInfos fieldInfos;

  private InputStream tvx;
  private InputStream tvd;
  private InputStream tvf;
  private int size;

  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
    throws IOException {
    if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) {
      tvx = d.openFile(segment + TermVectorsWriter.TVX_EXTENSION);
      checkValidFormat(tvx);
      tvd = d.openFile(segment + TermVectorsWriter.TVD_EXTENSION);
      checkValidFormat(tvd);
      tvf = d.openFile(segment + TermVectorsWriter.TVF_EXTENSION);
      checkValidFormat(tvf);
      size = (int) tvx.length() / 8;
    }

    this.fieldInfos = fieldInfos;
  }
  
  private void checkValidFormat(InputStream in) throws IOException
  {
    int format = in.readInt();
    if (format > TermVectorsWriter.FORMAT_VERSION)
    {
      throw new IOException("Incompatible format version: " + format + " expected " 
              + TermVectorsWriter.FORMAT_VERSION + " or less");
    }
    
  }

  synchronized void close() throws IOException {
    // why don't we trap the exception and at least make sure that
    // all streams that we can close are closed?
    if (tvx != null) tvx.close();
    if (tvd != null) tvd.close();
    if (tvf != null) tvf.close();
  }

  /**
   * 
   * @return The number of documents in the reader
   */
  int size() {
    return size;
  }

  /**
   * Retrieve the term vector for the given document and field
   * @param docNum The document number to retrieve the vector for
   * @param field The field within the document to retrieve
   * @return The TermFreqVector for the document and field or null
   */ 
  synchronized TermFreqVector get(int docNum, String field) {
    // Check if no term vectors are available for this segment at all
    int fieldNumber = fieldInfos.fieldNumber(field);
    TermFreqVector result = null;
    if (tvx != null) {
      try {
        //We need to account for the FORMAT_SIZE at when seeking in the tvx
        //We don't need to do this in other seeks because we already have the file pointer
        //that was written in another file
        tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
        //System.out.println("TVX Pointer: " + tvx.getFilePointer());
        long position = tvx.readLong();

        tvd.seek(position);
        int fieldCount = tvd.readVInt();
        //System.out.println("Num Fields: " + fieldCount);
        // There are only a few fields per document. We opt for a full scan
        // rather then requiring that they be ordered. We need to read through
        // all of the fields anyway to get to the tvf pointers.
        int number = 0;
        int found = -1;
        for (int i = 0; i < fieldCount; i++) {
          number += tvd.readVInt();
          if (number == fieldNumber) found = i;
        }
  
        // This field, although valid in the segment, was not found in this document
        if (found != -1) {
          // Compute position in the tvf file
          position = 0;
          for (int i = 0; i <= found; i++)
          {
            position += tvd.readVLong();
          }
          result = readTermVector(field, position);
        }
        else {
          //System.out.println("Field not found");
        }
          
      } catch (Exception e) {
        //e.printStackTrace();
      }
    }
    else
    {
      System.out.println("No tvx file");
    }
    return result;
  }


  /** Return all term vectors stored for this document or null if the could not be read in. */
  synchronized TermFreqVector[] get(int docNum) {
    TermFreqVector[] result = null;
    // Check if no term vectors are available for this segment at all
    if (tvx != null) {
      try {
        //We need to offset by
        tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
        long position = tvx.readLong();

        tvd.seek(position);
        int fieldCount = tvd.readVInt();

        // No fields are vectorized for this document
        if (fieldCount != 0) {
          int number = 0;
          String[] fields = new String[fieldCount];

          for (int i = 0; i < fieldCount; i++) {
            number += tvd.readVInt();
            fields[i] = fieldInfos.fieldName(number);
          }
  
          // Compute position in the tvf file
          position = 0;
          long[] tvfPointers = new long[fieldCount];
          for (int i = 0; i < fieldCount; i++) {
            position += tvd.readVLong();
            tvfPointers[i] = position;
          }

          result = readTermVectors(fields, tvfPointers);
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    else
    {
      System.out.println("No tvx file");
    }
    return result;
  }


  private SegmentTermVector[] readTermVectors(String fields[], long tvfPointers[])
          throws IOException {
    SegmentTermVector res[] = new SegmentTermVector[fields.length];
    for (int i = 0; i < fields.length; i++) {
      res[i] = readTermVector(fields[i], tvfPointers[i]);
    }
    return res;
  }

  /**
   * 
   * @param fieldNum The field to read in
   * @param tvfPointer The pointer within the tvf file where we should start reading
   * @return The TermVector located at that position
   * @throws IOException
   */ 
  private SegmentTermVector readTermVector(String field, long tvfPointer)
          throws IOException {

    // Now read the data from specified position
    //We don't need to offset by the FORMAT here since the pointer already includes the offset
    tvf.seek(tvfPointer);

    int numTerms = tvf.readVInt();
    //System.out.println("Num Terms: " + numTerms);
    // If no terms - return a constant empty termvector
    if (numTerms == 0) return new SegmentTermVector(field, null, null);

    int length = numTerms + tvf.readVInt();

    String terms[] = new String[numTerms];
    
    int termFreqs[] = new int[numTerms];

    int start = 0;
    int deltaLength = 0;
    int totalLength = 0;
    char [] buffer = {};
    String previousString = "";
    for (int i = 0; i < numTerms; i++) {
      start = tvf.readVInt();
      deltaLength = tvf.readVInt();
      totalLength = start + deltaLength;
      if (buffer.length < totalLength)
      {
        buffer = new char[totalLength];
        for (int j = 0; j < previousString.length(); j++)  // copy contents
          buffer[j] = previousString.charAt(j);
      }
      tvf.readChars(buffer, start, deltaLength);
      terms[i] = new String(buffer, 0, totalLength);
      previousString = terms[i];
      termFreqs[i] = tvf.readVInt();
    }
    SegmentTermVector tv = new SegmentTermVector(field, terms, termFreqs);
    return tv;
  }

}
Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.
What this is

Other links

The source code

new blog posts

... this post is sponsored by my books ...
#1 New Release!	FP Best Seller