alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (CheckIndex.java)

This example Lucene source code file (CheckIndex.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

error, io, ioexception, lucene, lucene, mysegmenttermdocs, ok, runtimeexception, runtimeexception, segmentreader, status, string, string, text, throwable, throwable, util

The Lucene CheckIndex.java source code

package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.document.AbstractField;  // for javadocs
import org.apache.lucene.document.Document;

import java.text.NumberFormat;
import java.io.PrintStream;
import java.io.IOException;
import java.io.File;
import java.util.Collection;

import java.util.Comparator;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;

/**
 * Basic tool and API to check the health of an index and
 * write a new segments file that removes reference to
 * problematic segments.
 * 
 * <p>As this tool checks every byte in the index, on a large
 * index it can take quite a long time to run.
 *
 * @lucene.experimental Please make a complete backup of your
 * index before using this to fix your index!
 */
public class CheckIndex {

  private PrintStream infoStream;
  private Directory dir;

  /**
   * Returned from {@link #checkIndex()} detailing the health and status of the index.
   *
   * @lucene.experimental
   **/

  public static class Status {

    /** True if no problems were found with the index. */
    public boolean clean;

    /** True if we were unable to locate and load the segments_N file. */
    public boolean missingSegments;

    /** True if we were unable to open the segments_N file. */
    public boolean cantOpenSegments;

    /** True if we were unable to read the version number from segments_N file. */
    public boolean missingSegmentVersion;

    /** Name of latest segments_N file in the index. */
    public String segmentsFileName;

    /** Number of segments in the index. */
    public int numSegments;

    /** String description of the version of the index. */
    public String segmentFormat;

    /** Empty unless you passed specific segments list to check as optional 3rd argument.
     *  @see CheckIndex#checkIndex(List) */
    public List<String> segmentsChecked = new ArrayList();
  
    /** True if the index was created with a newer version of Lucene than the CheckIndex tool. */
    public boolean toolOutOfDate;

    /** List of {@link SegmentInfoStatus} instances, detailing status of each segment. */
    public List<SegmentInfoStatus> segmentInfos = new ArrayList();
  
    /** Directory index is in. */
    public Directory dir;

    /** 
     * SegmentInfos instance containing only segments that
     * had no problems (this is used with the {@link CheckIndex#fixIndex} 
     * method to repair the index. 
     */
    SegmentInfos newSegments;

    /** How many documents will be lost to bad segments. */
    public int totLoseDocCount;

    /** How many bad segments were found. */
    public int numBadSegments;

    /** True if we checked only specific segments ({@link
     * #checkIndex(List)}) was called with non-null
     * argument). */
    public boolean partial;

    /** The greatest segment name. */
    public int maxSegmentName;

    /** Whether the SegmentInfos.counter is greater than any of the segments' names. */
    public boolean validCounter; 

    /** Holds the userData of the last commit in the index */
    public Map<String, String> userData;

    /** Holds the status of each segment in the index.
     *  See {@link #segmentInfos}.
     *
     * <p>WARNING: this API is new and experimental and is
     * subject to suddenly change in the next release.
     */
    public static class SegmentInfoStatus {
      /** Name of the segment. */
      public String name;

      /** Document count (does not take deletions into account). */
      public int docCount;

      /** True if segment is compound file format. */
      public boolean compound;

      /** Number of files referenced by this segment. */
      public int numFiles;

      /** Net size (MB) of the files referenced by this
       *  segment. */
      public double sizeMB;

      /** Doc store offset, if this segment shares the doc
       *  store files (stored fields and term vectors) with
       *  other segments.  This is -1 if it does not share. */
      public int docStoreOffset = -1;
    
      /** String of the shared doc store segment, or null if
       *  this segment does not share the doc store files. */
      public String docStoreSegment;

      /** True if the shared doc store files are compound file
       *  format. */
      public boolean docStoreCompoundFile;

      /** True if this segment has pending deletions. */
      public boolean hasDeletions;

      /** Name of the current deletions file name. */
      public String deletionsFileName;
    
      /** Number of deleted documents. */
      public int numDeleted;

      /** True if we were able to open a SegmentReader on this
       *  segment. */
      public boolean openReaderPassed;

      /** Number of fields in this segment. */
      int numFields;

      /** True if at least one of the fields in this segment
       *  does not omitTermFreqAndPositions.
       *  @see AbstractField#setOmitTermFreqAndPositions */
      public boolean hasProx;

      /** Map that includes certain
       *  debugging details that IndexWriter records into
       *  each segment it creates */
      public Map<String,String> diagnostics;

      /** Status for testing of field norms (null if field norms could not be tested). */
      public FieldNormStatus fieldNormStatus;

      /** Status for testing of indexed terms (null if indexed terms could not be tested). */
      public TermIndexStatus termIndexStatus;

      /** Status for testing of stored fields (null if stored fields could not be tested). */
      public StoredFieldStatus storedFieldStatus;

      /** Status for testing of term vectors (null if term vectors could not be tested). */
      public TermVectorStatus termVectorStatus;
    }

    /**
     * Status from testing field norms.
     */
    public static final class FieldNormStatus {
      /** Number of fields successfully tested */
      public long totFields = 0L;

      /** Exception thrown during term index test (null on success) */
      public Throwable error = null;
    }

    /**
     * Status from testing term index.
     */
    public static final class TermIndexStatus {
      /** Total term count */
      public long termCount = 0L;

      /** Total frequency across all terms. */
      public long totFreq = 0L;
      
      /** Total number of positions. */
      public long totPos = 0L;

      /** Exception thrown during term index test (null on success) */
      public Throwable error = null;
    }

    /**
     * Status from testing stored fields.
     */
    public static final class StoredFieldStatus {
      
      /** Number of documents tested. */
      public int docCount = 0;
      
      /** Total number of stored fields tested. */
      public long totFields = 0;
      
      /** Exception thrown during stored fields test (null on success) */
      public Throwable error = null;
    }

    /**
     * Status from testing stored fields.
     */
    public static final class TermVectorStatus {
      
      /** Number of documents tested. */
      public int docCount = 0;
      
      /** Total number of term vectors tested. */
      public long totVectors = 0;
      
      /** Exception thrown during term vector test (null on success) */
      public Throwable error = null;
    }
  }

  /** Create a new CheckIndex on the directory. */
  public CheckIndex(Directory dir) {
    this.dir = dir;
    infoStream = null;
  }

  /** Set infoStream where messages should go.  If null, no
   *  messages are printed */
  public void setInfoStream(PrintStream out) {
    infoStream = out;
  }

  private void msg(String msg) {
    if (infoStream != null)
      infoStream.println(msg);
  }

  private static class MySegmentTermDocs extends SegmentTermDocs {

    int delCount;

    MySegmentTermDocs(SegmentReader p) {    
      super(p);
    }

    @Override
    public void seek(Term term) throws IOException {
      super.seek(term);
      delCount = 0;
    }

    @Override
    protected void skippingDoc() throws IOException {
      delCount++;
    }
  }

  /** Returns a {@link Status} instance detailing
   *  the state of the index.
   *
   *  <p>As this method checks every byte in the index, on a large
   *  index it can take quite a long time to run.
   *
   *  <p>WARNING: make sure
   *  you only call this when the index is not opened by any
   *  writer. */
  public Status checkIndex() throws IOException {
    return checkIndex(null);
  }

  /** Returns a {@link Status} instance detailing
   *  the state of the index.
   * 
   *  @param onlySegments list of specific segment names to check
   *
   *  <p>As this method checks every byte in the specified
   *  segments, on a large index it can take quite a long
   *  time to run.
   *
   *  <p>WARNING: make sure
   *  you only call this when the index is not opened by any
   *  writer. */
  public Status checkIndex(List<String> onlySegments) throws IOException {
    NumberFormat nf = NumberFormat.getInstance();
    SegmentInfos sis = new SegmentInfos();
    Status result = new Status();
    result.dir = dir;
    try {
      sis.read(dir);
    } catch (Throwable t) {
      msg("ERROR: could not read any segments file in directory");
      result.missingSegments = true;
      if (infoStream != null)
        t.printStackTrace(infoStream);
      return result;
    }

    // find the oldest and newest segment versions
    String oldest = Integer.toString(Integer.MAX_VALUE), newest = Integer.toString(Integer.MIN_VALUE);
    String oldSegs = null;
    boolean foundNonNullVersion = false;
    Comparator<String> versionComparator = StringHelper.getVersionComparator();
    for (SegmentInfo si : sis) {
      String version = si.getVersion();
      if (version == null) {
        // pre-3.1 segment
        oldSegs = "pre-3.1";
      } else if (version.equals("2.x")) {
        // an old segment that was 'touched' by 3.1+ code
        oldSegs = "2.x";
      } else {
        foundNonNullVersion = true;
        if (versionComparator.compare(version, oldest) < 0) {
          oldest = version;
        }
        if (versionComparator.compare(version, newest) > 0) {
          newest = version;
        }
      }
    }
    
    final int numSegments = sis.size();
    final String segmentsFileName = sis.getCurrentSegmentFileName();
    IndexInput input = null;
    try {
      input = dir.openInput(segmentsFileName);
    } catch (Throwable t) {
      msg("ERROR: could not open segments file in directory");
      if (infoStream != null)
        t.printStackTrace(infoStream);
      result.cantOpenSegments = true;
      return result;
    }
    int format = 0;
    try {
      format = input.readInt();
    } catch (Throwable t) {
      msg("ERROR: could not read segment file version in directory");
      if (infoStream != null)
        t.printStackTrace(infoStream);
      result.missingSegmentVersion = true;
      return result;
    } finally {
      if (input != null)
        input.close();
    }

    String sFormat = "";
    boolean skip = false;

    if (format == SegmentInfos.FORMAT)
      sFormat = "FORMAT [Lucene Pre-2.1]";
    if (format == SegmentInfos.FORMAT_LOCKLESS)
      sFormat = "FORMAT_LOCKLESS [Lucene 2.1]";
    else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE)
      sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]";
    else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE)
      sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]";
    else {
      if (format == SegmentInfos.FORMAT_CHECKSUM)
        sFormat = "FORMAT_CHECKSUM [Lucene 2.4]";
      else if (format == SegmentInfos.FORMAT_DEL_COUNT)
        sFormat = "FORMAT_DEL_COUNT [Lucene 2.4]";
      else if (format == SegmentInfos.FORMAT_HAS_PROX)
        sFormat = "FORMAT_HAS_PROX [Lucene 2.4]";
      else if (format == SegmentInfos.FORMAT_USER_DATA)
        sFormat = "FORMAT_USER_DATA [Lucene 2.9]";
      else if (format == SegmentInfos.FORMAT_DIAGNOSTICS)
        sFormat = "FORMAT_DIAGNOSTICS [Lucene 2.9]";
      else if (format == SegmentInfos.FORMAT_HAS_VECTORS)
        sFormat = "FORMAT_HAS_VECTORS [Lucene 3.1]";
      else if (format == SegmentInfos.FORMAT_3_1)
        sFormat = "FORMAT_3_1 [Lucene 3.1+]";
      else if (format == SegmentInfos.CURRENT_FORMAT)
        throw new RuntimeException("BUG: You should update this tool!");
      else if (format < SegmentInfos.CURRENT_FORMAT) {
        sFormat = "int=" + format + " [newer version of Lucene than this tool]";
        skip = true;
      } else {
        sFormat = format + " [Lucene 1.3 or prior]";
      }
    }

    result.segmentsFileName = segmentsFileName;
    result.numSegments = numSegments;
    result.segmentFormat = sFormat;
    result.userData = sis.getUserData();
    String userDataString;
    if (sis.getUserData().size() > 0) {
      userDataString = " userData=" + sis.getUserData();
    } else {
      userDataString = "";
    }

    String versionString = null;
    if (oldSegs != null) {
      if (foundNonNullVersion) {
        versionString = "versions=[" + oldSegs + " .. " + newest + "]";
      } else {
        versionString = "version=" + oldSegs;
      }
    } else {
      versionString = oldest.equals(newest) ? ( "version=" + oldest ) : ("versions=[" + oldest + " .. " + newest + "]");
    }
    
    msg("Segments file=" + segmentsFileName + " numSegments=" + numSegments
        + " " + versionString + " format=" + sFormat + userDataString);

    if (onlySegments != null) {
      result.partial = true;
      if (infoStream != null)
        infoStream.print("\nChecking only these segments:");
      for (String s : onlySegments) {
        if (infoStream != null)
          infoStream.print(" " + s);
      }
      result.segmentsChecked.addAll(onlySegments);
      msg(":");
    }

    if (skip) {
      msg("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting");
      result.toolOutOfDate = true;
      return result;
    }


    result.newSegments = (SegmentInfos) sis.clone();
    result.newSegments.clear();
    result.maxSegmentName = -1;

    for(int i=0;i<numSegments;i++) {
      final SegmentInfo info = sis.info(i);
      int segmentName = Integer.parseInt(info.name.substring(1), Character.MAX_RADIX);
      if (segmentName > result.maxSegmentName) {
        result.maxSegmentName = segmentName;
      }
      if (onlySegments != null && !onlySegments.contains(info.name))
        continue;
      Status.SegmentInfoStatus segInfoStat = new Status.SegmentInfoStatus();
      result.segmentInfos.add(segInfoStat);
      msg("  " + (1+i) + " of " + numSegments + ": name=" + info.name + " docCount=" + info.docCount);
      segInfoStat.name = info.name;
      segInfoStat.docCount = info.docCount;

      int toLoseDocCount = info.docCount;

      SegmentReader reader = null;

      try {
        msg("    compound=" + info.getUseCompoundFile());
        segInfoStat.compound = info.getUseCompoundFile();
        msg("    hasProx=" + info.getHasProx());
        segInfoStat.hasProx = info.getHasProx();
        msg("    numFiles=" + info.files().size());
        segInfoStat.numFiles = info.files().size();
        segInfoStat.sizeMB = info.sizeInBytes(true)/(1024.*1024.);
        msg("    size (MB)=" + nf.format(segInfoStat.sizeMB));
        Map<String,String> diagnostics = info.getDiagnostics();
        segInfoStat.diagnostics = diagnostics;
        if (diagnostics.size() > 0) {
          msg("    diagnostics = " + diagnostics);
        }

        final int docStoreOffset = info.getDocStoreOffset();
        if (docStoreOffset != -1) {
          msg("    docStoreOffset=" + docStoreOffset);
          segInfoStat.docStoreOffset = docStoreOffset;
          msg("    docStoreSegment=" + info.getDocStoreSegment());
          segInfoStat.docStoreSegment = info.getDocStoreSegment();
          msg("    docStoreIsCompoundFile=" + info.getDocStoreIsCompoundFile());
          segInfoStat.docStoreCompoundFile = info.getDocStoreIsCompoundFile();
        }
        final String delFileName = info.getDelFileName();
        if (delFileName == null){
          msg("    no deletions");
          segInfoStat.hasDeletions = false;
        }
        else{
          msg("    has deletions [delFileName=" + delFileName + "]");
          segInfoStat.hasDeletions = true;
          segInfoStat.deletionsFileName = delFileName;
        }
        if (infoStream != null)
          infoStream.print("    test: open reader.........");
        reader = SegmentReader.get(true, info, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR);

        segInfoStat.openReaderPassed = true;

        final int numDocs = reader.numDocs();
        toLoseDocCount = numDocs;
        if (reader.hasDeletions()) {
          if (reader.deletedDocs.count() != info.getDelCount()) {
            throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs deletedDocs.count()=" + reader.deletedDocs.count());
          }
          if (reader.deletedDocs.count() > reader.maxDoc()) {
            throw new RuntimeException("too many deleted docs: maxDoc()=" + reader.maxDoc() + " vs deletedDocs.count()=" + reader.deletedDocs.count());
          }
          if (info.docCount - numDocs != info.getDelCount()){
            throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
          }
          segInfoStat.numDeleted = info.docCount - numDocs;
          msg("OK [" + (segInfoStat.numDeleted) + " deleted docs]");
        } else {
          if (info.getDelCount() != 0) {
            throw new RuntimeException("delete count mismatch: info=" + info.getDelCount() + " vs reader=" + (info.docCount - numDocs));
          }
          msg("OK");
        }
        if (reader.maxDoc() != info.docCount)
          throw new RuntimeException("SegmentReader.maxDoc() " + reader.maxDoc() + " != SegmentInfos.docCount " + info.docCount);

        // Test getFieldNames()
        if (infoStream != null) {
          infoStream.print("    test: fields..............");
        }         
        Collection<String> fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
        msg("OK [" + fieldNames.size() + " fields]");
        segInfoStat.numFields = fieldNames.size();
        
        // Test Field Norms
        segInfoStat.fieldNormStatus = testFieldNorms(fieldNames, reader);

        // Test the Term Index
        segInfoStat.termIndexStatus = testTermIndex(info, reader);

        // Test Stored Fields
        segInfoStat.storedFieldStatus = testStoredFields(info, reader, nf);

        // Test Term Vectors
        segInfoStat.termVectorStatus = testTermVectors(info, reader, nf);

        // Rethrow the first exception we encountered
        //  This will cause stats for failed segments to be incremented properly
        if (segInfoStat.fieldNormStatus.error != null) {
          throw new RuntimeException("Field Norm test failed");
        } else if (segInfoStat.termIndexStatus.error != null) {
          throw new RuntimeException("Term Index test failed");
        } else if (segInfoStat.storedFieldStatus.error != null) {
          throw new RuntimeException("Stored Field test failed");
        } else if (segInfoStat.termVectorStatus.error != null) {
          throw new RuntimeException("Term Vector test failed");
        }

        msg("");

      } catch (Throwable t) {
        msg("FAILED");
        String comment;
        comment = "fixIndex() would remove reference to this segment";
        msg("    WARNING: " + comment + "; full exception:");
        if (infoStream != null)
          t.printStackTrace(infoStream);
        msg("");
        result.totLoseDocCount += toLoseDocCount;
        result.numBadSegments++;
        continue;
      } finally {
        if (reader != null)
          reader.close();
      }

      // Keeper
      result.newSegments.add((SegmentInfo) info.clone());
    }

    if (0 == result.numBadSegments) {
      result.clean = true;
    } else
      msg("WARNING: " + result.numBadSegments + " broken segments (containing " + result.totLoseDocCount + " documents) detected");

    if ( ! (result.validCounter = (result.maxSegmentName < sis.counter))) {
      result.clean = false;
      result.newSegments.counter = result.maxSegmentName + 1; 
      msg("ERROR: Next segment name counter " + sis.counter + " is not greater than max segment name " + result.maxSegmentName);
    }
    
    if (result.clean) {
      msg("No problems were detected with this index.\n");
    }

    return result;
  }

  /**
   * Test field norms.
   */
  private Status.FieldNormStatus testFieldNorms(Collection<String> fieldNames, SegmentReader reader) {
    final Status.FieldNormStatus status = new Status.FieldNormStatus();

    try {
      // Test Field Norms
      if (infoStream != null) {
        infoStream.print("    test: field norms.........");
      }
      final byte[] b = new byte[reader.maxDoc()];
      for (final String fieldName : fieldNames) {
        if (reader.hasNorms(fieldName)) {
          reader.norms(fieldName, b, 0);
          ++status.totFields;
        }
      }

      msg("OK [" + status.totFields + " fields]");
    } catch (Throwable e) {
      msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
      status.error = e;
      if (infoStream != null) {
        e.printStackTrace(infoStream);
      }
    }

    return status;
  }

  /**
   * Test the term index.
   */
  private Status.TermIndexStatus testTermIndex(SegmentInfo info, SegmentReader reader) {
    final Status.TermIndexStatus status = new Status.TermIndexStatus();

    final IndexSearcher is = new IndexSearcher(reader);

    try {
      if (infoStream != null) {
        infoStream.print("    test: terms, freq, prox...");
      }

      final TermEnum termEnum = reader.terms();
      final TermPositions termPositions = reader.termPositions();
      final int postingsSkipInterval = reader.getPostingsSkipInterval();

      // Used only to count up # deleted docs for this term
      final MySegmentTermDocs myTermDocs = new MySegmentTermDocs(reader);

      final int maxDoc = reader.maxDoc();
      Term lastTerm = null;
      while (termEnum.next()) {
        status.termCount++;
        final Term term = termEnum.term();
        lastTerm = term;

        final int docFreq = termEnum.docFreq();
        termPositions.seek(term);
        int lastDoc = -1;
        int freq0 = 0;
        status.totFreq += docFreq;
        while (termPositions.next()) {
          freq0++;
          final int doc = termPositions.doc();
          final int freq = termPositions.freq();
          if (doc <= lastDoc)
            throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
          if (doc >= maxDoc)
            throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc);

          lastDoc = doc;
          if (freq <= 0)
            throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
            
          int lastPos = -1;
          status.totPos += freq;
          for(int j=0;j<freq;j++) {
            final int pos = termPositions.nextPosition();
            if (pos < -1)
              throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
            if (pos < lastPos)
              throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
            lastPos = pos;
          }
        }

        // Test skipping
        if (docFreq >= postingsSkipInterval) {
          
          for(int idx=0;idx<7;idx++) {
            final int skipDocID = (int) (((idx+1)*(long) maxDoc)/8);
            termPositions.seek(term);
            if (!termPositions.skipTo(skipDocID)) {
              break;
            } else {

              final int docID = termPositions.doc();
              if (docID < skipDocID) {
                throw new RuntimeException("term " + term + ": skipTo(docID=" + skipDocID + ") returned docID=" + docID);
              }
              final int freq = termPositions.freq();
              if (freq <= 0) {
                throw new RuntimeException("termFreq " + freq + " is out of bounds");
              }
              int lastPosition = -1;
              for(int posUpto=0;posUpto<freq;posUpto++) {
                final int pos = termPositions.nextPosition();
                if (pos < 0) {
                  throw new RuntimeException("position " + pos + " is out of bounds");
                }
                if (pos <= lastPosition) {
                  throw new RuntimeException("position " + pos + " is <= lastPosition " + lastPosition);
                }
                lastPosition = pos;
              } 

              if (!termPositions.next()) {
                break;
              }
              final int nextDocID = termPositions.doc();
              if (nextDocID <= docID) {
                throw new RuntimeException("term " + term + ": skipTo(docID=" + skipDocID + "), then .next() returned docID=" + nextDocID + " vs prev docID=" + docID);
              }
            }
          }
        }

        // Now count how many deleted docs occurred in
        // this term:
        final int delCount;
        if (reader.hasDeletions()) {
          myTermDocs.seek(term);
          while(myTermDocs.next()) { }
          delCount = myTermDocs.delCount;
        } else {
          delCount = 0; 
        }

        if (freq0 + delCount != docFreq) {
          throw new RuntimeException("term " + term + " docFreq=" + 
                                     docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
        }
      }

      // Test search on last term:
      if (lastTerm != null) {
        is.search(new TermQuery(lastTerm), 1);
      }

      msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");

    } catch (Throwable e) {
      msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
      status.error = e;
      if (infoStream != null) {
        e.printStackTrace(infoStream);
      }
    }

    return status;
  }
  
  /**
   * Test stored fields for a segment.
   */
  private Status.StoredFieldStatus testStoredFields(SegmentInfo info, SegmentReader reader, NumberFormat format) {
    final Status.StoredFieldStatus status = new Status.StoredFieldStatus();

    try {
      if (infoStream != null) {
        infoStream.print("    test: stored fields.......");
      }

      // Scan stored fields for all documents
      for (int j = 0; j < info.docCount; ++j) {
        if (!reader.isDeleted(j)) {
          status.docCount++;
          Document doc = reader.document(j);
          status.totFields += doc.getFields().size();
        }
      }      

      // Validate docCount
      if (status.docCount != reader.numDocs()) {
        throw new RuntimeException("docCount=" + status.docCount + " but saw " + status.docCount + " undeleted docs");
      }

      msg("OK [" + status.totFields + " total field count; avg " + 
          format.format((((float) status.totFields)/status.docCount)) + " fields per doc]");      
    } catch (Throwable e) {
      msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
      status.error = e;
      if (infoStream != null) {
        e.printStackTrace(infoStream);
      }
    }

    return status;
  }

  /**
   * Test term vectors for a segment.
   */
  private Status.TermVectorStatus testTermVectors(SegmentInfo info, SegmentReader reader, NumberFormat format) {
    final Status.TermVectorStatus status = new Status.TermVectorStatus();
    
    try {
      if (infoStream != null) {
        infoStream.print("    test: term vectors........");
      }

      for (int j = 0; j < info.docCount; ++j) {
        if (!reader.isDeleted(j)) {
          status.docCount++;
          TermFreqVector[] tfv = reader.getTermFreqVectors(j);
          if (tfv != null) {
            status.totVectors += tfv.length;
          }
        }
      }
      
      msg("OK [" + status.totVectors + " total vector count; avg " + 
          format.format((((float) status.totVectors) / status.docCount)) + " term/freq vector fields per doc]");
    } catch (Throwable e) {
      msg("ERROR [" + String.valueOf(e.getMessage()) + "]");
      status.error = e;
      if (infoStream != null) {
        e.printStackTrace(infoStream);
      }
    }
    
    return status;
  }

  /** Repairs the index using previously returned result
   *  from {@link #checkIndex}.  Note that this does not
   *  remove any of the unreferenced files after it's done;
   *  you must separately open an {@link IndexWriter}, which
   *  deletes unreferenced files when it's created.
   *
   * <p>WARNING: this writes a
   *  new segments file into the index, effectively removing
   *  all documents in broken segments from the index.
   *  BE CAREFUL.
   *
   * <p>WARNING: Make sure you only call this when the
   *  index is not opened  by any writer. */
  public void fixIndex(Status result) throws IOException {
    if (result.partial)
      throw new IllegalArgumentException("can only fix an index that was fully checked (this status checked a subset of segments)");
    result.newSegments.changed();
    result.newSegments.commit(result.dir);
  }

  private static boolean assertsOn;

  private static boolean testAsserts() {
    assertsOn = true;
    return true;
  }

  private static boolean assertsOn() {
    assert testAsserts();
    return assertsOn;
  }

  /** Command-line interface to check and fix an index.

    <p>
    Run it like this:
    <pre>
    java -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]
    </pre>
    <ul>
    <li>-fix: actually write a new segments_N file, removing any problematic segments

    <li>-segment X: only check the specified
    segment(s).  This can be specified multiple times,
    to check more than one segment, eg <code>-segment _2
    -segment _a</code>.  You can't use this with the -fix
    option.
    </ul>

    <p>WARNING: -fix should only be used on an emergency basis as it will cause
                       documents (perhaps many) to be permanently removed from the index.  Always make
                       a backup copy of your index before running this!  Do not run this tool on an index
                       that is actively being written to.  You have been warned!

    <p>                Run without -fix, this tool will open the index, report version information
                       and report any exceptions it hits and what action it would take if -fix were
                       specified.  With -fix, this tool will remove any segments that have issues and
                       write a new segments_N file.  This means all documents contained in the affected
                       segments will be removed.

    <p>
                       This tool exits with exit code 1 if the index cannot be opened or has any
                       corruption, else 0.
   */
  public static void main(String[] args) throws IOException, InterruptedException {

    boolean doFix = false;
    List<String> onlySegments = new ArrayList();
    String indexPath = null;
    int i = 0;
    while(i < args.length) {
      if (args[i].equals("-fix")) {
        doFix = true;
        i++;
      } else if (args[i].equals("-segment")) {
        if (i == args.length-1) {
          System.out.println("ERROR: missing name for -segment option");
          System.exit(1);
        }
        onlySegments.add(args[i+1]);
        i += 2;
      } else {
        if (indexPath != null) {
          System.out.println("ERROR: unexpected extra argument '" + args[i] + "'");
          System.exit(1);
        }
        indexPath = args[i];
        i++;
      }
    }

    if (indexPath == null) {
      System.out.println("\nERROR: index path not specified");
      System.out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]\n" +
                         "\n" +
                         "  -fix: actually write a new segments_N file, removing any problematic segments\n" +
                         "  -segment X: only check the specified segments.  This can be specified multiple\n" + 
                         "              times, to check more than one segment, eg '-segment _2 -segment _a'.\n" +
                         "              You can't use this with the -fix option\n" +
                         "\n" + 
                         "**WARNING**: -fix should only be used on an emergency basis as it will cause\n" +
                         "documents (perhaps many) to be permanently removed from the index.  Always make\n" +
                         "a backup copy of your index before running this!  Do not run this tool on an index\n" +
                         "that is actively being written to.  You have been warned!\n" +
                         "\n" +
                         "Run without -fix, this tool will open the index, report version information\n" +
                         "and report any exceptions it hits and what action it would take if -fix were\n" +
                         "specified.  With -fix, this tool will remove any segments that have issues and\n" + 
                         "write a new segments_N file.  This means all documents contained in the affected\n" +
                         "segments will be removed.\n" +
                         "\n" +
                         "This tool exits with exit code 1 if the index cannot be opened or has any\n" +
                         "corruption, else 0.\n");
      System.exit(1);
    }

    if (!assertsOn())
      System.out.println("\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled");

    if (onlySegments.size() == 0)
      onlySegments = null;
    else if (doFix) {
      System.out.println("ERROR: cannot specify both -fix and -segment");
      System.exit(1);
    }

    System.out.println("\nOpening index @ " + indexPath + "\n");
    Directory dir = null;
    try {
      dir = FSDirectory.open(new File(indexPath));
    } catch (Throwable t) {
      System.out.println("ERROR: could not open directory \"" + indexPath + "\"; exiting");
      t.printStackTrace(System.out);
      System.exit(1);
    }

    CheckIndex checker = new CheckIndex(dir);
    checker.setInfoStream(System.out);

    Status result = checker.checkIndex(onlySegments);
    if (result.missingSegments) {
      System.exit(1);
    }

    if (!result.clean) {
      if (!doFix) {
        System.out.println("WARNING: would write new segments file, and " + result.totLoseDocCount + " documents would be lost, if -fix were specified\n");
      } else {
        System.out.println("WARNING: " + result.totLoseDocCount + " documents will be lost\n");
        System.out.println("NOTE: will write new segments file in 5 seconds; this will remove " + result.totLoseDocCount + " docs from the index. THIS IS YOUR LAST CHANCE TO CTRL+C!");
        for(int s=0;s<5;s++) {
          Thread.sleep(1000);
          System.out.println("  " + (5-s) + "...");
        }
        System.out.println("Writing...");
        checker.fixIndex(result);
        System.out.println("OK");
        System.out.println("Wrote new segments file \"" + result.newSegments.getCurrentSegmentFileName() + "\"");
      }
    }
    System.out.println("");

    final int exitCode;
    if (result.clean == true)
      exitCode = 0;
    else
      exitCode = 1;
    System.exit(exitCode);
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene CheckIndex.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.