
Lucene example source code file (LogMergePolicy.java)

This example Lucene source code file (LogMergePolicy.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

arraylist, indexwriter, io, ioexception, list, logmergepolicy, mergespecification, override, segmentinfo, segmentinfoandlevel, string, util

The Lucene LogMergePolicy.java source code

package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;

/** <p>This class implements a {@link MergePolicy} that tries
 *  to merge segments into levels of exponentially
 *  increasing size, where each level has fewer segments than
 *  the value of the merge factor. Whenever extra segments
 *  (beyond the merge factor upper bound) are encountered,
 *  all segments within the level are merged. You can get or
 *  set the merge factor using {@link #getMergeFactor()} and
 *  {@link #setMergeFactor(int)} respectively.</p>
 *
 * <p>This class is abstract and requires a subclass to
 * define the {@link #size} method which specifies how a
 * segment's size is determined.  {@link LogDocMergePolicy}
 * is one subclass that measures size by document count in
 * the segment.  {@link LogByteSizeMergePolicy} is another
 * subclass that measures size as the total byte size of the
 * file(s) for the segment.</p>
 */

public abstract class LogMergePolicy extends MergePolicy {

  /** Defines the allowed range of log(size) for each
   *  level.  A level is computed by taking the max segment
   *  log size, minus LEVEL_LOG_SPAN, and finding all
   *  segments falling within that range. */
  public static final double LEVEL_LOG_SPAN = 0.75;
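  // Example: if the largest segment in a run sits at level 5.0, every
  // segment whose level is >= 5.0 - 0.75 = 4.25 falls into that same level.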

  /** Default merge factor, which is how many segments are
   *  merged at a time */
  public static final int DEFAULT_MERGE_FACTOR = 10;

  /** Default maximum segment size.  A segment of this size
   *  or larger will never be merged.  @see #setMaxMergeDocs */
  public static final int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE;

  /** Default noCFSRatio.  If a merge's size is >= 10% of
   *  the index, then we disable compound file for it.
   *  @see #setNoCFSRatio */
  public static final double DEFAULT_NO_CFS_RATIO = 0.1;
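  // Example: with the default 0.1 ratio, a merge producing a 150 MB segment
  // in a 1 GB index is written as non-compound, since 150 MB > 10% of 1 GB.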

  protected int mergeFactor = DEFAULT_MERGE_FACTOR;

  protected long minMergeSize;
  protected long maxMergeSize;
  // Although the core MPs set it explicitly, we must default in case someone
  // out there wrote his own LMP ...
  protected long maxMergeSizeForOptimize = Long.MAX_VALUE;
  protected int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;

  protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;

  protected boolean calibrateSizeByDeletes = true;
  
  protected boolean useCompoundFile = true;

  public LogMergePolicy() {
    super();
  }

  protected boolean verbose() {
    IndexWriter w = writer.get();
    return w != null && w.verbose();
  }

  /** @see #setNoCFSRatio */
  public double getNoCFSRatio() {
    return noCFSRatio;
  }

  /** If a merged segment will be more than this percentage
   *  of the total size of the index, leave the segment as
   *  non-compound file even if compound file is enabled.
   *  Set to 1.0 to always use CFS regardless of merge
   *  size. */
  public void setNoCFSRatio(double noCFSRatio) {
    if (noCFSRatio < 0.0 || noCFSRatio > 1.0) {
      throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
    }
    this.noCFSRatio = noCFSRatio;
  }
  
  protected void message(String message) {
    if (verbose())
      writer.get().message("LMP: " + message);
  }

  /** <p>Returns the number of segments that are merged at
   * once and also controls the total number of segments
   * allowed to accumulate in the index.</p> */
  public int getMergeFactor() {
    return mergeFactor;
  }

  /** Determines how often segment indices are merged by
   * addDocument().  With smaller values, less RAM is used
   * while indexing, and searches on unoptimized indices are
   * faster, but indexing speed is slower.  With larger
   * values, more RAM is used during indexing, and while
   * searches on unoptimized indices are slower, indexing is
   * faster.  Thus larger values (> 10) are best for batch
   * index creation, and smaller values (< 10) for indices
   * that are interactively maintained. */
  public void setMergeFactor(int mergeFactor) {
    if (mergeFactor < 2)
      throw new IllegalArgumentException("mergeFactor cannot be less than 2");
    this.mergeFactor = mergeFactor;
  }

  // Javadoc inherited
  @Override
  public boolean useCompoundFile(SegmentInfos infos, SegmentInfo mergedInfo) throws IOException {
    final boolean doCFS;

    if (!useCompoundFile) {
      doCFS = false;
    } else if (noCFSRatio == 1.0) {
      doCFS = true;
    } else {
      long totalSize = 0;
      for (SegmentInfo info : infos)
        totalSize += size(info);

      doCFS = size(mergedInfo) <= noCFSRatio * totalSize;
    }
    return doCFS;
  }

  /** Sets whether compound file format should be used for
   *  newly flushed and newly merged segments. */
  public void setUseCompoundFile(boolean useCompoundFile) {
    this.useCompoundFile = useCompoundFile;
  }

  /** Returns true if newly flushed and newly merged segments
   *  are written in compound file format. @see
   *  #setUseCompoundFile */
  public boolean getUseCompoundFile() {
    return useCompoundFile;
  }

  /** Sets whether the segment size should be calibrated by
   *  the number of deletes when choosing segments for merge. */
  public void setCalibrateSizeByDeletes(boolean calibrateSizeByDeletes) {
    this.calibrateSizeByDeletes = calibrateSizeByDeletes;
  }

  /** Returns true if the segment size should be calibrated 
   *  by the number of deletes when choosing segments for merge. */
  public boolean getCalibrateSizeByDeletes() {
    return calibrateSizeByDeletes;
  }

  @Override
  public void close() {}

  protected abstract long size(SegmentInfo info) throws IOException;

  protected long sizeDocs(SegmentInfo info) throws IOException {
    if (calibrateSizeByDeletes) {
      int delCount = writer.get().numDeletedDocs(info);
      assert delCount <= info.docCount;
      return (info.docCount - (long)delCount);
    } else {
      return info.docCount;
    }
  }
  
  protected long sizeBytes(SegmentInfo info) throws IOException {
    long byteSize = info.sizeInBytes(true);
    if (calibrateSizeByDeletes) {
      int delCount = writer.get().numDeletedDocs(info);
      double delRatio = (info.docCount <= 0 ? 0.0f : ((float)delCount / (float)info.docCount));
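      // Example: a segment with 100 docs of which 25 are deleted has
      // delRatio 0.25, so its byte size is counted at 75% of on-disk size.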
      assert delRatio <= 1.0;
      return (info.docCount <= 0 ?  byteSize : (long)(byteSize * (1.0 - delRatio)));
    } else {
      return byteSize;
    }
  }
  
  protected boolean isOptimized(SegmentInfos infos, int maxNumSegments, Map<SegmentInfo,Boolean> segmentsToOptimize) throws IOException {
    final int numSegments = infos.size();
    int numToOptimize = 0;
    SegmentInfo optimizeInfo = null;
    boolean segmentIsOriginal = false;
    for(int i=0;i<numSegments && numToOptimize <= maxNumSegments;i++) {
      final SegmentInfo info = infos.info(i);
      final Boolean isOriginal = segmentsToOptimize.get(info);
      if (isOriginal != null) {
        segmentIsOriginal = isOriginal;
        numToOptimize++;
        optimizeInfo = info;
      }
    }

    return numToOptimize <= maxNumSegments &&
      (numToOptimize != 1 || !segmentIsOriginal || isOptimized(optimizeInfo));
  }

  /** Returns true if this single info is optimized (has no
   *  pending norms or deletes, is in the same dir as the
   *  writer, and matches the current compound file setting). */
  protected boolean isOptimized(SegmentInfo info)
    throws IOException {
    IndexWriter w = writer.get();
    assert w != null;
    boolean hasDeletions = w.numDeletedDocs(info) > 0;
    return !hasDeletions &&
      !info.hasSeparateNorms() &&
      info.dir == w.getDirectory() &&
      (info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
  }

  /**
   * Returns the merges necessary to optimize the index, taking the max merge
   * size or max merge docs into consideration. This method attempts to respect
   * the {@code maxNumSegments} parameter; however, due to size constraints,
   * more than that number of segments may remain in the index. Also, this
   * method does not guarantee that exactly {@code maxNumSegments} segments
   * will remain, only that no more than that number will.
   */
  private MergeSpecification findMergesForOptimizeSizeLimit(
      SegmentInfos infos, int maxNumSegments, int last) throws IOException {
    MergeSpecification spec = new MergeSpecification();
    final List<SegmentInfo> segments = infos.asList();

    int start = last - 1;
    while (start >= 0) {
      SegmentInfo info = infos.info(start);
      if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
        if (verbose()) {
          message("optimize: skip segment=" + info + ": size is > maxMergeSize (" + maxMergeSizeForOptimize + ") or sizeDocs is > maxMergeDocs (" + maxMergeDocs + ")");
        }
        // need to skip that segment + add a merge for the 'right' segments,
        // unless there is only 1 which is optimized.
        if (last - start - 1 > 1 || (start != last - 1 && !isOptimized(infos.info(start + 1)))) {
          // there is more than 1 segment to the right of this one, or an unoptimized single segment.
          spec.add(new OneMerge(segments.subList(start + 1, last)));
        }
        last = start;
      } else if (last - start == mergeFactor) {
        // mergeFactor eligible segments were found, add them as a merge.
        spec.add(new OneMerge(segments.subList(start, last)));
        last = start;
      }
      --start;
    }

    // Add any left-over segments, unless there is just 1 already optimized.
    if (last > 0 && (++start + 1 < last || !isOptimized(infos.info(start)))) {
      spec.add(new OneMerge(segments.subList(start, last)));
    }

    return spec.merges.size() == 0 ? null : spec;
  }
  
  /**
   * Returns the merges necessary to optimize the index. This method constrains
   * the returned merges only by the {@code maxNumSegments} parameter, and
   * guarantees that exactly that number of segments will remain in the index.
   */
  private MergeSpecification findMergesForOptimizeMaxNumSegments(SegmentInfos infos, int maxNumSegments, int last) throws IOException {
    MergeSpecification spec = new MergeSpecification();
    final List<SegmentInfo> segments = infos.asList();

    // First, enroll all "full" merges (mergeFactor segments
    // apiece) to potentially be run concurrently:
    while (last - maxNumSegments + 1 >= mergeFactor) {
      spec.add(new OneMerge(segments.subList(last - mergeFactor, last)));
      last -= mergeFactor;
    }
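    // Example: with last=25, maxNumSegments=5 and mergeFactor=10, this loop
    // enrolls two 10-segment merges (segments 15-24, then 5-14) and leaves
    // the remainder for a later pass.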

    // Only if there are no full merges pending do we
    // add a final partial (< mergeFactor segments) merge:
    if (0 == spec.merges.size()) {
      if (maxNumSegments == 1) {

        // Since we must optimize down to 1 segment, the
        // choice is simple:
        if (last > 1 || !isOptimized(infos.info(0))) {
          spec.add(new OneMerge(segments.subList(0, last)));
        }
      } else if (last > maxNumSegments) {

        // Take care to pick a partial merge that is
        // least cost, but does not make the index too
        // lopsided.  If we always just picked the
        // partial tail then we could produce a highly
        // lopsided index over time:

        // We must merge this many segments to leave
        // maxNumSegments in the index (from when
        // optimize was first kicked off):
        final int finalMergeSize = last - maxNumSegments + 1;

        // Consider all possible starting points:
        long bestSize = 0;
        int bestStart = 0;

        for(int i=0;i<last-finalMergeSize+1;i++) {
          long sumSize = 0;
          for(int j=0;j<finalMergeSize;j++)
            sumSize += size(infos.info(j+i));
          if (i == 0 || (sumSize < 2*size(infos.info(i-1)) && sumSize < bestSize)) {
            bestStart = i;
            bestSize = sumSize;
          }
        }

        spec.add(new OneMerge(segments.subList(bestStart, bestStart + finalMergeSize)));
      }
    }
    return spec.merges.size() == 0 ? null : spec;
  }
  
  /** Returns the merges necessary to optimize the index.
   *  This merge policy defines "optimized" to mean only the
   *  requested number of segments is left in the index, and
   *  respects the {@link #maxMergeSizeForOptimize} setting.
   *  By default, and assuming {@code maxNumSegments=1}, only
   *  one segment will be left in the index, where that segment
   *  has no deletions pending nor separate norms, and it is in
   *  compound file format if the current useCompoundFile
   *  setting is true.  This method returns multiple merges
   *  (mergeFactor at a time) so the {@link MergeScheduler}
   *  in use may make use of concurrency. */
  @Override
  public MergeSpecification findMergesForOptimize(SegmentInfos infos,
            int maxNumSegments, Map<SegmentInfo,Boolean> segmentsToOptimize) throws IOException {

    assert maxNumSegments > 0;
    if (verbose()) {
      message("findMergesForOptimize: maxNumSegs=" + maxNumSegments + " segsToOptimize="+ segmentsToOptimize);
    }

    // If the segments are already optimized (e.g. there's only 1 segment), or
    // there are < maxNumSegments segments, all optimized, nothing to do.
    if (isOptimized(infos, maxNumSegments, segmentsToOptimize)) return null;

    // Find the newest (rightmost) segment that needs to
    // be optimized (other segments may have been flushed
    // since optimize started):
    int last = infos.size();
    while (last > 0) {
      final SegmentInfo info = infos.info(--last);
      if (segmentsToOptimize.get(info) != null) {
        last++;
        break;
      }
    }

    if (last == 0) return null;
    
    // There is only one segment already, and it is optimized
    if (maxNumSegments == 1 && last == 1 && isOptimized(infos.info(0))) return null;

    // Check if there are any segments above the threshold
    boolean anyTooLarge = false;
    for (int i = 0; i < last; i++) {
      SegmentInfo info = infos.info(i);
      if (size(info) > maxMergeSizeForOptimize || sizeDocs(info) > maxMergeDocs) {
        anyTooLarge = true;
        break;
      }
    }

    if (anyTooLarge) {
      return findMergesForOptimizeSizeLimit(infos, maxNumSegments, last);
    } else {
      return findMergesForOptimizeMaxNumSegments(infos, maxNumSegments, last);
    }
  }

  /**
   * Finds merges necessary to expunge all deletes from the
   * index.  We simply merge adjacent segments that have
   * deletes, up to mergeFactor at a time.
   */ 
  @Override
  public MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos)
      throws CorruptIndexException, IOException {
    final List<SegmentInfo> segments = segmentInfos.asList();
    final int numSegments = segments.size();

    if (verbose())
      message("findMergesToExpungeDeletes: " + numSegments + " segments");

    MergeSpecification spec = new MergeSpecification();
    int firstSegmentWithDeletions = -1;
    IndexWriter w = writer.get();
    assert w != null;
    for(int i=0;i<numSegments;i++) {
      final SegmentInfo info = segmentInfos.info(i);
      int delCount = w.numDeletedDocs(info);
      if (delCount > 0) {
        if (verbose())
          message("  segment " + info.name + " has deletions");
        if (firstSegmentWithDeletions == -1)
          firstSegmentWithDeletions = i;
        else if (i - firstSegmentWithDeletions == mergeFactor) {
          // We've seen mergeFactor segments in a row with
          // deletions, so force a merge now:
          if (verbose())
            message("  add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
          spec.add(new OneMerge(segments.subList(firstSegmentWithDeletions, i)));
          firstSegmentWithDeletions = i;
        }
      } else if (firstSegmentWithDeletions != -1) {
        // End of a sequence of segments with deletions, so,
        // merge those past segments even if it's fewer than
        // mergeFactor segments
        if (verbose())
          message("  add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
        spec.add(new OneMerge(segments.subList(firstSegmentWithDeletions, i)));
        firstSegmentWithDeletions = -1;
      }
    }

    if (firstSegmentWithDeletions != -1) {
      if (verbose())
        message("  add merge " + firstSegmentWithDeletions + " to " + (numSegments-1) + " inclusive");
      spec.add(new OneMerge(segments.subList(firstSegmentWithDeletions, numSegments)));
    }

    return spec;
  }

  private static class SegmentInfoAndLevel implements Comparable<SegmentInfoAndLevel> {
    SegmentInfo info;
    float level;
    int index;
    
    public SegmentInfoAndLevel(SegmentInfo info, float level, int index) {
      this.info = info;
      this.level = level;
      this.index = index;
    }

    // Sorts largest to smallest
    public int compareTo(SegmentInfoAndLevel other) {
      if (level < other.level)
        return 1;
      else if (level > other.level)
        return -1;
      else
        return 0;
    }
  }

  /** Checks if any merges are now necessary and returns a
   *  {@link MergePolicy.MergeSpecification} if so.  A merge
   *  is necessary when there are more than {@link
   *  #setMergeFactor} segments at a given level.  When
   *  multiple levels have too many segments, this method
   *  will return multiple merges, allowing the {@link
   *  MergeScheduler} to use concurrency. */
  @Override
  public MergeSpecification findMerges(SegmentInfos infos) throws IOException {

    final int numSegments = infos.size();
    if (verbose())
      message("findMerges: " + numSegments + " segments");

    // Compute levels, which is just log (base mergeFactor)
    // of the size of each segment
    final List<SegmentInfoAndLevel> levels = new ArrayList<SegmentInfoAndLevel>();
    final float norm = (float) Math.log(mergeFactor);
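    // Example: with the default mergeFactor of 10, a 1,000-doc segment under
    // LogDocMergePolicy gets level log(1000)/log(10) = 3.0.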

    final Collection<SegmentInfo> mergingSegments = writer.get().getMergingSegments();

    for(int i=0;i<numSegments;i++) {
      final SegmentInfo info = infos.info(i);
      long size = size(info);

      // Floor tiny segments
      if (size < 1) {
        size = 1;
      }

      final SegmentInfoAndLevel infoLevel = new SegmentInfoAndLevel(info, (float) Math.log(size)/norm, i);
      levels.add(infoLevel);

      if (verbose()) {
        final long segBytes = sizeBytes(info);
        String extra = mergingSegments.contains(info) ? " [merging]" : "";
        if (size >= maxMergeSize) {
          extra += " [skip: too large]";
        }
        message("seg=" + writer.get().segString(info) + " level=" + infoLevel.level + " size=" + String.format("%.3f MB", segBytes/1024/1024.) + extra);
      }
    }

    final float levelFloor;
    if (minMergeSize <= 0)
      levelFloor = (float) 0.0;
    else
      levelFloor = (float) (Math.log(minMergeSize)/norm);

    // Now, we quantize the log values into levels.  The
    // first level is any segment whose log size is within
    // LEVEL_LOG_SPAN of the max size, or that has such a
    // segment "to the right".  Then, we find the max of all
    // other segments and use that to define the next level
    // segment, etc.

    MergeSpecification spec = null;

    final int numMergeableSegments = levels.size();

    int start = 0;
    while(start < numMergeableSegments) {

      // Find max level of all segments not already
      // quantized.
      float maxLevel = levels.get(start).level;
      for(int i=1+start;i<numMergeableSegments;i++) {
        final float level = levels.get(i).level;
        if (level > maxLevel)
          maxLevel = level;
      }

      // Now search backwards for the rightmost segment that
      // falls into this level:
      float levelBottom;
      if (maxLevel <= levelFloor)
        // All remaining segments fall into the min level
        levelBottom = -1.0F;
      else {
        levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);

        // Force a boundary at the level floor
        if (levelBottom < levelFloor && maxLevel >= levelFloor)
          levelBottom = levelFloor;
      }

      int upto = numMergeableSegments-1;
      while(upto >= start) {
        if (levels.get(upto).level >= levelBottom) {
          break;
        }
        upto--;
      }
      if (verbose())
        message("  level " + levelBottom + " to " + maxLevel + ": " + (1+upto-start) + " segments");

      // Finally, record all merges that are viable at this level:
      int end = start + mergeFactor;
      while(end <= 1+upto) {
        boolean anyTooLarge = false;
        boolean anyMerging = false;
        for(int i=start;i<end;i++) {
          final SegmentInfo info = levels.get(i).info;
          anyTooLarge |= (size(info) >= maxMergeSize || sizeDocs(info) >= maxMergeDocs);
          if (mergingSegments.contains(info)) {
            anyMerging = true;
            break;
          }
        }

        if (anyMerging) {
          // skip
        } else if (!anyTooLarge) {
          if (spec == null)
            spec = new MergeSpecification();
          final List<SegmentInfo> mergeInfos = new ArrayList<SegmentInfo>();
          for(int i=start;i<end;i++) {
            mergeInfos.add(levels.get(i).info);
            assert infos.contains(levels.get(i).info);
          }
          if (verbose()) {
            message("  add merge=" + writer.get().segString(mergeInfos) + " start=" + start + " end=" + end);
          }
          spec.add(new OneMerge(mergeInfos));
        } else if (verbose()) {
          message("    " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");
        }

        start = end;
        end = start + mergeFactor;
      }

      start = 1+upto;
    }

    return spec;
  }

  /** <p>Determines the largest segment (measured by
   * document count) that may be merged with other segments.
   * Small values (e.g., less than 10,000) are best for
   * interactive indexing, as this limits the length of
   * pauses while indexing to a few seconds.  Larger values
   * are best for batched indexing and speedier
   * searches.</p>
   *
   * <p>The default value is {@link Integer#MAX_VALUE}.</p>
   *
   * <p>The default merge policy ({@link
   * LogByteSizeMergePolicy}) also allows you to set this
   * limit by net size (in MB) of the segment, using {@link
   * LogByteSizeMergePolicy#setMaxMergeMB}.</p> */
  public void setMaxMergeDocs(int maxMergeDocs) {
    this.maxMergeDocs = maxMergeDocs;
  }

  /** Returns the largest segment (measured by document
   *  count) that may be merged with other segments.
   *  @see #setMaxMergeDocs */
  public int getMaxMergeDocs() {
    return maxMergeDocs;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": ");
    sb.append("minMergeSize=").append(minMergeSize).append(", ");
    sb.append("mergeFactor=").append(mergeFactor).append(", ");
    sb.append("maxMergeSize=").append(maxMergeSize).append(", ");
    sb.append("maxMergeSizeForOptimize=").append(maxMergeSizeForOptimize).append(", ");
    sb.append("calibrateSizeByDeletes=").append(calibrateSizeByDeletes).append(", ");
    sb.append("maxMergeDocs=").append(maxMergeDocs).append(", ");
    sb.append("useCompoundFile=").append(useCompoundFile).append(", ");
    sb.append("noCFSRatio=").append(noCFSRatio);
    sb.append("]");
    return sb.toString();
  }
}
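
A LogMergePolicy usage sketch

To show how this policy is typically wired up, here is a minimal sketch against the Lucene 3.x API this file belongs to. The index path, the analyzer choice, and the Version.LUCENE_33 constant are illustrative assumptions, not anything taken from the file above; the merge-policy setters are the ones defined in LogMergePolicy itself.

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class LogMergePolicyExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical index location; adjust for your environment.
    Directory dir = FSDirectory.open(new File("/tmp/example-index"));

    // LogByteSizeMergePolicy measures segment size in bytes;
    // LogDocMergePolicy would measure it by document count instead.
    LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
    mp.setMergeFactor(10);              // merge 10 segments at a time
    mp.setCalibrateSizeByDeletes(true); // discount deleted docs when sizing
    mp.setUseCompoundFile(true);
    mp.setNoCFSRatio(0.1);              // no CFS for merges >= 10% of index

    IndexWriterConfig conf = new IndexWriterConfig(
        Version.LUCENE_33, new StandardAnalyzer(Version.LUCENE_33));
    conf.setMergePolicy(mp);

    IndexWriter writer = new IndexWriter(dir, conf);
    // ... add documents here; the policy decides when segments merge ...
    writer.close();
  }
}

Note that this wiring applies to the 3.x line; in later Lucene releases, TieredMergePolicy replaced the log-based policies as the default.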
