
Lucene example source code file (TieredMergePolicy.java)

This example Lucene source code file (TieredMergePolicy.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

arraylist, illegalargumentexception, io, ioexception, list, mergescore, mergespecification, onemerge, override, segmentinfo, tieredmergepolicy, util
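
Before the full listing, here is a minimal sketch of how a TieredMergePolicy is typically tuned and installed on an IndexWriter. Only the TieredMergePolicy setters are taken from the listing below; the surrounding setup (Version.LUCENE_33, StandardAnalyzer, RAMDirectory, IndexWriterConfig) is an assumed Lucene 3.x-era arrangement shown for illustration.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class TieredMergePolicyConfigExample {
  public static void main(String[] args) throws Exception {
    // Every setter below appears in the source file and returns "this",
    // so the calls can be chained.
    TieredMergePolicy tmp = new TieredMergePolicy()
        .setMaxMergeAtOnce(10)           // segments merged at once during normal merging
        .setSegmentsPerTier(10.0)        // allowed segments per tier; keep >= maxMergeAtOnce
        .setMaxMergedSegmentMB(5 * 1024) // ~5 GB cap on a merged segment (the default)
        .setFloorSegmentMB(2.0)          // tiny segments are "rounded up" to 2 MB
        .setReclaimDeletesWeight(2.0);   // favor merges that reclaim deletions

    // Install the policy on the writer (assumed 3.x-style setup).
    Directory dir = new RAMDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_33,
        new StandardAnalyzer(Version.LUCENE_33));
    iwc.setMergePolicy(tmp);
    IndexWriter writer = new IndexWriter(dir, iwc);
    // ... add documents; merges are selected by the policy as segments accumulate ...
    writer.close();
  }
}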

The Lucene TieredMergePolicy.java source code

package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Map;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Comparator;
import java.util.List;
import java.util.ArrayList;

/**
 *  Merges segments of approximately equal size, subject to
 *  an allowed number of segments per tier.  This is similar
 *  to {@link LogByteSizeMergePolicy}, except this merge
 *  policy is able to merge non-adjacent segments, and
 *  separates how many segments are merged at once ({@link
 *  #setMaxMergeAtOnce}) from how many segments are allowed
 *  per tier ({@link #setSegmentsPerTier}).  This merge
 *  policy also does not over-merge (ie, cascade merges). 
 *
 *  <p>For normal merging, this policy first computes a
 *  "budget" of how many segments are allowed by be in the
 *  index.  If the index is over-budget, then the policy
 *  sorts segments by decresing size (pro-rating by percent
 *  deletes), and then finds the least-cost merge.  Merge
 *  cost is measured by a combination of the "skew" of the
 *  merge (size of largest seg divided by smallest seg),
 *  total merge size and pct deletes reclaimed,
 *  so that merges with lower skew, smaller size
 *  and those reclaiming more deletes, are
 *  favored.
 *
 *  <p>If a merge will produce a segment that's larger than
 *  {@link #setMaxMergedSegmentMB}, then the policy will
 *  merge fewer segments (down to 1 at once, if that one has
 *  deletions) to keep the segment size under budget.
 *      
 *  <p>NOTE: This policy freely merges non-adjacent
 *  segments; if this is a problem, use {@link
 *  LogMergePolicy}.
 *
 *  <p>NOTE: This policy always merges by byte size
 *  of the segments, always pro-rates by percent deletes,
 *  and does not apply any maximum segment size during
 *  optimize (unlike {@link LogByteSizeMergePolicy}).
 *
 *  @lucene.experimental
 */

// TODO
//   - we could try to take into account whether a large
//     merge is already running (under CMS) and then bias
//     ourselves towards picking smaller merges if so (or,
//     maybe CMS should do so)

public class TieredMergePolicy extends MergePolicy {

  private int maxMergeAtOnce = 10;
  private long maxMergedSegmentBytes = 5*1024*1024*1024L;
  private int maxMergeAtOnceExplicit = 30;

  private long floorSegmentBytes = 2*1024*1024L;
  private double segsPerTier = 10.0;
  private double expungeDeletesPctAllowed = 10.0;
  private boolean useCompoundFile = true;
  private double noCFSRatio = 0.1;
  private double reclaimDeletesWeight = 2.0;

  /** Maximum number of segments to be merged at a time
   *  during "normal" merging.  For explicit merging (eg,
   *  optimize or expungeDeletes was called), see {@link
   *  #setMaxMergeAtOnceExplicit}.  Default is 10. */
  public TieredMergePolicy setMaxMergeAtOnce(int v) {
    if (v < 2) {
      throw new IllegalArgumentException("maxMergeAtOnce must be > 1 (got " + v + ")");
    }
    maxMergeAtOnce = v;
    return this;
  }

  /** @see #setMaxMergeAtOnce */
  public int getMaxMergeAtOnce() {
    return maxMergeAtOnce;
  }

  // TODO: should addIndexes do explicit merging, too?  And,
  // if user calls IW.maybeMerge "explicitly"

  /** Maximum number of segments to be merged at a time,
   *  during optimize or expungeDeletes. Default is 30. */
  public TieredMergePolicy setMaxMergeAtOnceExplicit(int v) {
    if (v < 2) {
      throw new IllegalArgumentException("maxMergeAtOnceExplicit must be > 1 (got " + v + ")");
    }
    maxMergeAtOnceExplicit = v;
    return this;
  }

  /** @see #setMaxMergeAtOnceExplicit */
  public int getMaxMergeAtOnceExplicit() {
    return maxMergeAtOnceExplicit;
  }

  /** Maximum sized segment to produce during
   *  normal merging.  This setting is approximate: the
   *  estimate of the merged segment size is made by summing
   *  sizes of to-be-merged segments (compensating for
   *  percent deleted docs).  Default is 5 GB. */
  public TieredMergePolicy setMaxMergedSegmentMB(double v) {
    maxMergedSegmentBytes = (long) (v*1024*1024);
    return this;
  }

  /** @see #setMaxMergedSegmentMB */
  public double getMaxMergedSegmentMB() {
    return maxMergedSegmentBytes/1024/1024.;
  }

  /** Controls how aggressively merges that reclaim more
   *  deletions are favored.  Higher values favor selecting
   *  merges that reclaim deletions.  A value of 0.0 means
   *  deletions don't impact merge selection. */
  public TieredMergePolicy setReclaimDeletesWeight(double v) {
    if (v < 0.0) {
      throw new IllegalArgumentException("reclaimDeletesWeight must be >= 0.0 (got " + v + ")");
    }
    reclaimDeletesWeight = v;
    return this;
  }

  /** See {@link #setReclaimDeletesWeight}. */
  public double getReclaimDeletesWeight() {
    return reclaimDeletesWeight;
  }

  /** Segments smaller than this are "rounded up" to this
   *  size, ie treated as equal (floor) size for merge
   *  selection.  This is to prevent frequent flushing of
   *  tiny segments from allowing a long tail in the index.
   *  Default is 2 MB. */
  public TieredMergePolicy setFloorSegmentMB(double v) {
    if (v <= 0.0) {
      throw new IllegalArgumentException("floorSegmentMB must be >= 0.0 (got " + v + ")");
    }
    floorSegmentBytes = (long) (v*1024*1024);
    return this;
  }

  /** @see #setFloorSegmentMB */
  public double getFloorSegmentMB() {
    return floorSegmentBytes/1024/1024.;
  }

  /** When expungeDeletes is called, we only merge away a
   *  segment if its delete percentage is over this
   *  threshold.  Default is 10%. */ 
  public TieredMergePolicy setExpungeDeletesPctAllowed(double v) {
    if (v < 0.0 || v > 100.0) {
      throw new IllegalArgumentException("expungeDeletesPctAllowed must be between 0.0 and 100.0 inclusive (got " + v + ")");
    }
    expungeDeletesPctAllowed = v;
    return this;
  }

  /** @see #setExpungeDeletesPctAllowed */
  public double getExpungeDeletesPctAllowed() {
    return expungeDeletesPctAllowed;
  }

  /** Sets the allowed number of segments per tier.  Smaller
   *  values mean more merging but fewer segments.
   *
   *  <p>NOTE: this value should be >= the {@link
   *  #setMaxMergeAtOnce} otherwise you'll force too much
   *  merging to occur.</p>
   *
   *  <p>Default is 10.0.</p> */
  public TieredMergePolicy setSegmentsPerTier(double v) {
    if (v < 2.0) {
      throw new IllegalArgumentException("segmentsPerTier must be >= 2.0 (got " + v + ")");
    }
    segsPerTier = v;
    return this;
  }

  /** @see #setSegmentsPerTier */
  public double getSegmentsPerTier() {
    return segsPerTier;
  }

  /** Sets whether compound file format should be used for
   *  newly flushed and newly merged segments.  Default
   *  true. */
  public TieredMergePolicy setUseCompoundFile(boolean useCompoundFile) {
    this.useCompoundFile = useCompoundFile;
    return this;
  }

  /** @see #setUseCompoundFile */
  public boolean getUseCompoundFile() {
    return useCompoundFile;
  }

  /** If a merged segment will be more than this percentage
   *  of the total size of the index, leave the segment as
   *  non-compound file even if compound file is enabled.
   *  Set to 1.0 to always use CFS regardless of merge
   *  size.  Default is 0.1. */
  public TieredMergePolicy setNoCFSRatio(double noCFSRatio) {
    if (noCFSRatio < 0.0 || noCFSRatio > 1.0) {
      throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
    }
    this.noCFSRatio = noCFSRatio;
    return this;
  }

  /** @see #setNoCFSRatio */
  public double getNoCFSRatio() {
    return noCFSRatio;
  }

  private class SegmentByteSizeDescending implements Comparator<SegmentInfo> {
    public int compare(SegmentInfo o1, SegmentInfo o2) {
      try {
        final long sz1 = size(o1);
        final long sz2 = size(o2);
        if (sz1 > sz2) {
          return -1;
        } else if (sz2 > sz1) {
          return 1;
        } else {
          return o1.name.compareTo(o2.name);
        }
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
    }
  }

  private final Comparator<SegmentInfo> segmentByteSizeDescending = new SegmentByteSizeDescending();

  protected static abstract class MergeScore {
    abstract double getScore();
    abstract String getExplanation();
  }

  @Override
  public MergeSpecification findMerges(SegmentInfos infos) throws IOException {
    if (verbose()) {
      message("findMerges: " + infos.size() + " segments");
    }
    if (infos.size() == 0) {
      return null;
    }
    final Collection<SegmentInfo> merging = writer.get().getMergingSegments();
    final Collection<SegmentInfo> toBeMerged = new HashSet<SegmentInfo>();

    final List<SegmentInfo> infosSorted = new ArrayList<SegmentInfo>(infos.asList());
    Collections.sort(infosSorted, segmentByteSizeDescending);

    // Compute total index bytes & print details about the index
    long totIndexBytes = 0;
    long minSegmentBytes = Long.MAX_VALUE;
    for(SegmentInfo info : infosSorted) {
      final long segBytes = size(info);
      if (verbose()) {
        String extra = merging.contains(info) ? " [merging]" : "";
        if (segBytes >= maxMergedSegmentBytes/2.0) {
          extra += " [skip: too large]";
        } else if (segBytes < floorSegmentBytes) {
          extra += " [floored]";
        }
        message(" seg=" + writer.get().segString(info) + " size=" + String.format("%.3f", segBytes/1024/1024.) + " MB" + extra);
      }

      minSegmentBytes = Math.min(segBytes, minSegmentBytes);
      // Accum total byte size
      totIndexBytes += segBytes;
    }

    // If we have too-large segments, grace them out
    // of the maxSegmentCount:
    int tooBigCount = 0;
    while (tooBigCount < infosSorted.size() && size(infosSorted.get(tooBigCount)) >= maxMergedSegmentBytes/2.0) {
      totIndexBytes -= size(infosSorted.get(tooBigCount));
      tooBigCount++;
    }

    minSegmentBytes = floorSize(minSegmentBytes);

    // Compute max allowed segs in the index
    long levelSize = minSegmentBytes;
    long bytesLeft = totIndexBytes;
    double allowedSegCount = 0;
    while(true) {
      final double segCountLevel = bytesLeft / (double) levelSize;
      if (segCountLevel < segsPerTier) {
        allowedSegCount += Math.ceil(segCountLevel);
        break;
      }
      allowedSegCount += segsPerTier;
      bytesLeft -= segsPerTier * levelSize;
      levelSize *= maxMergeAtOnce;
    }
    int allowedSegCountInt = (int) allowedSegCount;

    MergeSpecification spec = null;

    // Cycle to possibly select more than one merge:
    while(true) {

      long mergingBytes = 0;

      // Gather eligible segments for merging, ie segments
      // not already being merged and not already picked (by
      // prior iteration of this loop) for merging:
      final List<SegmentInfo> eligible = new ArrayList<SegmentInfo>();
      for(int idx = tooBigCount; idx<infosSorted.size(); idx++) {
        final SegmentInfo info = infosSorted.get(idx);
        if (merging.contains(info)) {
          mergingBytes += info.sizeInBytes(true);
        } else if (!toBeMerged.contains(info)) {
          eligible.add(info);
        }
      }

      final boolean maxMergeIsRunning = mergingBytes >= maxMergedSegmentBytes;

      message(" allowedSegmentCount=" + allowedSegCountInt + " vs count=" + infosSorted.size() + " (eligible count=" + eligible.size() + ") tooBigCount=" + tooBigCount);

      if (eligible.size() == 0) {
        return spec;
      }

      if (eligible.size() >= allowedSegCountInt) {

        // OK we are over budget -- find best merge!
        MergeScore bestScore = null;
        List<SegmentInfo> best = null;
        boolean bestTooLarge = false;
        long bestMergeBytes = 0;

        // Consider all merge starts:
        for(int startIdx = 0;startIdx <= eligible.size()-maxMergeAtOnce; startIdx++) {

          long totAfterMergeBytes = 0;

          final List<SegmentInfo> candidate = new ArrayList<SegmentInfo>();
          boolean hitTooLarge = false;
          for(int idx = startIdx;idx<eligible.size() && candidate.size() < maxMergeAtOnce;idx++) {
            final SegmentInfo info = eligible.get(idx);
            final long segBytes = size(info);

            if (totAfterMergeBytes + segBytes > maxMergedSegmentBytes) {
              hitTooLarge = true;
              // NOTE: we continue, so that we can try
              // "packing" smaller segments into this merge
              // to see if we can get closer to the max
              // size; this in general is not perfect since
              // this is really "bin packing" and we'd have
              // to try different permutations.
              continue;
            }
            candidate.add(info);
            totAfterMergeBytes += segBytes;
          }

          final MergeScore score = score(candidate, hitTooLarge, mergingBytes);
          message(" maybe=" + writer.get().segString(candidate) + " score=" + score.getScore() + " " + score.getExplanation() + " tooLarge=" + hitTooLarge + " size=" + String.format("%.3f MB", totAfterMergeBytes/1024./1024.));

          // If we are already running a max sized merge
          // (maxMergeIsRunning), don't allow another max
          // sized merge to kick off:
          if ((bestScore == null || score.getScore() < bestScore.getScore()) && (!hitTooLarge || !maxMergeIsRunning)) {
            best = candidate;
            bestScore = score;
            bestTooLarge = hitTooLarge;
            bestMergeBytes = totAfterMergeBytes;
          }
        }

        if (best != null) {
          if (spec == null) {
            spec = new MergeSpecification();
          }
          final OneMerge merge = new OneMerge(best);
          spec.add(merge);
          for(SegmentInfo info : merge.segments) {
            toBeMerged.add(info);
          }

          if (verbose()) {
            message(" add merge=" + writer.get().segString(merge.segments) + " size=" + String.format("%.3f MB", bestMergeBytes/1024./1024.) + " score=" + String.format("%.3f", bestScore.getScore()) + " " + bestScore.getExplanation() + (bestTooLarge ? " [max merge]" : ""));
          }
        } else {
          return spec;
        }
      } else {
        return spec;
      }
    }
  }

  /** Expert: scores one merge; subclasses can override. */
  protected MergeScore score(List<SegmentInfo> candidate, boolean hitTooLarge, long mergingBytes) throws IOException {
    long totBeforeMergeBytes = 0;
    long totAfterMergeBytes = 0;
    long totAfterMergeBytesFloored = 0;
    for(SegmentInfo info : candidate) {
      final long segBytes = size(info);
      totAfterMergeBytes += segBytes;
      totAfterMergeBytesFloored += floorSize(segBytes);
      totBeforeMergeBytes += info.sizeInBytes(true);
    }

    // Measure "skew" of the merge, which can range
    // from 1.0/numSegsBeingMerged (good) to 1.0
    // (poor):
    final double skew;
    if (hitTooLarge) {
      // Pretend the merge has perfect skew; skew doesn't
      // matter in this case because this merge will not
      // "cascade" and so it cannot lead to N^2 merge cost
      // over time:
      skew = 1.0/maxMergeAtOnce;
    } else {
      skew = ((double) floorSize(size(candidate.get(0))))/totAfterMergeBytesFloored;
    }

    // Strongly favor merges with less skew (smaller
    // mergeScore is better):
    double mergeScore = skew;

    // Gently favor smaller merges over bigger ones.  We
    // don't want to make this exponent too large else we
    // can end up doing poor merges of small segments in
    // order to avoid the large merges:
    mergeScore *= Math.pow(totAfterMergeBytes, 0.05);

    // Strongly favor merges that reclaim deletes:
    final double nonDelRatio = ((double) totAfterMergeBytes)/totBeforeMergeBytes;
    mergeScore *= Math.pow(nonDelRatio, reclaimDeletesWeight);

    final double finalMergeScore = mergeScore;

    return new MergeScore() {

      @Override
      public double getScore() {
        return finalMergeScore;
      }

      @Override
      public String getExplanation() {
        return "skew=" + String.format("%.3f", skew) + " nonDelRatio=" + String.format("%.3f", nonDelRatio);
      }
    };
  }

  @Override
  public MergeSpecification findMergesForOptimize(SegmentInfos infos, int maxSegmentCount, Map<SegmentInfo,Boolean> segmentsToOptimize) throws IOException {
    if (verbose()) {
      message("findMergesForOptimize maxSegmentCount=" + maxSegmentCount + " infos=" + writer.get().segString(infos) + " segmentsToOptimize=" + segmentsToOptimize);
    }

    List<SegmentInfo> eligible = new ArrayList<SegmentInfo>();
    boolean optimizeMergeRunning = false;
    final Collection<SegmentInfo> merging = writer.get().getMergingSegments();
    boolean segmentIsOriginal = false;
    for(SegmentInfo info : infos) {
      final Boolean isOriginal = segmentsToOptimize.get(info);
      if (isOriginal != null) {
        segmentIsOriginal = isOriginal;
        if (!merging.contains(info)) {
          eligible.add(info);
        } else {
          optimizeMergeRunning = true;
        }
      }
    }

    if (eligible.size() == 0) {
      return null;
    }

    if ((maxSegmentCount > 1 && eligible.size() <= maxSegmentCount) ||
        (maxSegmentCount == 1 && eligible.size() == 1 && (!segmentIsOriginal || isOptimized(eligible.get(0))))) {
      if (verbose()) {
        message("already optimized");
      }
      return null;
    }

    Collections.sort(eligible, segmentByteSizeDescending);

    if (verbose()) {
      message("eligible=" + eligible);
      message("optimizeMergeRunning=" + optimizeMergeRunning);
    }

    int end = eligible.size();

    MergeSpecification spec = null;

    // Do full merges, first, backwards:
    while(end >= maxMergeAtOnceExplicit + maxSegmentCount - 1) {
      if (spec == null) {
        spec = new MergeSpecification();
      }
      final OneMerge merge = new OneMerge(eligible.subList(end-maxMergeAtOnceExplicit, end));
      if (verbose()) {
        message("add merge=" + writer.get().segString(merge.segments));
      }
      spec.add(merge);
      end -= maxMergeAtOnceExplicit;
    }

    if (spec == null && !optimizeMergeRunning) {
      // Do final merge
      final int numToMerge = end - maxSegmentCount + 1;
      final OneMerge merge = new OneMerge(eligible.subList(end-numToMerge, end));
      if (verbose()) {
        message("add final merge=" + merge.segString(writer.get().getDirectory()));
      }
      spec = new MergeSpecification();
      spec.add(merge);
    }

    return spec;
  }

  @Override
  public MergeSpecification findMergesToExpungeDeletes(SegmentInfos infos) throws CorruptIndexException, IOException {
    if (verbose()) {
      message("findMergesToExpungeDeletes infos=" + writer.get().segString(infos) + " expungeDeletesPctAllowed=" + expungeDeletesPctAllowed);
    }
    final List<SegmentInfo> eligible = new ArrayList<SegmentInfo>();
    final Collection<SegmentInfo> merging = writer.get().getMergingSegments();
    for(SegmentInfo info : infos) {
      double pctDeletes = 100.*((double) writer.get().numDeletedDocs(info))/info.docCount;
      if (pctDeletes > expungeDeletesPctAllowed && !merging.contains(info)) {
        eligible.add(info);
      }
    }

    if (eligible.size() == 0) {
      return null;
    }

    Collections.sort(eligible, segmentByteSizeDescending);

    if (verbose()) {
      message("eligible=" + eligible);
    }

    int start = 0;
    MergeSpecification spec = null;

    while(start < eligible.size()) {
      long totAfterMergeBytes = 0;
      int upto = start;
      boolean done = false;
      while(upto < start + maxMergeAtOnceExplicit) {
        if (upto == eligible.size()) {
          done = true;
          break;
        }
        final SegmentInfo info = eligible.get(upto);
        final long segBytes = size(info);
        if (totAfterMergeBytes + segBytes > maxMergedSegmentBytes) {
          // TODO: we could be smarter here, eg cherry
          // picking smaller merges that'd sum up to just
          // around the max size
          break;
        }
        totAfterMergeBytes += segBytes;
        upto++;
      }

      if (upto == start) {
        // Single segment is too big; grace it
        start++;
        continue;
      }

      if (spec == null) {
        spec = new MergeSpecification();
      }

      final OneMerge merge = new OneMerge(eligible.subList(start, upto));
      if (verbose()) {
        message("add merge=" + writer.get().segString(merge.segments));
      }
      spec.add(merge);
      start = upto;

      if (done) {
        break;
      }
    }

    return spec;
  }

  @Override
  public boolean useCompoundFile(SegmentInfos infos, SegmentInfo mergedInfo) throws IOException {
    final boolean doCFS;

    if (!useCompoundFile) {
      doCFS = false;
    } else if (noCFSRatio == 1.0) {
      doCFS = true;
    } else {
      long totalSize = 0;
      for (SegmentInfo info : infos)
        totalSize += size(info);

      doCFS = size(mergedInfo) <= noCFSRatio * totalSize;
    }
    return doCFS;
  }

  @Override
  public void close() {
  }

  private boolean isOptimized(SegmentInfo info) throws IOException {
    IndexWriter w = writer.get();
    assert w != null;
    boolean hasDeletions = w.numDeletedDocs(info) > 0;
    return !hasDeletions &&
      !info.hasSeparateNorms() &&
      info.dir == w.getDirectory() &&
      (info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
  }

  // Segment size in bytes, pro-rated by % deleted
  private long size(SegmentInfo info) throws IOException {
    final long byteSize = info.sizeInBytes(true);
    final int delCount = writer.get().numDeletedDocs(info);
    final double delRatio = (info.docCount <= 0 ? 0.0f : ((double) delCount / (double) info.docCount));
    assert delRatio <= 1.0;
    return (long) (byteSize * (1.0-delRatio));
  }

  private long floorSize(long bytes) {
    return Math.max(floorSegmentBytes, bytes);
  }

  private boolean verbose() {
    IndexWriter w = writer.get();
    return w != null && w.verbose();
  }

  private void message(String message) {
    if (verbose()) {
      writer.get().message("TMP: " + message);
    }
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": ");
    sb.append("maxMergeAtOnce=").append(maxMergeAtOnce).append(", ");
    sb.append("maxMergeAtOnceExplicit=").append(maxMergeAtOnceExplicit).append(", ");
    sb.append("maxMergedSegmentMB=").append(maxMergedSegmentBytes/1024/1024.).append(", ");
    sb.append("floorSegmentMB=").append(floorSegmentBytes/1024/1024.).append(", ");
    sb.append("expungeDeletesPctAllowed=").append(expungeDeletesPctAllowed).append(", ");
    sb.append("segmentsPerTier=").append(segsPerTier).append(", ");
    sb.append("useCompoundFile=").append(useCompoundFile).append(", ");
    sb.append("noCFSRatio=").append(noCFSRatio);
    return sb.toString();
  }
}
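
To make the selection logic above more concrete, here is a small self-contained sketch that re-derives two pieces of the policy outside of Lucene: the "budget" loop from findMerges() that decides how many segments the index may hold before merging kicks in, and the merge score from score() that ranks candidate merges by skew, total size, and reclaimed deletions. The class name, the segment sizes, and the delete ratios are invented for illustration; the constants are the defaults from the listing.

public class TieredMergeMath {

  // Defaults taken from the listing above.
  static final long FLOOR_SEGMENT_BYTES = 2 * 1024 * 1024L;
  static final double SEGS_PER_TIER = 10.0;
  static final int MAX_MERGE_AT_ONCE = 10;
  static final double RECLAIM_DELETES_WEIGHT = 2.0;

  // Mirrors the allowedSegCount loop in findMerges(): each tier may hold
  // segsPerTier segments, and each successive tier's segment size is
  // maxMergeAtOnce times the previous tier's.
  static double allowedSegCount(long totalIndexBytes, long minSegmentBytes) {
    long levelSize = Math.max(FLOOR_SEGMENT_BYTES, minSegmentBytes);
    long bytesLeft = totalIndexBytes;
    double allowed = 0;
    while (true) {
      final double segCountLevel = bytesLeft / (double) levelSize;
      if (segCountLevel < SEGS_PER_TIER) {
        allowed += Math.ceil(segCountLevel);
        break;
      }
      allowed += SEGS_PER_TIER;
      bytesLeft -= SEGS_PER_TIER * levelSize;
      levelSize *= MAX_MERGE_AT_ONCE;
    }
    return allowed;
  }

  // Mirrors score(): lower is better.  skew is the largest (floored) segment
  // over the floored merge total; a size factor and a delete-reclaim factor
  // multiply in.  Segments are passed largest-first, as in the policy.
  static double mergeScore(long[] segBytes, double nonDelRatio) {
    long totAfter = 0, totAfterFloored = 0;
    for (long b : segBytes) {
      totAfter += b;
      totAfterFloored += Math.max(FLOOR_SEGMENT_BYTES, b);
    }
    final double skew = Math.max(FLOOR_SEGMENT_BYTES, segBytes[0]) / (double) totAfterFloored;
    return skew * Math.pow(totAfter, 0.05) * Math.pow(nonDelRatio, RECLAIM_DELETES_WEIGHT);
  }

  public static void main(String[] args) {
    // Invented index: 500 MB total, smallest segment under the 2 MB floor.
    // With the defaults this works out to roughly 22 allowed segments
    // (10 at ~2 MB, 10 at ~20 MB, then ceil(280 MB / 200 MB) = 2).
    System.out.println("allowed segment count = "
        + allowedSegCount(500L * 1024 * 1024, 1024 * 1024));

    // Two candidate merges of ten 10 MB segments; the one whose segments
    // are 20% deleted documents scores lower, i.e. is preferred.
    long[] tenSegs = new long[10];
    java.util.Arrays.fill(tenSegs, 10L * 1024 * 1024);
    System.out.println("score, no deletes:  " + mergeScore(tenSegs, 1.0));
    System.out.println("score, 20% deletes: " + mergeScore(tenSegs, 0.8));
  }
}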

