alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (IndexSorter.java)

This example Lucene source code file (IndexSorter.java) is included in the DevDaily.com "Java Source Code Warehouse" project, whose intent is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

docscore, document, indexsorter, io, ioexception, log, logging, override, postingmap, sortingreader, string, termpositions, unsupportedoperationexception, util

The Lucene IndexSorter.java source code

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.index;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.logging.Logger;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;       // javadocs
import org.apache.lucene.store.*;
import org.apache.lucene.util.Version;

/** Sort an index by document importance factor. Higher scoring documents are
 * assigned smaller document numbers. Document weights are obtained from a
 * specified field, which has to be single-valued and stored, with string value
 * that represents a float number. Stored fields in the output index remain
 * consistent, i.e. both stored fields and postings are renumbered in sync.
 *
 * <p>NOTE: this tool is unaware of documents added
 * atomically via {@link IndexWriter#addDocuments} or {@link
 * IndexWriter#updateDocuments}, which means it can easily
 * break up such document groups.
 */
public class IndexSorter {
  private static final Logger LOG = Logger.getLogger(IndexSorter.class.getName());
  
  /** Maps one posting to its remapped (sorted) document id and to the start of
   * its buffered tf/position data in the temporary RAM file. */
  private static class PostingMap implements Comparable<PostingMap> {
    private int newDoc;     // remapped document id after sorting
    private long offset;    // file pointer into the temp buffer for this doc's tf & positions

    /** Orders postings by remapped doc id, ascending. */
    public int compareTo(PostingMap pm) {
      // Explicit comparison instead of subtraction: "a - b" can overflow for
      // extreme int values and yield a wrong sign.
      return this.newDoc < pm.newDoc ? -1 : (this.newDoc == pm.newDoc ? 0 : 1);
    }
  }

  /** A {@link TermPositions} view that replays the wrapped enumerator's postings
   * in remapped (sorted) document-id order. On each {@link #seek(TermEnum)} it
   * buffers the term's tf and delta-encoded positions into a RAM file, sorts the
   * per-document entries by their new doc ids, and then serves them back. */
  private static class SortedTermPositions implements TermPositions {
    private TermPositions original;   // underlying enumerator over the unsorted index
    private int[] oldToNew;           // old doc id -> new doc id mapping

    private int docFreq;              // number of live docs for the current term

    private PostingMap[] postingMaps = new PostingMap[0];  // reused, grown on demand
    private int pointer;              // index of the current entry in postingMaps

    private int freq;                 // tf of the current doc
    private int position;             // running position (delta-decoded)

    private static final String TEMP_FILE = "temp";
    private final RAMDirectory tempDir = new RAMDirectory();
    private RAMOutputStream out;      // buffer writer, reset per term
    private IndexInput in;            // buffer reader, reopened per term

    public SortedTermPositions(TermPositions original, int[] oldToNew) {
      this.original = original;
      this.oldToNew = oldToNew;
      try {
        out = (RAMOutputStream)tempDir.createOutput(TEMP_FILE);
      } catch (IOException ioe) {
        // The original code only logged here and left 'out' null, which
        // guaranteed a NullPointerException on the first seek(); fail fast
        // instead and preserve the cause.
        LOG.warning("Error creating temporary output: " + ioe);
        throw new RuntimeException("Error creating temporary output", ioe);
      }
    }

    /** Seeking by raw term is not needed by addIndexes(); unsupported. */
    public void seek(Term term) throws IOException {
      throw new UnsupportedOperationException();
    }

    /** Positions this enumerator on the term enum's current term, buffering and
     * re-sorting all of its postings by remapped doc id. */
    public void seek(TermEnum terms) throws IOException {
      original.seek(terms);

      docFreq = terms.docFreq();
      pointer = -1;

      if (docFreq > postingMaps.length) {         // grow postingsMap
        PostingMap[] newMap = new PostingMap[docFreq];
        System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
        for (int i = postingMaps.length; i < docFreq; i++) {
          newMap[i] = new PostingMap();
        }
        postingMaps = newMap;
      }

      out.reset();                                // reuse the RAM buffer

      int i = 0;
      while (original.next()) {
        PostingMap map = postingMaps[i++];
        map.newDoc = oldToNew[original.doc()];    // remap the newDoc id
        map.offset = out.getFilePointer();        // save pointer to buffer

        final int tf = original.freq();           // buffer tf & positions
        out.writeVInt(tf);
        int prevPosition = 0;
        for (int j = tf; j > 0; j--) {            // delta encode positions
          int p = original.nextPosition();
          out.writeVInt(p - prevPosition);
          prevPosition = p;
        }
      }
      out.flush();
      docFreq = i;                                // allow for deletions

      Arrays.sort(postingMaps, 0, docFreq);       // resort by mapped doc ids

      // NOTE: this might be substantially faster if RAMInputStream were public
      // and supported a reset() operation.
      if (in != null) {
        in.close();                               // release the previous term's input
      }
      in = tempDir.openInput(TEMP_FILE);
    }

    /** Advances to the next (already re-sorted) document for the current term. */
    public boolean next() throws IOException {
      pointer++;
      if (pointer < docFreq) {
        in.seek(postingMaps[pointer].offset);
        freq = in.readVInt();
        position = 0;
        return true;
      }
      return false;
    }

    public int doc() { return postingMaps[pointer].newDoc; }
    public int freq() { return freq; }

    /** Delta-decodes and returns the next position within the current doc. */
    public int nextPosition() throws IOException {
      int positionIncrement = in.readVInt();
      position += positionIncrement;
      return position;
    }

    public int read(int[] docs, int[] freqs) {
      throw new UnsupportedOperationException();
    }
    public boolean skipTo(int target) {
      throw new UnsupportedOperationException();
    }

    /** Payloads are not carried over by this sorter. */
    public byte[] getPayload(byte[] data, int offset) throws IOException {
      return null;
    }

    public int getPayloadLength() {
      return 0;
    }

    public boolean isPayloadAvailable() {
      return false;
    }

    /** Closes the RAM buffer resources; the wrapped enumerator is always closed
     * even if releasing the buffer fails. */
    public void close() throws IOException {
      try {
        if (in != null) {
          in.close();
        }
        out.close();
        tempDir.close();
      } finally {
        original.close();
      }
    }

  }

  /** A read-only {@link FilterIndexReader} that presents the wrapped index with
   * documents renumbered according to an old-&gt;new id mapping. Stored fields,
   * norms, term vectors and postings are all remapped consistently so the view
   * can be fed to {@link IndexWriter#addIndexes}. */
  private static class SortingReader extends FilterIndexReader {

    private int[] oldToNew;   // old doc id -> new doc id
    private int[] newToOld;   // inverse mapping, built in the constructor

    public SortingReader(IndexReader oldReader, int[] oldToNew) {
      super(oldReader);
      this.oldToNew = oldToNew;

      // Invert the mapping; entries of -1 (documents dropped by the map, if
      // any) are skipped.
      this.newToOld = new int[oldReader.maxDoc()];
      for (int oldDoc = 0; oldDoc < oldToNew.length; oldDoc++) {
        int newDoc = oldToNew[oldDoc];
        if (newDoc != -1) {
          newToOld[newDoc] = oldDoc;
        }
      }
    }

    /** Returning null marks this reader as atomic, so addIndexes consumes it
     * through this class's overrides rather than its sub-readers. */
    @Override
    public IndexReader[] getSequentialSubReaders() {
      return null;
    }

    @Override
    public Document document(int n) throws IOException {
      return document(n, null);
    }

    /** Fetches the stored document for remapped id {@code n}. */
    @Override
    public Document document(int n, FieldSelector fieldSelector)
        throws CorruptIndexException, IOException {
      return super.document(newToOld[n], fieldSelector);
    }

    /** Deleted documents are not exposed by this view. */
    @Override
    public boolean isDeleted(int n) {
      return false;
    }

    @Override
    public byte[] norms(String f) throws IOException {
      throw new UnsupportedOperationException();
    }

    /** Copies the field's norms into {@code norms}, remapped to new doc ids.
     * NOTE(review): the original ignored {@code offset}; per the IndexReader
     * contract norms are written starting at {@code offset} in the target
     * array, which is what addIndexes relies on for multi-segment targets. */
    @Override
    public void norms(String f, byte[] norms, int offset) throws IOException {
      byte[] oldNorms = super.norms(f);
      for (int oldDoc = 0; oldDoc < oldNorms.length; oldDoc++) {
        int newDoc = oldToNew[oldDoc];
        if (newDoc != -1) {
          norms[offset + newDoc] = oldNorms[oldDoc];
        }
      }
    }

    @Override
    protected void doSetNorm(int d, String f, byte b) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public TermDocs termDocs() throws IOException {
      throw new UnsupportedOperationException();
    }

    /** Postings are served re-sorted via {@link SortedTermPositions}. */
    @Override
    public TermPositions termPositions() throws IOException {
      return new SortedTermPositions(super.termPositions(), oldToNew);
    }

    /** Term vectors for remapped id {@code docNumber}. */
    @Override
    public TermFreqVector[] getTermFreqVectors(int docNumber)
            throws IOException {
      return super.getTermFreqVectors(newToOld[docNumber]);
    }

    /** This view is read-only. */
    @Override
    protected void doDelete(int n) throws IOException {
      throw new UnsupportedOperationException();
    }

  }

  /** Pairs an original document id with its importance score; sorts by score
   * descending so that index position after sorting is the new doc id. */
  private static class DocScore implements Comparable<DocScore> {
    private int oldDoc;    // document id in the input index
    private float score;   // importance weight parsed from the sort field

    /** Orders by score descending, then by original doc id ascending.
     * Float.compare provides a total order: the original '==' / '&lt;' tests were
     * inconsistent when a score was NaN (both compare directions returned -1),
     * which can make Arrays.sort fail or misbehave. */
    public int compareTo(DocScore that) {
      int byScore = Float.compare(that.score, this.score);   // descending
      if (byScore != 0) {
        return byScore;
      }
      // Overflow-safe tie-break on the original doc id, ascending.
      return this.oldDoc < that.oldDoc ? -1 : (this.oldDoc == that.oldDoc ? 0 : 1);
    }

    @Override
    public String toString() {
      return "oldDoc=" + oldDoc + ",score=" + score;
    }
  }

  /** Creates a sorter; all inputs are supplied per-call to {@link #sort}. */
  public IndexSorter() {
    
  }
  
  /**
   * Sorts the index in {@code input} by descending value of {@code field}
   * (a stored, single-valued field holding a float as a string) and writes
   * the renumbered index to {@code output}.
   *
   * @param input  directory holding the index to sort
   * @param output directory to receive the sorted index
   * @param field  name of the stored field supplying document weights
   * @throws IOException if reading or writing the index fails
   */
  public void sort(Directory input, Directory output, String field) throws IOException {
    LOG.info("IndexSorter: starting.");
    long start = System.currentTimeMillis();
    IndexReader reader = IndexReader.open(input, true);   // open read-only
    try {
      SortingReader sorter = new SortingReader(reader, oldToNew(reader, field));
      IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_31,
          new WhitespaceAnalyzer(Version.LUCENE_31));
      IndexWriter writer = new IndexWriter(output, cfg);
      try {
        writer.addIndexes(new IndexReader[] { sorter });
      } finally {
        // The original leaked the writer when addIndexes threw, and never
        // closed the reader at all.
        writer.close();
      }
    } finally {
      reader.close();
    }
    long end = System.currentTimeMillis();
    LOG.info("IndexSorter: done, " + (end - start)
        + " total milliseconds");
  }

  /**
   * Builds the old-doc-id to new-doc-id mapping: documents are ranked by the
   * float value stored in {@code field}, highest first, and their rank becomes
   * their new id. Deleted documents and documents whose field is missing or
   * unparsable are given a weight of 0.
   */
  private static int[] oldToNew(IndexReader reader, String field) throws IOException {
    final int maxDoc = reader.maxDoc();
    final FieldSelector selector = new MapFieldSelector(field);

    // Score every document in old-id order.
    DocScore[] ranked = new DocScore[maxDoc];
    for (int old = 0; old < maxDoc; old++) {
      DocScore entry = new DocScore();
      entry.oldDoc = old;
      entry.score = 0.0f;
      if (!reader.isDeleted(old)) {
        Document doc = reader.document(old, selector);
        try {
          entry.score = Float.parseFloat(doc.get(field));
        } catch (Exception ignored) {
          // missing or malformed weight: keep the 0.0f default
        }
      }
      ranked[old] = entry;
    }

    // After sorting, the array index IS the new doc id; invert that into an
    // old->new lookup table.
    Arrays.sort(ranked);
    int[] mapping = new int[maxDoc];
    for (int newDoc = 0; newDoc < maxDoc; newDoc++) {
      mapping[ranked[newDoc].oldDoc] = newDoc;
    }
    return mapping;
  }

  /** */
  /** Command-line entry point: {@code IndexSorter <input> <output> <field>}. */
  public static void main(String[] args) throws Exception {
    // The original usage string mentioned only <input> even though three
    // arguments are required.
    String usage = "IndexSorter <input> <output> <field>";

    if (args.length < 3) {
      System.err.println("Usage: " + usage);
      System.exit(-1);
    }

    Directory input = FSDirectory.open(new File(args[0]));
    File out = new File(args[1]);
    if (!out.exists()) out.mkdirs();          // create the output dir if absent
    Directory output = FSDirectory.open(out);
    String field = args[2];

    IndexSorter sorter = new IndexSorter();
    try {
      sorter.sort(input, output, field);
    } catch (Exception e) {
      LOG.warning("IndexSorter: " + e);
    }
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene IndexSorter.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.