alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (WriteLineDocTask.java)

This example Lucene source code file (WriteLineDocTask.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

default_fields, exception, fields_header_indicator, illegalargumentexception, io, override, override, regex, sep, string, string, stringbuilder, stringbuilder, threadlocal, threadlocal, util, writelinedoctask

The Lucene WriteLineDocTask.java source code

package org.apache.lucene.benchmark.byTask.tasks;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedWriter;
import java.io.File;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

/**
 * A task which writes documents, one line per document. Each line is in the
 * following format: title &lt;TAB&gt; date &lt;TAB&gt; body. The output of this
 * task can be consumed by
 * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocSource} and is intended
 * to save the IO overhead of opening a file per document to be indexed.
 * <p>
 * The format of the output is set according to the output file extension.
 * Compression is recommended when the output file is expected to be large.
 * See info on file extensions in 
 * {@link org.apache.lucene.benchmark.byTask.utils.StreamUtils.Type}
 * <p> 
 * Supports the following parameters:
 * <ul>
 * <li>line.file.out - the name of the file to write the output to. That
 * parameter is mandatory. <b>NOTE:</b> the file is re-created.
 * <li>line.fields - which fields should be written in each line.
 * (optional, default: {@link #DEFAULT_FIELDS}).
 * <li>sufficient.fields - list of field names, separated by comma, which, 
 * if all of them are missing, the document will be skipped. For example, to require 
 * that at least one of f1,f2 is not empty, specify: "f1,f2" in this field. To specify
 * that no field is required, i.e. that even empty docs should be emitted, specify <b>","</b>.
 * (optional, default: {@link #DEFAULT_SUFFICIENT_FIELDS}).
 * </ul>
 * <b>NOTE:</b> this class is not thread-safe and if used by multiple threads the
 * output is unspecified (as all will write to the same output file in a
 * non-synchronized way).
 */
public class WriteLineDocTask extends PerfTask {

  /** Prefix of the header line; the header lists the written field names after it. */
  public static final String FIELDS_HEADER_INDICATOR = "FIELDS_HEADER_INDICATOR###";

  /** Separator placed between field values (and between header entries) on a line. */
  public final static char SEP = '\t';
  
  /**
   * Fields to be written by default
   */
  public static final String[] DEFAULT_FIELDS = new String[] {
    DocMaker.TITLE_FIELD,
    DocMaker.DATE_FIELD,
    DocMaker.BODY_FIELD,
  };
  
  /**
   * Default fields which at least one of them is required to not skip the doc.
   */
  public static final String DEFAULT_SUFFICIENT_FIELDS = DocMaker.TITLE_FIELD +',' + DocMaker.BODY_FIELD;

  /**
   * Matches runs of tab / carriage-return / line-feed characters. Compiled once
   * and shared; each thread gets its own {@link Matcher} from it.
   */
  private static final Pattern NORMALIZER = Pattern.compile("[\t\r\n]+");
  
  /** Size passed to {@link DocMaker#makeDocument(int)}; 0 (or less) means no limit. */
  private int docSize = 0;
  private final PrintWriter lineFileOut;
  private final DocMaker docMaker;
  /** Per-thread scratch buffer, reused to assemble each output line. */
  private final ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>();
  /** Per-thread matcher used to replace tabs and line breaks inside field values. */
  private final ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>();
  private final String[] fieldsToWrite;
  /** Parallel to {@link #fieldsToWrite}: true where the field is listed in sufficient.fields. */
  private final boolean[] sufficientFields;
  private final boolean checkSufficientFields;
  
  /**
   * Reads the task configuration, opens the output file and writes the header line.
   *
   * @param runData supplies the configuration and the {@link DocMaker}.
   * @throws IllegalArgumentException if line.file.out is not set, or if
   *         line.fields contains the separator character.
   * @throws Exception if the output file cannot be opened.
   */
  public WriteLineDocTask(PerfRunData runData) throws Exception {
    super(runData);
    Config config = runData.getConfig();
    String fname = config.get("line.file.out", null);
    if (fname == null) {
      throw new IllegalArgumentException("line.file.out must be set");
    }
    
    // init fields -- validated before the output file is opened, so that an
    // invalid setting neither leaks an open stream nor touches the file.
    String f2r = config.get("line.fields",null);
    if (f2r == null) {
      fieldsToWrite = DEFAULT_FIELDS;
    } else {
      if (f2r.indexOf(SEP)>=0) {
        throw new IllegalArgumentException("line.fields "+f2r+" should not contain the separator char: "+SEP);
      }
      fieldsToWrite = f2r.split(","); 
    }
    
    // init sufficient fields
    sufficientFields = new boolean[fieldsToWrite.length];
    String suff = config.get("sufficient.fields",DEFAULT_SUFFICIENT_FIELDS);
    if (",".equals(suff)) {
      checkSufficientFields = false;
    } else {
      checkSufficientFields = true;
      HashSet<String> sf = new HashSet<String>(Arrays.asList(suff.split(",")));
      for (int i=0; i<fieldsToWrite.length; i++) {
        sufficientFields[i] = sf.contains(fieldsToWrite[i]);
      }
    }

    docMaker = runData.getDocMaker();

    OutputStream out = StreamUtils.outputStream(new File(fname));
    lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), StreamUtils.BUFFER_SIZE));
    
    writeHeader();
  }

  /**
   * Returns the calling thread's line buffer, creating it on first use.
   * The buffer is emptied before it is returned.
   */
  private StringBuilder clearedThreadBuffer() {
    StringBuilder sb = threadBuffer.get();
    if (sb == null) {
      sb = new StringBuilder();
      threadBuffer.set(sb);
    }
    sb.setLength(0);
    return sb;
  }

  /**
   * Write a header to the lines file - indicating how to read the file later 
   */
  private void writeHeader() {
    StringBuilder sb = clearedThreadBuffer();
    sb.append(FIELDS_HEADER_INDICATOR);
    for (String f : fieldsToWrite) {
      sb.append(SEP).append(f);
    }
    lineFileOut.println(sb.toString());
  }

  @Override
  protected String getLogMessage(int recsCount) {
    return "Wrote " + recsCount + " line docs";
  }
  
  /**
   * Makes one document and, unless it lacks all of the sufficient fields,
   * writes it as a single line of separator-delimited field values.
   * Runs of tab, carriage-return and line-feed characters inside a value are
   * replaced by a single space and the value is trimmed; a missing field is
   * written as an empty value.
   *
   * @return always 1, also when the document was skipped.
   */
  @Override
  public int doLogic() throws Exception {
    Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();

    Matcher matcher = threadNormalizer.get();
    if (matcher == null) {
      matcher = NORMALIZER.matcher("");
      threadNormalizer.set(matcher);
    }
    
    StringBuilder sb = clearedThreadBuffer();

    boolean sufficient = !checkSufficientFields;
    for (int i=0; i<fieldsToWrite.length; i++) {
      Field f = doc.getField(fieldsToWrite[i]);
      String text = f == null ? "" : matcher.reset(f.stringValue()).replaceAll(" ").trim();
      sb.append(text).append(SEP);
      sufficient |= text.length()>0 && sufficientFields[i];
    }
    if (sufficient) {
      // Remove the redundant last separator. The buffer is empty only when no
      // fields are configured (e.g. line.fields=","), in which case there is
      // nothing to strip and setLength(-1) would throw.
      if (sb.length() > 0) {
        sb.setLength(sb.length()-1);
      }
      // lineFileOut is a PrintWriter, which synchronizes internally in println.
      lineFileOut.println(sb.toString());
    }

    return 1;
  }

  /** Closes the output file, then delegates to the superclass. */
  @Override
  public void close() throws Exception {
    lineFileOut.close();
    super.close();
  }
  
  /**
   * Set the params (docSize only)
   * @param params docSize, or 0 for no limit.
   */
  @Override
  public void setParams(String params) {
    super.setParams(params);
    // parsed as a float and truncated, so a value such as "100.0" is accepted
    docSize = (int) Float.parseFloat(params); 
  }

  @Override
  public boolean supportsParams() {
    return true;
  }
  
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene WriteLineDocTask.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.