
Lucene example source code file (TestPerfTasksLogic.java)

This example Lucene source code file (TestPerfTasksLogic.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

adddoc, adddocs, benchmark, closeindex, createindex, exception, indexreader, io, resetsystemerase, rounds, string, text, util

The Lucene TestPerfTasksLogic.java source code

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.benchmark.byTask;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.StringReader;
import java.text.Collator;
import java.util.List;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.search.FieldCache.StringIndex;
import org.apache.lucene.search.FieldCache;

/**
 * Test very simply that perf tasks - simple algorithms - are doing what they should.
 */
public class TestPerfTasksLogic extends BenchmarkTestCase {

  @Override
  public void setUp() throws Exception {
    super.setUp();
    copyToWorkDir("reuters.first20.lines.txt");
  }

  /**
   * Test index creation logic
   */
  public void testIndexAndSearchTasks() throws Exception {
    // 1. alg definition (required in every "logic" test)
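    // A quick legend for the alg mini-language used below (see the byTask
    // package docs): "{ ... } : N" repeats a sequence N times serially;
    // "[ ... ] : N" runs N copies in parallel threads; ": *" repeats until
    // the content source is exhausted; closing a sequence with '>' instead
    // of '}' or ']' keeps separate stats lines from being recorded for its
    // sub-tasks.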
    String algLines[] = {
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 1000",
        "Optimize",
        "CloseIndex",
        "OpenReader",
        "{ CountingSearchTest } : 200",
        "CloseReader",
        "[ CountingSearchTest > : 70",
        "[ CountingSearchTest > : 9",
    };
    
    // 2. we test this value later
    CountingSearchTestTask.numSearches = 0;
    
    // 3. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 4. test specific checks after the benchmark run completed.
    assertEquals("TestSearchTask was supposed to be called!",279,CountingSearchTestTask.numSearches);
    assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
    // now we should be able to open the index for write. 
    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
            .setOpenMode(OpenMode.APPEND));
    iw.close();
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
    ir.close();
  }

  /**
   * Test timed sequence task.
   */
  public void testTimedSearchTask() throws Exception {
    String algLines[] = {
        "log.step=100000",
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 100",
        "Optimize",
        "CloseIndex",
        "OpenReader",
        "{ CountingSearchTest } : .5s",
        "CloseReader",
    };

    CountingSearchTestTask.numSearches = 0;
    execBenchmark(algLines);
    assertTrue(CountingSearchTestTask.numSearches > 0);
    long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis;
    assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500);
  }

  // disabled until we fix BG thread prio -- this test
  // causes build to hang
  public void testBGSearchTaskThreads() throws Exception {
    String algLines[] = {
        "log.time.step.msec = 100",
        "log.step=100000",
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 1000",
        "Optimize",
        "CloseIndex",
        "OpenReader",
        "{",
        "  [ \"XSearch\" { CountingSearchTest > : * ] : 2 &-1",
        "  Wait(0.5)",
        "}",
        "CloseReader",
        "RepSumByPref X"
    };

    CountingSearchTestTask.numSearches = 0;
    execBenchmark(algLines);
    assertTrue(CountingSearchTestTask.numSearches > 0);
  }

  public void testHighlighting() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "doc.stored=true",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "query.maker=" + ReutersQueryMaker.class.getName(),
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 100",
        "Optimize",
        "CloseIndex",
        "OpenReader(true)",
        "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
        "CloseReader",
    };

    // 2. we test this value later
    CountingHighlighterTestTask.numHighlightedResults = 0;
    CountingHighlighterTestTask.numDocsRetrieved = 0;
    // 3. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 4. test specific checks after the benchmark run completed.
    assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
    //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
    //we probably should use a different doc/query maker, but...
    assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);

    assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
    // now we should be able to open the index for write.
    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
    iw.close();
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    assertEquals("100 docs were added to the index, this is what we expect to find!",100,ir.numDocs());
    ir.close();
  }

  public void testHighlightingTV() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "doc.stored=true",//doc storage is required in order to have text to highlight
        "doc.term.vector.offsets=true",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "query.maker=" + ReutersQueryMaker.class.getName(),
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 1000",
        "Optimize",
        "CloseIndex",
        "OpenReader(false)",
        "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
        "CloseReader",
    };

    // 2. we test this value later
    CountingHighlighterTestTask.numHighlightedResults = 0;
    CountingHighlighterTestTask.numDocsRetrieved = 0;
    // 3. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 4. test specific checks after the benchmark run completed.
    assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
    //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
    //we probably should use a different doc/query maker, but...
    assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);

    assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
    // now we should be able to open the index for write.
    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
    iw.close();
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
    ir.close();
  }

  public void testHighlightingNoTvNoStore() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "doc.stored=false",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "query.maker=" + ReutersQueryMaker.class.getName(),
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 1000",
        "Optimize",
        "CloseIndex",
        "OpenReader",
        "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
        "CloseReader",
    };

    // 2. we test this value later
    CountingHighlighterTestTask.numHighlightedResults = 0;
    CountingHighlighterTestTask.numDocsRetrieved = 0;
    // 3. execute the algorithm  (required in every "logic" test)
    try {
      Benchmark benchmark = execBenchmark(algLines);
      assertNotNull(benchmark); // (avoid compile warning on unused variable)
      fail("CountingHighlighterTest should have thrown an exception");
    } catch (Exception e) {
      // expected: with no stored fields there is no text to highlight
    }
  }

  /**
   * Test Exhausting Doc Maker logic
   */
  public void testExhaustContentSource() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
        "content.source.log.step=1",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "# ----- alg ",
        "CreateIndex",
        "{ AddDoc } : * ",
        "Optimize",
        "CloseIndex",
        "OpenReader",
        "{ CountingSearchTest } : 100",
        "CloseReader",
        "[ CountingSearchTest > : 30",
        "[ CountingSearchTest > : 9",
    };
    
    // 2. we test this value later
    CountingSearchTestTask.numSearches = 0;
    
    // 3. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 4. test specific checks after the benchmark run completed.
    assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches);
    assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
    // now we should be able to open the index for write. 
    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.APPEND));
    iw.close();
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
    ir.close();
  }

  // LUCENE-1994: test thread safety of SortableSingleDocMaker
  public void testDocMakerThreadSafety() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource",
        "doc.term.vector=false",
        "log.step.AddDoc=10000",
        "content.source.forever=true",
        "directory=RAMDirectory",
        "doc.reuse.fields=false",
        "doc.stored=false",
        "doc.tokenized=false",
        "doc.index.props=true",
        "# ----- alg ",
        "CreateIndex",
        "[ { AddDoc > : 250 ] : 4",
        "CloseIndex",
    };
    
    // 2. we test this value later
    CountingSearchTestTask.numSearches = 0;
    
    // 3. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    IndexReader r = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    StringIndex idx = FieldCache.DEFAULT.getStringIndex(r, "country");
    final int maxDoc = r.maxDoc();
    assertEquals(1000, maxDoc);
    for(int i=0;i<1000;i++) {
      assertNotNull("doc " + i + " has null country", idx.lookup[idx.order[i]]);
    }
    r.close();
  }

  /**
   * Test Parallel Doc Maker logic (for LUCENE-940)
   */
  public void testParallelDocMaker() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=FSDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "# ----- alg ",
        "CreateIndex",
        "[ { AddDoc } : * ] : 4 ",
        "CloseIndex",
    };
    
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  /**
   * Test WriteLineDoc and LineDocSource.
   */
  public void testLineDocFile() throws Exception {
    File lineFile = new File(TEMP_DIR, "test.reuters.lines.txt");

    // We will call WriteLineDocs this many times
    final int NUM_TRY_DOCS = 50;

    // Creates a line file with first 50 docs from SingleDocSource
    String algLines1[] = {
      "# ----- properties ",
      "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
      "content.source.forever=true",
      "line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'),
      "# ----- alg ",
      "{WriteLineDoc()}:" + NUM_TRY_DOCS,
    };

    // Run algo
    Benchmark benchmark = execBenchmark(algLines1);

    BufferedReader r = new BufferedReader(new FileReader(lineFile));
    int numLines = 0;
    String line;
    while((line = r.readLine()) != null) {
      if (numLines==0 && line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR)) {
        continue; // do not count the header line as a doc 
      }
      numLines++;
    }
    r.close();
    assertEquals("did not see the right number of docs; should be " + NUM_TRY_DOCS + " but was " + numLines, NUM_TRY_DOCS, numLines);
    
    // Index the line docs
    String algLines2[] = {
      "# ----- properties ",
      "analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
      "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
      "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
      "content.source.forever=false",
      "doc.reuse.fields=false",
      "ram.flush.mb=4",
      "# ----- alg ",
      "ResetSystemErase",
      "CreateIndex",
      "{AddDoc}: *",
      "CloseIndex",
    };
    
    // Run algo
    benchmark = execBenchmark(algLines2);

    // now we should be able to open the index for write. 
    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
            .setOpenMode(OpenMode.APPEND));
    iw.close();

    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    assertEquals(numLines + " lines were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs());
    ir.close();

    lineFile.delete();
  }
  
  /**
   * Test ReadTokensTask
   */
  public void testReadTokens() throws Exception {

    // We will call ReadTokens on this many docs
    final int NUM_DOCS = 20;

    // Read tokens from first NUM_DOCS docs from Reuters and
    // then build index from the same docs
    String algLines1[] = {
      "# ----- properties ",
      "analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
      "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
      "docs.file=" + getReuters20LinesFile(),
      "# ----- alg ",
      "{ReadTokens}: " + NUM_DOCS,
      "ResetSystemErase",
      "CreateIndex",
      "{AddDoc}: " + NUM_DOCS,
      "CloseIndex",
    };

    // Run algo
    Benchmark benchmark = execBenchmark(algLines1);

    List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();

    // Count how many tokens all ReadTokens saw
    int totalTokenCount1 = 0;
    for (final TaskStats stat : stats) {
      if (stat.getTask().getName().equals("ReadTokens")) {
        totalTokenCount1 += stat.getCount();
      }
    }

    // Separately count how many tokens are actually in the index:
    IndexReader reader = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    assertEquals(NUM_DOCS, reader.numDocs());

    TermEnum terms = reader.terms();
    TermDocs termDocs = reader.termDocs();
    int totalTokenCount2 = 0;
    while(terms.next()) {
      Term term = terms.term();
      /* not-tokenized, but indexed field */
      if (term != null && term.field() != DocMaker.ID_FIELD && term.field() != DocMaker.DATE_MSEC_FIELD && term.field() != DocMaker.TIME_SEC_FIELD) {
        termDocs.seek(terms.term());
        while (termDocs.next())
          totalTokenCount2 += termDocs.freq();
      }
    }
    reader.close();

    // Make sure they are the same
    assertEquals(totalTokenCount1, totalTokenCount2);
  }
  
  /**
   * Test that " {[AddDoc(4000)]: 4} : * " works corrcetly (for LUCENE-941)
   */
  public void testParallelExhausted() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "task.max.depth.log=1",
        "# ----- alg ",
        "CreateIndex",
        "{ [ AddDoc]: 4} : * ",
        "ResetInputs ",
        "{ [ AddDoc]: 4} : * ",
        "WaitForMerges",
        "CloseIndex",
    };
    
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    int ndocsExpected = 2 * 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }


  /**
   * Test that exhaust in loop works as expected (LUCENE-1115).
   */
  public void testExhaustedLooped() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "task.max.depth.log=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "  WaitForMerges",
        "  CloseIndex",
        "} : 2",
    };
    
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    int ndocsExpected = 20;  // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }
  
  /**
   * Test that we can close IndexWriter with argument "false".
   */
  public void testCloseIndexFalse() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "ram.flush.mb=-1",
        "max.buffered=2",
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "  CloseIndex(false)",
        "} : 2",
    };
    
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  public static class MyMergeScheduler extends SerialMergeScheduler {
    boolean called;
    public MyMergeScheduler() {
      super();
      called = true;
    }
  }

  public void testDeleteByPercent() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "ram.flush.mb=-1",
        "max.buffered=2",
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "CreateIndex",
        "{ \"AddDocs\"  AddDoc > : * ",
        "CloseIndex()",
        "OpenReader(false)",
        "DeleteByPercent(20)",
        "CloseReader"
    };
    
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    int ndocsExpected = 16; // first 20 reuters docs, minus 20%
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  /**
   * Test that we can set the merge scheduler.
   */
  public void testMergeScheduler() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "merge.scheduler=" + MyMergeScheduler.class.getName(),
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "} : 2",
    };
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    assertTrue("did not use the specified MergeScheduler",
        ((MyMergeScheduler) benchmark.getRunData().getIndexWriter().getConfig()
            .getMergeScheduler()).called);
    benchmark.getRunData().getIndexWriter().close();

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  public static class MyMergePolicy extends LogDocMergePolicy {
    boolean called;
    public MyMergePolicy() {
      called = true;
    }
  }
  
  /**
   * Test that we can set the merge policy.
   */
  public void testMergePolicy() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "ram.flush.mb=-1",
        "max.buffered=2",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "merge.policy=" + MyMergePolicy.class.getName(),
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "} : 2",
    };

    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);
    assertTrue("did not use the specified MergePolicy", ((MyMergePolicy) benchmark.getRunData().getIndexWriter().getConfig().getMergePolicy()).called);
    benchmark.getRunData().getIndexWriter().close();
    
    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  /**
   * Test that IndexWriter settings stick.
   */
  public void testIndexWriterSettings() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "ram.flush.mb=-1",
        "max.buffered=2",
        "compound=cmpnd:true:false",
        "doc.term.vector=vector:false:true",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "merge.factor=3",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "  NewRound",
        "} : 2",
    };

    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);
    final IndexWriter writer = benchmark.getRunData().getIndexWriter();
    assertEquals(2, writer.getConfig().getMaxBufferedDocs());
    assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, (int) writer.getConfig().getRAMBufferSizeMB());
    assertEquals(3, ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor());
    assertFalse(((LogMergePolicy) writer.getConfig().getMergePolicy()).getUseCompoundFile());
    writer.close();
    Directory dir = benchmark.getRunData().getDirectory();
    IndexReader reader = IndexReader.open(dir, true);
    TermFreqVector [] tfv = reader.getTermFreqVectors(0);
    assertNotNull(tfv);
    assertTrue(tfv.length > 0);
    reader.close();
  }

  /**
   * Test that we can call optimize(maxNumSegments).
   */
  public void testOptimizeMaxNumSegments() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "ram.flush.mb=-1",
        "max.buffered=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "merge.policy=org.apache.lucene.index.LogDocMergePolicy",
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "  Optimize(3)",
        "  CloseIndex()",
        "} : 2",
    };
    
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory(), true);
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();

    // Make sure we have 3 segments:
    SegmentInfos infos = new SegmentInfos();
    infos.read(benchmark.getRunData().getDirectory());
    assertEquals(3, infos.size());
  }
  
  /**
   * Test disabling task count (LUCENE-1136).
   */
  public void testDisableCounting() throws Exception {
    doTestDisableCounting(true);
    doTestDisableCounting(false);
  }

  private void doTestDisableCounting(boolean disable) throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = disableCountingLines(disable);
    
    // 2. execute the algorithm  (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test counters
    int n = disable ? 0 : 1;
    int nChecked = 0;
    for (final TaskStats stats : benchmark.getRunData().getPoints().taskStats()) {
      String taskName = stats.getTask().getName();
      if (taskName.equals("Rounds")) {
        assertEquals("Wrong total count!",20+2*n,stats.getCount());
        nChecked++;
      } else if (taskName.equals("CreateIndex")) {
        assertEquals("Wrong count for CreateIndex!",n,stats.getCount());
        nChecked++;
      } else if (taskName.equals("CloseIndex")) {
        assertEquals("Wrong count for CloseIndex!",n,stats.getCount());
        nChecked++;
      }
    }
    assertEquals("Missing some tasks to check!",3,nChecked);
  }

  private String[] disableCountingLines (boolean disable) {
    String dis = disable ? "-" : "";
    return new String[] {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=30",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "task.max.depth.log=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  "+dis+"CreateIndex",            // optionally disable counting here
        "  { \"AddDocs\"  AddDoc > : * ",
        "  "+dis+"  CloseIndex",             // optionally disable counting here (with extra blanks)
        "}",
        "RepSumByName",
    };
  }

  /**
   * Test that we can change the Locale in the runData,
   * that it is parsed as we expect.
   */
  public void testLocale() throws Exception {
    // empty Locale: clear it (null)
    Benchmark benchmark = execBenchmark(getLocaleConfig(""));
    assertNull(benchmark.getRunData().getLocale());

    // ROOT locale
    benchmark = execBenchmark(getLocaleConfig("ROOT"));
    assertEquals(new Locale(""), benchmark.getRunData().getLocale());
    
    // specify just a language 
    benchmark = execBenchmark(getLocaleConfig("de"));
    assertEquals(new Locale("de"), benchmark.getRunData().getLocale());
    
    // specify language + country
    benchmark = execBenchmark(getLocaleConfig("en,US"));
    assertEquals(new Locale("en", "US"), benchmark.getRunData().getLocale());
    
    // specify language + country + variant
    benchmark = execBenchmark(getLocaleConfig("no,NO,NY"));
    assertEquals(new Locale("no", "NO", "NY"), benchmark.getRunData().getLocale());
  }
   
  private String[] getLocaleConfig(String localeParam) {
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  NewLocale(" + localeParam + ")",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "  NewRound",
        "} : 1",
    };
    return algLines;
  }
  
  /**
   * Test that we can create CollationAnalyzers.
   */
  public void testCollator() throws Exception {
    // ROOT locale
    Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", "impl:jdk"));
    CollationKeyAnalyzer expected = new CollationKeyAnalyzer(Collator
        .getInstance(new Locale("")));
    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
    
    // specify just a language
    benchmark = execBenchmark(getCollatorConfig("de", "impl:jdk"));
    expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("de")));
    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
    
    // specify language + country
    benchmark = execBenchmark(getCollatorConfig("en,US", "impl:jdk"));
    expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("en",
        "US")));
    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
    
    // specify language + country + variant
    benchmark = execBenchmark(getCollatorConfig("no,NO,NY", "impl:jdk"));
    expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("no",
        "NO", "NY")));
    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
  }
  
  private void assertEqualCollation(Analyzer a1, Analyzer a2, String text)
      throws Exception {
    TokenStream ts1 = a1.tokenStream("bogus", new StringReader(text));
    TokenStream ts2 = a2.tokenStream("bogus", new StringReader(text));
    ts1.reset();
    ts2.reset();
    CharTermAttribute termAtt1 = ts1.addAttribute(CharTermAttribute.class);
    CharTermAttribute termAtt2 = ts2.addAttribute(CharTermAttribute.class);
    assertTrue(ts1.incrementToken());
    assertTrue(ts2.incrementToken());
    assertEquals(termAtt1.toString(), termAtt2.toString());
    assertFalse(ts1.incrementToken());
    assertFalse(ts2.incrementToken());
    ts1.close();
    ts2.close();
  }
  
  private String[] getCollatorConfig(String localeParam, 
      String collationParam) {
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "# ----- alg ",
        "{ \"Rounds\"",
        "  ResetSystemErase",
        "  NewLocale(" + localeParam + ")",
        "  NewCollationAnalyzer(" + collationParam + ")",
        "  CreateIndex",
        "  { \"AddDocs\"  AddDoc > : * ",
        "  NewRound",
        "} : 1",
    };
    return algLines;
  }
  
  /**
   * Test that we can create ShingleAnalyzerWrappers.
   */
  public void testShingleAnalyzer() throws Exception {
    String text = "one,two,three, four five six";
    
    // Default analyzer, maxShingleSize, and outputUnigrams
    Benchmark benchmark = execBenchmark(getShingleConfig(""));
    benchmark.getRunData().getAnalyzer().tokenStream
      ("bogus", new StringReader(text)).close();
    assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
                       new String[] {"one", "one two", "two", "two three",
                                     "three", "three four", "four", "four five",
                                     "five", "five six", "six"});
    // Default analyzer, maxShingleSize = 3, and outputUnigrams = false
    benchmark = execBenchmark
      (getShingleConfig("maxShingleSize:3,outputUnigrams:false"));
    assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
                       new String[] { "one two", "one two three", "two three",
                                      "two three four", "three four", 
                                      "three four five", "four five",
                                      "four five six", "five six" });
    // WhitespaceAnalyzer, default maxShingleSize and outputUnigrams
    benchmark = execBenchmark
      (getShingleConfig("analyzer:WhitespaceAnalyzer"));
    assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
                       new String[] { "one,two,three,", "one,two,three, four",
                                      "four", "four five", "five", "five six", 
                                      "six" });
    
    // WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false
    benchmark = execBenchmark
      (getShingleConfig
        ("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer"));
    assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
                       new String[] { "one,two,three, four", 
                                      "one,two,three, four five",
                                      "four five", "four five six",
                                      "five six" });
  }
  
  private void assertEqualShingle
    (Analyzer analyzer, String text, String[] expected) throws Exception {
    BaseTokenStreamTestCase.assertAnalyzesTo(analyzer, text, expected);
  }
  
  private String[] getShingleConfig(String params) { 
    String algLines[] = {
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.forever=false",
        "directory=RAMDirectory",
        "NewShingleAnalyzer(" + params + ")",
        "CreateIndex",
        "{ \"AddDocs\"  AddDoc > : * "
    };
    return algLines;
  }
  
  private String getReuters20LinesFile() {
    return getWorkDirResourcePath("reuters.first20.lines.txt");
  }  
}
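
A quick standalone example

Everything in this test funnels through execBenchmark(), a helper in BenchmarkTestCase that builds a Benchmark from the alg lines and runs it. For reference, here is a minimal, hypothetical sketch of the same pattern outside JUnit (Lucene 3.x contrib/benchmark API; the algorithm text and the class name RunAlgExample are illustrative, not taken from the file above):

import java.io.StringReader;

import org.apache.lucene.benchmark.byTask.Benchmark;

public class RunAlgExample {
  public static void main(String[] args) throws Exception {
    // A tiny algorithm: index 100 copies of the single-doc source into an
    // in-memory directory, then print a summary report by task name.
    String alg =
        "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource\n" +
        "directory=RAMDirectory\n" +
        "CreateIndex\n" +
        "{ AddDoc } : 100\n" +
        "CloseIndex\n" +
        "RepSumByName\n";

    // Benchmark parses the algorithm from any Reader; execute() runs it.
    Benchmark benchmark = new Benchmark(new StringReader(alg));
    benchmark.execute();

    // The run data (directory, index writer, stats points) stays accessible
    // afterwards, which is exactly what the assertions above rely on.
    System.out.println("directory: " + benchmark.getRunData().getDirectory());
  }
}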

