alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (AbstractTestCase.java)

This example Lucene source code file (AbstractTestCase.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

analyzer, basicngramtokenizer, document, exception, exception, f, io, ioexception, override, override, query, query, reader, string, string, util

The Lucene AbstractTestCase.java source code

package org.apache.lucene.search.vectorhighlight;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Collection;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

public abstract class AbstractTestCase extends LuceneTestCase {

  // Default field name: used by the tq/pqF helpers, both query parsers and the index builders.
  protected final String F = "f";
  // Extra field names; not referenced in this file -- presumably used by subclasses (verify).
  protected final String F1 = "f1";
  protected final String F2 = "f2";
  // Directory holding the test index; created in setUp() and closed in tearDown().
  protected Directory dir;
  // MockAnalyzer configured with MockTokenizer.WHITESPACE (see setUp()).
  protected Analyzer analyzerW;
  // BigramAnalyzer, i.e. BasicNGramTokenizer with its default settings (see setUp()).
  protected Analyzer analyzerB;
  // MockAnalyzer configured with MockTokenizer.KEYWORD (see setUp()).
  protected Analyzer analyzerK;
  // Reader over the most recently built index; stays null until a make*Index method has run.
  protected IndexReader reader;  
  // Query parsers on field F, backed by analyzerW and analyzerB respectively.
  protected QueryParser paW;
  protected QueryParser paB;
  
  // Values of a short multi-valued field, including empty values; indexed by makeIndexShortMV().
  protected static final String[] shortMVValues = {
    "",
    "",
    "a b c",
    "",   // empty data in multi valued field
    "d e"
  };
  
  // Values of a long multi-valued field (two sentences); indexed by makeIndexLongMV().
  protected static final String[] longMVValues = {
    "Followings are the examples of customizable parameters and actual examples of customization:",
    "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically"
  };
  
  // test data for LUCENE-1448 bug
  // Each value starts with a line feed; indexed with the bigram analyzer by makeIndexLongMVB().
  protected static final String[] biMVValues = {
    "\nLucene/Solr does not require such additional hardware.",
    "\nWhen you talk about processing speed, the"
  };
  
  // Values indexed without analysis (Index.NOT_ANALYZED) by makeIndexStrMV().
  protected static final String[] strMVValues = {
    "abc",
    "defg",
    "hijkl"
  };

  /**
   * Prepares the shared fixtures: the whitespace, bigram and keyword analyzers,
   * the two query parsers bound to field {@code F}, and a new directory for the index.
   */
  @Override
  public void setUp() throws Exception {
    super.setUp();

    // Creation order is deliberately unchanged: several of these calls take the
    // shared random source of the test case.
    this.analyzerW = new MockAnalyzer( random, MockTokenizer.WHITESPACE, false );
    this.analyzerB = new BigramAnalyzer();
    this.analyzerK = new MockAnalyzer( random, MockTokenizer.KEYWORD, false );

    this.paW = new QueryParser( TEST_VERSION_CURRENT, F, this.analyzerW );
    this.paB = new QueryParser( TEST_VERSION_CURRENT, F, this.analyzerB );

    this.dir = newDirectory();
  }
  
  /**
   * Releases the index reader and the directory, then runs the superclass tear-down.
   *
   * <p>Each step runs in a {@code finally} block so that a failure while closing the
   * reader does not leave the directory open or skip {@code super.tearDown()}.
   * A {@code null} directory (for example when {@code setUp()} failed before creating
   * it) is tolerated.
   *
   * @throws Exception if closing a resource or the superclass tear-down fails
   */
  @Override
  public void tearDown() throws Exception {
    try {
      if( reader != null ){
        reader.close();
      }
    } finally {
      // Drop the reference even if close() failed, so it is never closed twice.
      reader = null;
      try {
        if( dir != null ){
          dir.close();
        }
      } finally {
        super.tearDown();
      }
    }
  }

  /** Term query for {@code text} on the default field {@code F} with boost 1. */
  protected Query tq( String text ){
    return tq( 1F, text );
  }

  /** Term query for {@code text} on the default field {@code F} with the given boost. */
  protected Query tq( float boost, String text ){
    return tq( boost, F, text );
  }
  
  /** Term query for {@code text} on {@code field} with boost 1. */
  protected Query tq( String field, String text ){
    return tq( 1F, field, text );
  }
  
  /**
   * Builds a {@link TermQuery} for the term {@code field:text} and applies {@code boost}.
   *
   * @param boost boost to set on the query
   * @param field field name of the term
   * @param text  text of the term
   * @return the boosted term query
   */
  protected Query tq( float boost, String field, String text ){
    final Term term = new Term( field, text );
    final TermQuery termQuery = new TermQuery( term );
    termQuery.setBoost( boost );
    return termQuery;
  }
  
  /** Phrase query over {@code texts} on the default field {@code F}; boost 1, slop 0. */
  protected Query pqF( String... texts ){
    return pqF( 1F, texts );
  }
  
  /** Phrase query over {@code texts} on the default field {@code F}; given boost, slop 0. */
  protected Query pqF( float boost, String... texts ){
    return pqF( boost, 0, texts );
  }
  
  /**
   * Phrase query over {@code texts} on the default field {@code F}.
   *
   * @param boost boost to set on the query
   * @param slop  slop to set on the query
   * @param texts term texts, in phrase order
   */
  protected Query pqF( float boost, int slop, String... texts ){
    final String defaultField = F;
    return pq( boost, slop, defaultField, texts );
  }
  
  /** Phrase query over {@code texts} on {@code field}; boost 1, slop 0. */
  protected Query pq( String field, String... texts ){
    return pq( 1F, 0, field, texts );
  }
  
  /** Phrase query over {@code texts} on {@code field}; given boost, slop 0. */
  protected Query pq( float boost, String field, String... texts ){
    return pq( boost, 0, field, texts );
  }
  
  /**
   * Builds a {@link PhraseQuery} with one term per entry of {@code texts}, added in order.
   *
   * @param boost boost to set on the query
   * @param slop  slop to set on the query
   * @param field field name shared by all terms
   * @param texts term texts, in phrase order
   * @return the configured phrase query
   */
  protected Query pq( float boost, int slop, String field, String... texts ){
    final PhraseQuery phrase = new PhraseQuery();
    for( int i = 0; i < texts.length; i++ ){
      phrase.add( new Term( field, texts[i] ) );
    }
    phrase.setBoost( boost );
    phrase.setSlop( slop );
    return phrase;
  }
  
  /** Disjunction-max query over {@code queries} with a tie-breaker multiplier of 0. */
  protected Query dmq( Query... queries ){
    return dmq( 0.0F, queries );
  }
  
  /**
   * Builds a {@link DisjunctionMaxQuery} containing {@code queries}, added in order.
   *
   * @param tieBreakerMultiplier value passed to the {@code DisjunctionMaxQuery} constructor
   * @param queries              the disjuncts
   * @return the disjunction-max query
   */
  protected Query dmq( float tieBreakerMultiplier, Query... queries ){
    final DisjunctionMaxQuery disjunction = new DisjunctionMaxQuery( tieBreakerMultiplier );
    for( int i = 0; i < queries.length; i++ ){
      disjunction.add( queries[i] );
    }
    return disjunction;
  }
  
  /**
   * Asserts that {@code actual} has exactly as many elements as {@code expected}
   * and contains every expected query (order is not checked).
   *
   * <p>Failure messages include the collection and the missing query so that a
   * failing test identifies what went wrong without a debugger.
   *
   * @param actual   the collection produced by the code under test
   * @param expected the queries that must all be present in {@code actual}
   */
  protected void assertCollectionQueries( Collection<Query> actual, Query... expected ){
    assertEquals( "unexpected number of queries in " + actual, expected.length, actual.size() );
    for( Query query : expected ){
      assertTrue( "expected query <" + query + "> not found in " + actual, actual.contains( query ) );
    }
  }

  /** Analyzer whose token stream is a {@link BasicNGramTokenizer} created with its default settings. */
  static final class BigramAnalyzer extends Analyzer {
    /** Returns a new {@link BasicNGramTokenizer} over {@code reader}; {@code fieldName} is not used. */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      final Tokenizer bigrams = new BasicNGramTokenizer( reader );
      return bigrams;
    }
  }
  
  /**
   * Tokenizer that splits the input into delimiter-separated snippets and emits every
   * run of {@code n} consecutive characters of each snippet as one token. A snippet
   * shorter than {@code n} is emitted once, as a whole.
   *
   * <p>Example with the defaults ({@code n = 2}): the input {@code "abc d"} yields
   * {@code "ab"}, {@code "bc"}, {@code "d"}.
   */
  static final class BasicNGramTokenizer extends Tokenizer {

    public static final int DEFAULT_N_SIZE = 2;
    public static final String DEFAULT_DELIMITERS = " \t\n.,";
    private static final int BUFFER_SIZE = 4096;

    /** Gram size: number of characters in each emitted token. */
    private final int n;
    /** Characters that separate snippets; they are never appended to a snippet. */
    private final String delimiters;
    /** Index of the current gram inside {@link #snippet}. */
    private int startTerm;
    /** Length of the current gram. */
    private int lenTerm;
    /** Offset of the current gram, counted in characters read from the input. */
    private int startOffset;
    /** Number of characters read from the input so far. */
    private int nextStartOffset;
    /** Last character read, or -1 once the input is exhausted. */
    private int ch;
    /** The snippet currently being cut into grams, or null before the first one is read. */
    private String snippet;
    private final StringBuilder snippetBuffer;
    private final char[] charBuffer;
    /** Next position to consume in {@link #charBuffer}. */
    private int charBufferIndex;
    /** Number of valid characters in {@link #charBuffer}. */
    private int charBufferLen;

    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    
    /** Creates a tokenizer with the default gram size and the default delimiters. */
    public BasicNGramTokenizer( Reader in ){
      this( in, DEFAULT_N_SIZE );
    }
    
    /** Creates a tokenizer with gram size {@code n} and the default delimiters. */
    public BasicNGramTokenizer( Reader in, int n ){
      this( in, n, DEFAULT_DELIMITERS );
    }
    
    /** Creates a tokenizer with the default gram size and the given delimiters. */
    public BasicNGramTokenizer( Reader in, String delimiters ){
      this( in, DEFAULT_N_SIZE, delimiters );
    }
    
    /**
     * @param in         the character source
     * @param n          gram size
     * @param delimiters characters that separate snippets
     */
    public BasicNGramTokenizer( Reader in, int n, String delimiters ){
      super(in);
      this.n = n;
      this.delimiters = delimiters;
      startTerm = 0;
      nextStartOffset = 0;
      snippet = null;
      snippetBuffer = new StringBuilder();
      charBuffer = new char[BUFFER_SIZE];
      // Index starts past the (empty) valid region so the first read refills the buffer.
      charBufferIndex = BUFFER_SIZE;
      charBufferLen = 0;
      ch = 0;
    }

    /**
     * Advances to the next gram and publishes its text and corrected offsets.
     *
     * @return false when the input is exhausted
     */
    @Override
    public boolean incrementToken() throws IOException {
      if( !getNextPartialSnippet() ){
        return false;
      }
      clearAttributes();
      termAtt.setEmpty().append(snippet, startTerm, startTerm + lenTerm);
      offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
      return true;
    }

    /** Returns the number of characters consumed from the input so far. */
    private int getFinalOffset() {
      return nextStartOffset;
    }
    
    /**
     * Sets the final offset. The value is passed through {@code correctOffset} so that it
     * is reported in the same coordinates as the token offsets set by
     * {@link #incrementToken()}.
     */
    @Override
    public final void end(){
      final int finalOffset = correctOffset( getFinalOffset() );
      offsetAtt.setOffset( finalOffset, finalOffset );
    }
    
    /**
     * Moves to the next gram of the current snippet when one full gram remains;
     * otherwise reads the next snippet.
     *
     * @return false when no further gram is available
     */
    protected boolean getNextPartialSnippet() throws IOException {
      if( snippet != null && snippet.length() >= startTerm + 1 + n ){
        startTerm++;
        startOffset++;
        lenTerm = n;
        return true;
      }
      return getNextSnippet();
    }
    
    /**
     * Reads characters up to the next delimiter (or end of input) into {@link #snippet},
     * skipping leading delimiters, and positions the tokenizer on the snippet's first gram.
     *
     * @return false when no non-delimiter character could be read
     */
    protected boolean getNextSnippet() throws IOException {
      startTerm = 0;
      startOffset = nextStartOffset;
      snippetBuffer.setLength( 0 );
      while( true ){
        // Once end of input has been seen, never read again.
        if( ch != -1 ){
          ch = readCharFromBuffer();
        }
        if( ch == -1 ){
          break;
        }
        if( !isDelimiter( ch ) ){
          snippetBuffer.append( (char)ch );
        } else if( snippetBuffer.length() > 0 ){
          // A delimiter after content terminates the snippet.
          break;
        } else {
          // A leading delimiter: the snippet starts one character later.
          startOffset++;
        }
      }
      if( snippetBuffer.length() == 0 ){
        return false;
      }
      snippet = snippetBuffer.toString();
      // A snippet shorter than n is emitted whole.
      lenTerm = Math.min( snippet.length(), n );
      return true;
    }
    
    /**
     * Returns the next character of the input, refilling the buffer when it is used up.
     *
     * @return the character, or -1 at end of input
     */
    protected int readCharFromBuffer() throws IOException {
      if( charBufferIndex >= charBufferLen ){
        charBufferLen = input.read( charBuffer );
        if( charBufferLen == -1 ){
          return -1;
        }
        charBufferIndex = 0;
      }
      int c = charBuffer[charBufferIndex++];
      nextStartOffset++;
      return c;
    }
    
    /** Returns true when {@code c} is one of the configured delimiter characters. */
    protected boolean isDelimiter( int c ){
      return delimiters.indexOf( c ) >= 0;
    }
    
    /** Switches to a new input and clears all tokenizer state. */
    @Override
    public void reset( Reader input ) throws IOException {
      super.reset( input );
      reset();
    }
    
    /** Restores the state set up by the constructor so tokenizing starts from offset 0. */
    @Override
    public void reset() throws IOException {
      startTerm = 0;
      nextStartOffset = 0;
      snippet = null;
      snippetBuffer.setLength( 0 );
      charBufferIndex = BUFFER_SIZE;
      charBufferLen = 0;
      ch = 0;
    }
  }

  /** Builds a one-document index whose field {@code F} holds the single {@code value}; uses {@code analyzerW}. */
  protected void make1d1fIndex( String value ) throws Exception {
    make1dmfIndex( new String[] { value } );
  }
  
  /** Builds a one-document index whose field {@code F} holds the single {@code value}; uses {@code analyzerB}. */
  protected void make1d1fIndexB( String value ) throws Exception {
    make1dmfIndexB( new String[] { value } );
  }
  
  /** Builds a one-document index with one {@code F} field instance per value; uses {@code analyzerW}. */
  protected void make1dmfIndex( String... values ) throws Exception {
    final Analyzer whitespace = analyzerW;
    make1dmfIndex( whitespace, values );
  }
  
  /** Builds a one-document index with one {@code F} field instance per value; uses {@code analyzerB}. */
  protected void make1dmfIndexB( String... values ) throws Exception {
    final Analyzer bigram = analyzerB;
    make1dmfIndex( bigram, values );
  }
  
  /**
   * Makes 1 doc with a multi valued, analyzed field: one {@code F} field instance per
   * value, stored and indexed with term vectors (positions and offsets), then reopens
   * {@code reader} on the new index.
   *
   * @param analyzer analyzer given to the index writer
   * @param values   field values, one field instance each
   * @throws Exception if writing the index or opening the reader fails
   */
  protected void make1dmfIndex( Analyzer analyzer, String... values ) throws Exception {
    writeSingleDocIndex( analyzer, Index.ANALYZED, values );
  }
  
  /**
   * Makes 1 doc with a multi valued and not analyzed field: same as
   * {@link #make1dmfIndex(Analyzer, String...)} but the values are indexed with
   * {@code Index.NOT_ANALYZED} and the writer is configured with {@code analyzerK}.
   *
   * @param values field values, one field instance each
   * @throws Exception if writing the index or opening the reader fails
   */
  protected void make1dmfIndexNA( String... values ) throws Exception {
    writeSingleDocIndex( analyzerK, Index.NOT_ANALYZED, values );
  }

  // Recreates the index in dir with a single document and points reader at it.
  private void writeSingleDocIndex( Analyzer analyzer, Index indexMode, String... values ) throws Exception {
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
        TEST_VERSION_CURRENT, analyzer).setOpenMode(OpenMode.CREATE));
    try {
      Document doc = new Document();
      for( String value: values ){
        doc.add( new Field( F, value, Store.YES, indexMode, TermVector.WITH_POSITIONS_OFFSETS ) );
      }
      writer.addDocument( doc );
    } finally {
      // Always release the writer, even when adding the document fails.
      writer.close();
    }
    if( reader != null ){
      reader.close();
      // Do not keep a reference to a closed reader if the reopen below fails.
      reader = null;
    }
    reader = IndexReader.open( dir, true );
  }
  
  /**
   * Builds the one-document index over {@code shortMVValues} via
   * {@code make1dmfIndex(String...)}, which uses {@code analyzerW}.
   *
   * <p>In the diagram below, the digits above each value appear to be character
   * offsets and the digits below it term positions -- confirm against the tests
   * that rely on them.
   */
  protected void makeIndexShortMV() throws Exception {
    
    //  0
    // ""
    //  1
    // ""

    //  234567
    // "a b c"
    //  0 1 2

    //  8
    // ""

    //   111
    //  9012
    // "d e"
    //  3 4
    make1dmfIndex( shortMVValues );
  }
  
  /**
   * Builds the one-document index over {@code longMVValues} via
   * {@code make1dmfIndex(String...)}, which uses {@code analyzerW}.
   *
   * <p>In the diagram below, the rows above each sentence appear to be character
   * offsets (read vertically) and the row below it term positions -- confirm
   * against the tests that rely on them.
   */
  protected void makeIndexLongMV() throws Exception {
    //           11111111112222222222333333333344444444445555555555666666666677777777778888888888999
    // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
    // Followings are the examples of customizable parameters and actual examples of customization:
    // 0          1   2   3        4  5            6          7   8      9        10 11
    
    //        1                                                                                                   2
    // 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122
    // 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
    // The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically
    // 12  13  (14)   (15)     16  17   18  19 20    21       22   23 (24)   (25)     26   27   28   29  30  31  32   33      34

    make1dmfIndex( longMVValues );
  }
  
  /**
   * Builds the one-document index over {@code biMVValues} via
   * {@code make1dmfIndexB(String...)}, which uses the bigram analyzer {@code analyzerB}.
   *
   * <p>In the diagram below, "*" stands for the leading line feed of each value;
   * each bigram is listed under the character where it starts, followed by what
   * appears to be its term position -- confirm against the tests that rely on them.
   */
  protected void makeIndexLongMVB() throws Exception {
    // "*" ... LF
    
    //           1111111111222222222233333333334444444444555555
    // 01234567890123456789012345678901234567890123456789012345
    // *Lucene/Solr does not require such additional hardware.
    //  Lu 0        do 10    re 15   su 21       na 31
    //   uc 1        oe 11    eq 16   uc 22       al 32
    //    ce 2        es 12    qu 17   ch 23         ha 33
    //     en 3          no 13  ui 18     ad 24       ar 34
    //      ne 4          ot 14  ir 19     dd 25       rd 35
    //       e/ 5                 re 20     di 26       dw 36
    //        /S 6                           it 27       wa 37
    //         So 7                           ti 28       ar 38
    //          ol 8                           io 29       re 39
    //           lr 9                           on 30

    // 5555666666666677777777778888888888999999999
    // 6789012345678901234567890123456789012345678
    // *When you talk about processing speed, the
    //  Wh 40         ab 48     es 56         th 65
    //   he 41         bo 49     ss 57         he 66
    //    en 42         ou 50     si 58
    //       yo 43       ut 51     in 59
    //        ou 44         pr 52   ng 60
    //           ta 45       ro 53     sp 61
    //            al 46       oc 54     pe 62
    //             lk 47       ce 55     ee 63
    //                                    ed 64

    make1dmfIndexB( biMVValues );
  }
  
  /**
   * Builds the one-document index over {@code strMVValues} via
   * {@code make1dmfIndexNA(String...)}, i.e. with {@code Index.NOT_ANALYZED}.
   *
   * <p>In the diagram below, the digits above each value appear to be character
   * offsets -- confirm against the tests that rely on them.
   */
  protected void makeIndexStrMV() throws Exception {

    //  0123
    // "abc"
    
    //  34567
    // "defg"

    //     111
    //  789012
    // "hijkl"
    make1dmfIndexNA( strMVValues );
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene AbstractTestCase.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.