home | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (TestTermVectorsWriter.java)

This example Lucene source code file (TestTermVectorsWriter.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

directory, document, document, exception, field, field, indexreader, indexwriter, indexwriter, io, mockanalyzer, termpositionvector, termvectoroffsetinfo, test_version_current, test_version_current

The Lucene TestTermVectorsWriter.java source code

package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;

/** tests for writing term vectors */
public class TestTermVectorsWriter extends LuceneTestCase {
  // LUCENE-1442
  public void testDoubleOffsetCounting() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( 
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    Field f2 = newField("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f2);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);

    // Token "" occurred once
    assertEquals(1, termOffsets.length);
    assertEquals(8, termOffsets[0].getStartOffset());
    assertEquals(8, termOffsets[0].getEndOffset());

    // Token "abcd" occurred three times
    termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(1);
    assertEquals(3, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(4, termOffsets[1].getStartOffset());
    assertEquals(8, termOffsets[1].getEndOffset());
    assertEquals(8, termOffsets[2].getStartOffset());
    assertEquals(12, termOffsets[2].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1442
  public void testDoubleOffsetCounting2() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(5, termOffsets[1].getStartOffset());
    assertEquals(9, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionCharAnalyzer() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd   ", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random);
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    TokenStream stream = analyzer.tokenStream("field", new StringReader("abcd   "));
    stream.reset(); // TODO: wierd to reset before wrapping with CachingTokenFilter... correct?
    stream = new CachingTokenFilter(stream);
    Field f = new Field("field", stream, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }
  
  // LUCENE-1448
  public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
    MockDirectoryWrapper dir = newDirectory();
    Analyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd   ")));
    TokenStream sink = tee.newSinkTokenStream();
    Field f1 = new Field("field", tee, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = new Field("field", sink, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f1);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }
  
  // LUCENE-1448
  public void testEndOffsetPositionStopFilter() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( 
        TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT)));
    Document doc = new Document();
    Field f = newField("field", "abcd the", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(9, termOffsets[1].getStartOffset());
    assertEquals(13, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStandard() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( 
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd the  ", Field.Store.NO,
        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = newField("field", "crunch man", Field.Store.NO,
        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
    assertEquals(1, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(1);
    assertEquals(11, termOffsets[0].getStartOffset());
    assertEquals(17, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(2);
    assertEquals(18, termOffsets[0].getStartOffset());
    assertEquals(21, termOffsets[0].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStandardEmptyField() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( 
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "", Field.Store.NO,
                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = newField("field", "crunch man", Field.Store.NO,
        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
    assertEquals(1, termOffsets.length);
    assertEquals(1, termOffsets[0].getStartOffset());
    assertEquals(7, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(1);
    assertEquals(8, termOffsets[0].getStartOffset());
    assertEquals(11, termOffsets[0].getEndOffset());
    r.close();
    dir.close();
  }

  // LUCENE-1448
  public void testEndOffsetPositionStandardEmptyField2() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( 
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();

    Field f = newField("field", "abcd", Field.Store.NO,
                        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(newField("field", "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));

    Field f2 = newField("field", "crunch", Field.Store.NO,
        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f2);

    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
    assertEquals(1, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(1);
    assertEquals(6, termOffsets[0].getStartOffset());
    assertEquals(12, termOffsets[0].getEndOffset());
    r.close();
    dir.close();
  }
  
  // LUCENE-1168
  public void testTermVectorCorruption() throws IOException {

    Directory dir = newDirectory();
    for(int iter=0;iter<2;iter++) {
      IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
          TEST_VERSION_CURRENT, new MockAnalyzer(random))
          .setMaxBufferedDocs(2).setRAMBufferSizeMB(
              IndexWriterConfig.DISABLE_AUTO_FLUSH).setMergeScheduler(
              new SerialMergeScheduler()).setMergePolicy(
              new LogDocMergePolicy()));

      Document document = new Document();

      Field storedField = newField("stored", "stored", Field.Store.YES,
                                    Field.Index.NO);
      document.add(storedField);
      writer.addDocument(document);
      writer.addDocument(document);

      document = new Document();
      document.add(storedField);
      Field termVectorField = newField("termVector", "termVector",
                                        Field.Store.NO, Field.Index.NOT_ANALYZED,
                                        Field.TermVector.WITH_POSITIONS_OFFSETS);

      document.add(termVectorField);
      writer.addDocument(document);
      writer.optimize();
      writer.close();

      IndexReader reader = IndexReader.open(dir, true);
      for(int i=0;i<reader.numDocs();i++) {
        reader.document(i);
        reader.getTermFreqVectors(i);
      }
      reader.close();

      writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
          new MockAnalyzer(random)).setMaxBufferedDocs(2)
          .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
          .setMergeScheduler(new SerialMergeScheduler()).setMergePolicy(
              new LogDocMergePolicy()));

      Directory[] indexDirs = {new MockDirectoryWrapper(random, new RAMDirectory(dir))};
      writer.addIndexes(indexDirs);
      writer.optimize();
      writer.close();
    }
    dir.close();
  }

  // LUCENE-1168
  public void testTermVectorCorruption2() throws IOException {
    Directory dir = newDirectory();
    for(int iter=0;iter<2;iter++) {
      IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
          TEST_VERSION_CURRENT, new MockAnalyzer(random))
          .setMaxBufferedDocs(2).setRAMBufferSizeMB(
              IndexWriterConfig.DISABLE_AUTO_FLUSH).setMergeScheduler(
              new SerialMergeScheduler()).setMergePolicy(
              new LogDocMergePolicy()));

      Document document = new Document();

      Field storedField = newField("stored", "stored", Field.Store.YES,
                                    Field.Index.NO);
      document.add(storedField);
      writer.addDocument(document);
      writer.addDocument(document);

      document = new Document();
      document.add(storedField);
      Field termVectorField = newField("termVector", "termVector",
                                        Field.Store.NO, Field.Index.NOT_ANALYZED,
                                        Field.TermVector.WITH_POSITIONS_OFFSETS);
      document.add(termVectorField);
      writer.addDocument(document);
      writer.optimize();
      writer.close();

      IndexReader reader = IndexReader.open(dir, true);
      assertTrue(reader.getTermFreqVectors(0)==null);
      assertTrue(reader.getTermFreqVectors(1)==null);
      assertTrue(reader.getTermFreqVectors(2)!=null);
      reader.close();
    }
    dir.close();
  }

  // LUCENE-1168
  public void testTermVectorCorruption3() throws IOException {
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random))
        .setMaxBufferedDocs(2).setRAMBufferSizeMB(
            IndexWriterConfig.DISABLE_AUTO_FLUSH).setMergeScheduler(
            new SerialMergeScheduler()).setMergePolicy(new LogDocMergePolicy()));

    Document document = new Document();

    document = new Document();
    Field storedField = newField("stored", "stored", Field.Store.YES,
                                  Field.Index.NO);
    document.add(storedField);
    Field termVectorField = newField("termVector", "termVector",
                                      Field.Store.NO, Field.Index.NOT_ANALYZED,
                                      Field.TermVector.WITH_POSITIONS_OFFSETS);
    document.add(termVectorField);
    for(int i=0;i<10;i++)
      writer.addDocument(document);
    writer.close();

    writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
        new MockAnalyzer(random)).setMaxBufferedDocs(2)
        .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .setMergeScheduler(new SerialMergeScheduler()).setMergePolicy(
            new LogDocMergePolicy()));
    for(int i=0;i<6;i++)
      writer.addDocument(document);

    writer.optimize();
    writer.close();

    IndexReader reader = IndexReader.open(dir, true);
    for(int i=0;i<10;i++) {
      reader.getTermFreqVectors(i);
      reader.document(i);
    }
    reader.close();
    dir.close();
  }
  
  // LUCENE-1008
  public void testNoTermVectorAfterTermVector() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document document = new Document();
    document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.YES));
    iw.addDocument(document);
    document = new Document();
    document.add(newField("tvtest", "x y z", Field.Store.NO, Field.Index.ANALYZED,
                           Field.TermVector.NO));
    iw.addDocument(document);
    // Make first segment
    iw.commit();

    document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.YES));
    iw.addDocument(document);
    // Make 2nd segment
    iw.commit();

    iw.optimize();
    iw.close();
    dir.close();
  }

  // LUCENE-1010
  public void testNoTermVectorAfterTermVectorMerge() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document document = new Document();
    document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.YES));
    iw.addDocument(document);
    iw.commit();

    document = new Document();
    document.add(newField("tvtest", "x y z", Field.Store.NO, Field.Index.ANALYZED,
                           Field.TermVector.NO));
    iw.addDocument(document);
    // Make first segment
    iw.commit();

    iw.optimize();

    document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.YES));
    iw.addDocument(document);
    // Make 2nd segment
    iw.commit();
    iw.optimize();

    iw.close();
    dir.close();
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene TestTermVectorsWriter.java source code file:



my book on functional programming

 

new blog posts

 

Copyright 1998-2019 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.