Lucene example source code file (TestTermVectorsReader.java)

This example Lucene source code file (TestTermVectorsReader.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM. TestTermVectorsReader exercises Lucene's TermVectorsReader, verifying that per-document term vectors (terms, frequencies, positions, and offsets) written during indexing can be read back correctly from a segment.

Java - Lucene tags/keywords

docnumawaremapper, io, ioexception, override, size, string, termpositionvector, termvectorentry, termvectoroffsetinfo, termvectorsreader, testtoken, util

The Lucene TestTermVectorsReader.java source code

package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

public class TestTermVectorsReader extends LuceneTestCase {
  //testTerms must be lexicographically sorted; this is done in setUp() rather than maintained by hand here
  private String[] testFields = {"f1", "f2", "f3", "f4"};
  private boolean[] testFieldsStorePos = {true, false, true, false};
  private boolean[] testFieldsStoreOff = {true, false, false, true};
  private String[] testTerms = {"this", "is", "a", "test"};
  private int[][] positions = new int[testTerms.length][];
  private TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[testTerms.length][];
  private Directory dir;
  private String seg;
  private FieldInfos fieldInfos = new FieldInfos();
  private static int TERM_FREQ = 3;

  private class TestToken implements Comparable<TestToken> {
    String text;
    int pos;
    int startOffset;
    int endOffset;
    public int compareTo(TestToken other) {
      return pos - other.pos;
    }
  }

  TestToken[] tokens = new TestToken[testTerms.length * TERM_FREQ];

  @Override
  public void setUp() throws Exception {
    super.setUp();
    /*
    for (int i = 0; i < testFields.length; i++) {
      fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
    }
    */

    Arrays.sort(testTerms);
    int tokenUpto = 0;
    for (int i = 0; i < testTerms.length; i++) {
      positions[i] = new int[TERM_FREQ];
      offsets[i] = new TermVectorOffsetInfo[TERM_FREQ];
      for (int j = 0; j < TERM_FREQ; j++) {
        // the j-th occurrence gets a random position in [10*j, 10*j + 10),
        // so positions are always sorted in increasing order
        positions[i][j] = (int) (j * 10 + Math.random() * 10);
        // offsets are always sorted in increasing order
        offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
        TestToken token = tokens[tokenUpto++] = new TestToken();
        token.text = testTerms[i];
        token.pos = positions[i][j];
        token.startOffset = offsets[i][j].getStartOffset();
        token.endOffset = offsets[i][j].getEndOffset();
      }
    }
    Arrays.sort(tokens);

    dir = newDirectory();
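    // setMaxBufferedDocs(-1) disables flushing by document count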
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MyAnalyzer())
        .setMaxBufferedDocs(-1)
        .setMergePolicy(newLogMergePolicy(false, 10)));

    Document doc = new Document();
    for(int i=0;i<testFields.length;i++) {
      final Field.TermVector tv;
      if (testFieldsStorePos[i] && testFieldsStoreOff[i])
        tv = Field.TermVector.WITH_POSITIONS_OFFSETS;
      else if (testFieldsStorePos[i] && !testFieldsStoreOff[i])
        tv = Field.TermVector.WITH_POSITIONS;
      else if (!testFieldsStorePos[i] && testFieldsStoreOff[i])
        tv = Field.TermVector.WITH_OFFSETS;
      else
        tv = Field.TermVector.YES;
      doc.add(new Field(testFields[i], "", Field.Store.NO, Field.Index.ANALYZED, tv));
    }

    //Create 5 documents for testing, they all have the same
    //terms
    for(int j=0;j<5;j++)
      writer.addDocument(doc);
    writer.commit();
    seg = writer.newestSegment().name;
    writer.close();

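    // re-read the FieldInfos that the writer wrote for this segment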
    fieldInfos = new FieldInfos(dir, IndexFileNames.segmentFileName(seg, IndexFileNames.FIELD_INFOS_EXTENSION));
  }
  
  @Override
  public void tearDown() throws Exception {
    dir.close();
    super.tearDown();
  }

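  /**
   * A fixed TokenStream that replays the pre-built TestToken array, so the
   * indexed positions and offsets match the expected arrays exactly.
   */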
  private class MyTokenStream extends TokenStream {
    int tokenUpto;
    
    CharTermAttribute termAtt;
    PositionIncrementAttribute posIncrAtt;
    OffsetAttribute offsetAtt;
    
    public MyTokenStream() {
      termAtt = addAttribute(CharTermAttribute.class);
      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      offsetAtt = addAttribute(OffsetAttribute.class);
    }
    
    @Override
    public boolean incrementToken() {
      if (tokenUpto >= tokens.length)
        return false;
      else {
        final TestToken testToken = tokens[tokenUpto++];
        clearAttributes();
        termAtt.append(testToken.text);
        offsetAtt.setOffset(testToken.startOffset, testToken.endOffset);
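        // Lucene stores position increments rather than absolute positions:
        // emit the delta from the previous token, or pos + 1 for the first
        // token (positions start at -1, so -1 + (pos + 1) == pos)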
        if (tokenUpto > 1) {
          posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
        } else {
          posIncrAtt.setPositionIncrement(testToken.pos+1);
        }
        return true;
      }
    }
  }

  private class MyAnalyzer extends Analyzer {
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      return new MyTokenStream();
    }
  }

  public void test() throws IOException {
    //Check to see the files were created properly in setup
    assertTrue(dir.fileExists(IndexFileNames.segmentFileName(seg, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION)));
    assertTrue(dir.fileExists(IndexFileNames.segmentFileName(seg, IndexFileNames.VECTORS_INDEX_EXTENSION)));
  }

  public void testReader() throws IOException {
    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
    for (int j = 0; j < 5; j++) {
      TermFreqVector vector = reader.get(j, testFields[0]);
      assertTrue(vector != null);
      String[] terms = vector.getTerms();
      assertTrue(terms != null);
      assertTrue(terms.length == testTerms.length);
      for (int i = 0; i < terms.length; i++) {
        String term = terms[i];
        //System.out.println("Term: " + term);
        assertTrue(term.equals(testTerms[i]));
      }
    }
    reader.close();
  }

  public void testPositionReader() throws IOException {
    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
    TermPositionVector vector;
    String[] terms;
    vector = (TermPositionVector) reader.get(0, testFields[0]);
    assertTrue(vector != null);
    terms = vector.getTerms();
    assertTrue(terms != null);
    assertTrue(terms.length == testTerms.length);
    for (int i = 0; i < terms.length; i++) {
      String term = terms[i];
      //System.out.println("Term: " + term);
      assertTrue(term.equals(testTerms[i]));
      int[] positions = vector.getTermPositions(i);
      assertTrue(positions != null);
      assertTrue(positions.length == this.positions[i].length);
      for (int j = 0; j < positions.length; j++) {
        int position = positions[j];
        assertTrue(position == this.positions[i][j]);
      }
      TermVectorOffsetInfo[] offset = vector.getOffsets(i);
      assertTrue(offset != null);
      assertTrue(offset.length == this.offsets[i].length);
      for (int j = 0; j < offset.length; j++) {
        TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
        assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
      }
    }

    TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset
    assertTrue(freqVector != null);
    assertFalse(freqVector instanceof TermPositionVector);
    terms = freqVector.getTerms();
    assertTrue(terms != null);
    assertTrue(terms.length == testTerms.length);
    for (int i = 0; i < terms.length; i++) {
      String term = terms[i];
      //System.out.println("Term: " + term);
      assertTrue(term.equals(testTerms[i]));
    }
    reader.close();
  }

  public void testOffsetReader() throws IOException {
    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
    TermPositionVector vector = (TermPositionVector) reader.get(0, testFields[0]);
    assertTrue(vector != null);
    String[] terms = vector.getTerms();
    assertTrue(terms != null);
    assertTrue(terms.length == testTerms.length);
    for (int i = 0; i < terms.length; i++) {
      String term = terms[i];
      //System.out.println("Term: " + term);
      assertTrue(term.equals(testTerms[i]));
      int[] positions = vector.getTermPositions(i);
      assertTrue(positions != null);
      assertTrue(positions.length == this.positions[i].length);
      for (int j = 0; j < positions.length; j++) {
        int position = positions[j];
        assertTrue(position == this.positions[i][j]);
      }
      TermVectorOffsetInfo[] offset = vector.getOffsets(i);
      assertTrue(offset != null);
      assertTrue(offset.length == this.offsets[i].length);
      for (int j = 0; j < offset.length; j++) {
        TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
        assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
      }
    }
    reader.close();
  }

  public void testMapper() throws IOException {
    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
    SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
    reader.get(0, mapper);
    SortedSet<TermVectorEntry> set = mapper.getTermVectorEntrySet();
    assertTrue("set is null and it shouldn't be", set != null);
    //four fields, 4 terms, all terms are the same
    assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
    //Check offsets and positions
    for (Iterator<TermVectorEntry> iterator = set.iterator(); iterator.hasNext();) {
      TermVectorEntry tve =  iterator.next();
      assertTrue("tve is null and it shouldn't be", tve != null);
      assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
      assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);

    }

    mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
    reader.get(1, mapper);
    set = mapper.getTermVectorEntrySet();
    assertTrue("set is null and it shouldn't be", set != null);
    //four fields, 4 terms, all terms are the same
    assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
    //Should have offsets and positions b/c we are munging all the fields together
    for (Iterator<TermVectorEntry> iterator = set.iterator(); iterator.hasNext();) {
      TermVectorEntry tve = iterator.next();
      assertTrue("tve is null and it shouldn't be", tve != null);
      assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
      assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);

    }


    FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
    reader.get(0, fsMapper);
    Map<String,SortedSet<TermVectorEntry>> map = fsMapper.getFieldToTerms();
    assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
    for (Map.Entry<String,SortedSet<TermVectorEntry>> entry : map.entrySet()) {
      SortedSet<TermVectorEntry> sortedSet =  entry.getValue();
      assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
      for (final TermVectorEntry tve : sortedSet) {
        assertTrue("tve is null and it shouldn't be", tve != null);
        //Check offsets and positions.
        assertTrue("tve is null and it shouldn't be", tve != null);
        String field = tve.getField();
        if (field.equals(testFields[0])) {
          //should have offsets

          assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
          assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
        }
        else if (field.equals(testFields[1])) {
          //should not have offsets

          assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
          assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
        }
      }
    }
    //Try mapper that ignores offs and positions
    fsMapper = new FieldSortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator());
    reader.get(0, fsMapper);
    map = fsMapper.getFieldToTerms();
    assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
    for (final Map.Entry<String,SortedSet<TermVectorEntry>> entry : map.entrySet()) {
      SortedSet<TermVectorEntry> sortedSet =  entry.getValue();
      assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
      for (final TermVectorEntry tve : sortedSet) {
        assertTrue("tve is null and it shouldn't be", tve != null);
        //this mapper ignores offsets and positions, so both should be null
        //for every field, even f1, which stored them
        String field = tve.getField();
        if (field.equals(testFields[0])) {
          assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
          assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
        }
        else if (field.equals(testFields[1])) {
          assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
          assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
        }
      }
      }
    }

    // test setDocumentNumber()
    IndexReader ir = IndexReader.open(dir, true);
    DocNumAwareMapper docNumAwareMapper = new DocNumAwareMapper();
    assertEquals(-1, docNumAwareMapper.getDocumentNumber());

    ir.getTermFreqVector(0, docNumAwareMapper);
    assertEquals(0, docNumAwareMapper.getDocumentNumber());
    docNumAwareMapper.setDocumentNumber(-1);

    ir.getTermFreqVector(1, docNumAwareMapper);
    assertEquals(1, docNumAwareMapper.getDocumentNumber());
    docNumAwareMapper.setDocumentNumber(-1);

    ir.getTermFreqVector(0, "f1", docNumAwareMapper);
    assertEquals(0, docNumAwareMapper.getDocumentNumber());
    docNumAwareMapper.setDocumentNumber(-1);

    ir.getTermFreqVector(1, "f2", docNumAwareMapper);
    assertEquals(1, docNumAwareMapper.getDocumentNumber());
    docNumAwareMapper.setDocumentNumber(-1);

    ir.getTermFreqVector(0, "f1", docNumAwareMapper);
    assertEquals(0, docNumAwareMapper.getDocumentNumber());

    ir.close();
    reader.close();
  }


  /**
   * Make sure exceptions and bad params are handled appropriately
   */
  public void testBadParams() throws IOException {
    TermVectorsReader reader = null;
    try {
      reader = new TermVectorsReader(dir, seg, fieldInfos);
      //Bad document number, good field number
      reader.get(50, testFields[0]);
      fail();
    } catch (IOException e) {
      // expected exception
    } catch (IllegalArgumentException e) {
      // mmapdir will give us this from java.nio.Buffer.position()
    } finally {
      reader.close();
    }
    try {
      reader = new TermVectorsReader(dir, seg, fieldInfos);
      //Bad document number, no field
      reader.get(50);
      fail();
    } catch (IOException e) {
      // expected exception
    } catch (IllegalArgumentException e) {
      // mmapdir will give us this from java.nio.Buffer.position()
    } finally {
      reader.close();
    }
    try {
      reader = new TermVectorsReader(dir, seg, fieldInfos);
      //good document number, bad field name: should return null, not throw
      TermFreqVector vector = reader.get(0, "f50");
      assertTrue(vector == null);
    } catch (IOException e) {
      fail();
    } finally {
      reader.close();
    }
  }


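  /**
   * A TermVectorMapper that records the document number it was handed; the
   * mapper methods throw if setDocumentNumber() was not called first.
   */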
  public static class DocNumAwareMapper extends TermVectorMapper {

    public DocNumAwareMapper() {
    }

    private int documentNumber = -1;

    @Override
    public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
      if (documentNumber == -1) {
        throw new RuntimeException("Documentnumber should be set at this point!");
      }
    }

    @Override
    public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
      if (documentNumber == -1) {
        throw new RuntimeException("Documentnumber should be set at this point!");
      }
    }

    public int getDocumentNumber() {
      return documentNumber;
    }

    @Override
    public void setDocumentNumber(int documentNumber) {
      this.documentNumber = documentNumber;
    }
  }
}
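
Reading term vectors: a usage sketch

For context, here is a minimal sketch of how term vectors like the ones exercised above are typically read back through the public Lucene 3.x API. The index path and field name ("f1") are illustrative placeholders, not values taken from the test:

import java.io.File;
import java.util.Arrays;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.store.FSDirectory;

public class TermVectorDemo {
  public static void main(String[] args) throws Exception {
    // open an existing index read-only; the path is a placeholder
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")), true);
    try {
      // fetch the term vector for field "f1" of document 0
      TermFreqVector vector = reader.getTermFreqVector(0, "f1");
      if (vector != null) {
        String[] terms = vector.getTerms();
        int[] freqs = vector.getTermFrequencies();
        for (int i = 0; i < terms.length; i++) {
          System.out.println(terms[i] + " x " + freqs[i]);
          // if positions were stored, the vector is also a TermPositionVector
          if (vector instanceof TermPositionVector) {
            int[] positions = ((TermPositionVector) vector).getTermPositions(i);
            System.out.println("  positions: " + Arrays.toString(positions));
          }
        }
      }
    } finally {
      reader.close();
    }
  }
}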
