Lucene example source code file: TestTermVectorsReader.java
The Lucene TestTermVectorsReader.java source codepackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Reader; import java.util.Arrays; import java.util.Iterator; import java.util.Map; import java.util.SortedSet; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; public class TestTermVectorsReader extends LuceneTestCase { //Must be lexicographically sorted, will do in setup, versus trying to maintain here private String[] testFields = {"f1", "f2", "f3", "f4"}; private boolean[] testFieldsStorePos = {true, false, true, false}; private boolean[] testFieldsStoreOff = {true, false, false, true}; private String[] testTerms = {"this", "is", "a", "test"}; private int[][] positions = new int[testTerms.length][]; private TermVectorOffsetInfo[][] offsets = new 
TermVectorOffsetInfo[testTerms.length][]; private Directory dir; private String seg; private FieldInfos fieldInfos = new FieldInfos(); private static int TERM_FREQ = 3; private class TestToken implements Comparable<TestToken> { String text; int pos; int startOffset; int endOffset; public int compareTo(TestToken other) { return pos - other.pos; } } TestToken[] tokens = new TestToken[testTerms.length * TERM_FREQ]; @Override public void setUp() throws Exception { super.setUp(); /* for (int i = 0; i < testFields.length; i++) { fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]); } */ Arrays.sort(testTerms); int tokenUpto = 0; for (int i = 0; i < testTerms.length; i++) { positions[i] = new int[TERM_FREQ]; offsets[i] = new TermVectorOffsetInfo[TERM_FREQ]; // first position must be 0 for (int j = 0; j < TERM_FREQ; j++) { // positions are always sorted in increasing order positions[i][j] = (int) (j * 10 + Math.random() * 10); // offsets are always sorted in increasing order offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length()); TestToken token = tokens[tokenUpto++] = new TestToken(); token.text = testTerms[i]; token.pos = positions[i][j]; token.startOffset = offsets[i][j].getStartOffset(); token.endOffset = offsets[i][j].getEndOffset(); } } Arrays.sort(tokens); dir = newDirectory(); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MyAnalyzer()).setMaxBufferedDocs(-1).setMergePolicy(newLogMergePolicy(false, 10))); Document doc = new Document(); for(int i=0;i<testFields.length;i++) { final Field.TermVector tv; if (testFieldsStorePos[i] && testFieldsStoreOff[i]) tv = Field.TermVector.WITH_POSITIONS_OFFSETS; else if (testFieldsStorePos[i] && !testFieldsStoreOff[i]) tv = Field.TermVector.WITH_POSITIONS; else if (!testFieldsStorePos[i] && testFieldsStoreOff[i]) tv = Field.TermVector.WITH_OFFSETS; else tv = Field.TermVector.YES; doc.add(new Field(testFields[i], "", 
Field.Store.NO, Field.Index.ANALYZED, tv)); } //Create 5 documents for testing, they all have the same //terms for(int j=0;j<5;j++) writer.addDocument(doc); writer.commit(); seg = writer.newestSegment().name; writer.close(); fieldInfos = new FieldInfos(dir, IndexFileNames.segmentFileName(seg, IndexFileNames.FIELD_INFOS_EXTENSION)); } @Override public void tearDown() throws Exception { dir.close(); super.tearDown(); } private class MyTokenStream extends TokenStream { int tokenUpto; CharTermAttribute termAtt; PositionIncrementAttribute posIncrAtt; OffsetAttribute offsetAtt; public MyTokenStream() { termAtt = addAttribute(CharTermAttribute.class); posIncrAtt = addAttribute(PositionIncrementAttribute.class); offsetAtt = addAttribute(OffsetAttribute.class); } @Override public boolean incrementToken() { if (tokenUpto >= tokens.length) return false; else { final TestToken testToken = tokens[tokenUpto++]; clearAttributes(); termAtt.append(testToken.text); offsetAtt.setOffset(testToken.startOffset, testToken.endOffset); if (tokenUpto > 1) { posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos); } else { posIncrAtt.setPositionIncrement(testToken.pos+1); } return true; } } } private class MyAnalyzer extends Analyzer { @Override public TokenStream tokenStream(String fieldName, Reader reader) { return new MyTokenStream(); } } public void test() throws IOException { //Check to see the files were created properly in setup assertTrue(dir.fileExists(IndexFileNames.segmentFileName(seg, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION))); assertTrue(dir.fileExists(IndexFileNames.segmentFileName(seg, IndexFileNames.VECTORS_INDEX_EXTENSION))); } public void testReader() throws IOException { TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); for (int j = 0; j < 5; j++) { TermFreqVector vector = reader.get(j, testFields[0]); assertTrue(vector != null); String[] terms = vector.getTerms(); assertTrue(terms != null); assertTrue(terms.length == 
testTerms.length); for (int i = 0; i < terms.length; i++) { String term = terms[i]; //System.out.println("Term: " + term); assertTrue(term.equals(testTerms[i])); } } reader.close(); } public void testPositionReader() throws IOException { TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); TermPositionVector vector; String[] terms; vector = (TermPositionVector) reader.get(0, testFields[0]); assertTrue(vector != null); terms = vector.getTerms(); assertTrue(terms != null); assertTrue(terms.length == testTerms.length); for (int i = 0; i < terms.length; i++) { String term = terms[i]; //System.out.println("Term: " + term); assertTrue(term.equals(testTerms[i])); int[] positions = vector.getTermPositions(i); assertTrue(positions != null); assertTrue(positions.length == this.positions[i].length); for (int j = 0; j < positions.length; j++) { int position = positions[j]; assertTrue(position == this.positions[i][j]); } TermVectorOffsetInfo[] offset = vector.getOffsets(i); assertTrue(offset != null); assertTrue(offset.length == this.offsets[i].length); for (int j = 0; j < offset.length; j++) { TermVectorOffsetInfo termVectorOffsetInfo = offset[j]; assertTrue(termVectorOffsetInfo.equals(offsets[i][j])); } } TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset assertTrue(freqVector != null); assertTrue(freqVector instanceof TermPositionVector == false); terms = freqVector.getTerms(); assertTrue(terms != null); assertTrue(terms.length == testTerms.length); for (int i = 0; i < terms.length; i++) { String term = terms[i]; //System.out.println("Term: " + term); assertTrue(term.equals(testTerms[i])); } reader.close(); } public void testOffsetReader() throws IOException { TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); TermPositionVector vector = (TermPositionVector) reader.get(0, testFields[0]); assertTrue(vector != null); String[] terms = vector.getTerms(); assertTrue(terms != null); assertTrue(terms.length == 
testTerms.length); for (int i = 0; i < terms.length; i++) { String term = terms[i]; //System.out.println("Term: " + term); assertTrue(term.equals(testTerms[i])); int[] positions = vector.getTermPositions(i); assertTrue(positions != null); assertTrue(positions.length == this.positions[i].length); for (int j = 0; j < positions.length; j++) { int position = positions[j]; assertTrue(position == this.positions[i][j]); } TermVectorOffsetInfo[] offset = vector.getOffsets(i); assertTrue(offset != null); assertTrue(offset.length == this.offsets[i].length); for (int j = 0; j < offset.length; j++) { TermVectorOffsetInfo termVectorOffsetInfo = offset[j]; assertTrue(termVectorOffsetInfo.equals(offsets[i][j])); } } reader.close(); } public void testMapper() throws IOException { TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator()); reader.get(0, mapper); SortedSet<TermVectorEntry> set = mapper.getTermVectorEntrySet(); assertTrue("set is null and it shouldn't be", set != null); //three fields, 4 terms, all terms are the same assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4); //Check offsets and positions for (Iterator<TermVectorEntry> iterator = set.iterator(); iterator.hasNext();) { TermVectorEntry tve = iterator.next(); assertTrue("tve is null and it shouldn't be", tve != null); assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null); assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null); } mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator()); reader.get(1, mapper); set = mapper.getTermVectorEntrySet(); assertTrue("set is null and it shouldn't be", set != null); //three fields, 4 terms, all terms are the same assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4); //Should have offsets and positions b/c we are munging all the 
fields together for (Iterator<TermVectorEntry> iterator = set.iterator(); iterator.hasNext();) { TermVectorEntry tve = iterator.next(); assertTrue("tve is null and it shouldn't be", tve != null); assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null); assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null); } FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator()); reader.get(0, fsMapper); Map<String,SortedSet Other Lucene examples (source code examples)Here is a short list of links related to this Lucene TestTermVectorsReader.java source code file: |
This post is sponsored by my books: a #1 new release and a functional-programming (FP) best seller.
Copyright 1998-2024 Alvin Alexander, alvinalexander.com
All Rights Reserved.
A percentage of advertising revenue from
pages under the /java/jwarehouse
URI on this website is
paid back to open source projects.