|
Lucene example source code file (TestPayloads.java)
The Lucene TestPayloads.java source codepackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.IOException; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util._TestUtil; public class TestPayloads extends LuceneTestCase { // Simple tests to test the Payload class public void testPayload() throws Exception { byte[] testData = "This is a test!".getBytes(); Payload payload = new Payload(testData); assertEquals("Wrong payload length.", testData.length, payload.length()); // test copyTo() byte[] target = new byte[testData.length - 1]; try { payload.copyTo(target, 0); fail("Expected exception not thrown"); } catch (Exception expected) { // expected exception } target = new byte[testData.length + 3]; payload.copyTo(target, 3); for (int i = 0; i < testData.length; i++) { assertEquals(testData[i], target[i + 3]); } // test toByteArray() target = payload.toByteArray(); assertByteArrayEquals(testData, target); // test byteAt() for (int i = 0; i < testData.length; i++) { assertEquals(payload.byteAt(i), testData[i]); } try { payload.byteAt(testData.length + 1); fail("Expected exception not thrown"); } catch (Exception expected) { // expected exception } Payload clone = (Payload) payload.clone(); assertEquals(payload.length(), clone.length()); for (int i = 0; i < payload.length(); i++) { assertEquals(payload.byteAt(i), clone.byteAt(i)); } } // Tests whether the DocumentWriter and SegmentMerger correctly enable the // payload bit in the FieldInfo public void testPayloadFieldBit() throws Exception { Directory ram = newDirectory(); PayloadAnalyzer analyzer = new PayloadAnalyzer(); IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer)); Document d = new Document(); // this field won't have any payloads d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED)); // this field will have payloads in all docs, however not for all term positions, // so this field is used to check if the DocumentWriter correctly enables the payloads bit // even if only some term positions have payloads d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED)); d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED)); // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads // enabled in only some documents d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED)); // only add payload data for field f2 analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1); writer.addDocument(d); // flush writer.close(); SegmentReader reader = SegmentReader.getOnlySegmentReader(ram); FieldInfos fi = reader.fieldInfos(); assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads); assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads); assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads); reader.close(); // now we add another document which has payloads for field f3 and verify if the SegmentMerger // enabled payloads for that field writer = new IndexWriter(ram, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer).setOpenMode(OpenMode.CREATE)); d = new Document(); d.add(newField("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED)); d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED)); d.add(newField("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED)); d.add(newField("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED)); // add payload data for field f2 and f3 analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1); analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3); writer.addDocument(d); // force merge writer.optimize(); // flush writer.close(); reader = SegmentReader.getOnlySegmentReader(ram); fi = reader.fieldInfos(); assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads); assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads); assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads); reader.close(); ram.close(); } // Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory public void testPayloadsEncoding() throws Exception { // first perform the test using a RAMDirectory Directory dir = newDirectory(); performTest(dir); dir.close(); // now use a FSDirectory and repeat same test File dirName = _TestUtil.getTempDir("test_payloads"); dir = newFSDirectory(dirName); performTest(dir); _TestUtil.rmDir(dirName); dir.close(); } // builds an index with payloads in the given Directory and performs // different tests to verify the payload encoding private void performTest(Directory dir) throws Exception { PayloadAnalyzer analyzer = new PayloadAnalyzer(); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer) .setOpenMode(OpenMode.CREATE) .setMergePolicy(newLogMergePolicy())); // should be in sync with value in TermInfosWriter final int skipInterval = 16; final int numTerms = 5; final String fieldName = "f1"; int numDocs = skipInterval + 1; // create content for the test documents with just a few terms Term[] terms = generateTerms(fieldName, numTerms); StringBuilder sb = new StringBuilder(); for (int i = 0; i < terms.length; i++) { sb.append(terms[i].text); sb.append(" "); } String content = sb.toString(); int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2; byte[] payloadData = generateRandomData(payloadDataLength); Document d = new Document(); d.add(newField(fieldName, content, Field.Store.NO, Field.Index.ANALYZED)); // add the same document multiple times to have the same payload lengths for all // occurrences within two consecutive skip intervals int offset = 0; for (int i = 0; i < 2 * numDocs; i++) { analyzer.setPayloadData(fieldName, payloadData, offset, 1); offset += numTerms; writer.addDocument(d); } // make sure we create more than one segment to test merging writer.commit(); // now we make sure to have different payload lengths next at the next skip point for (int i = 0; i < numDocs; i++) { analyzer.setPayloadData(fieldName, payloadData, offset, i); offset += i * numTerms; writer.addDocument(d); } writer.optimize(); // flush writer.close(); /* * Verify the index * first we test if all payloads are stored correctly */ IndexReader reader = IndexReader.open(dir, true); byte[] verifyPayloadData = new byte[payloadDataLength]; offset = 0; TermPositions[] tps = new TermPositions[numTerms]; for (int i = 0; i < numTerms; i++) { tps[i] = reader.termPositions(terms[i]); } while (tps[0].next()) { for (int i = 1; i < numTerms; i++) { tps[i].next(); } int freq = tps[0].freq(); for (int i = 0; i < freq; i++) { for (int j = 0; j < numTerms; j++) { tps[j].nextPosition(); if (tps[j].isPayloadAvailable()) { tps[j].getPayload(verifyPayloadData, offset); offset += tps[j].getPayloadLength(); } } } } for (int i = 0; i < numTerms; i++) { tps[i].close(); } assertByteArrayEquals(payloadData, verifyPayloadData); /* * test lazy skipping */ TermPositions tp = reader.termPositions(terms[0]); tp.next(); tp.nextPosition(); // now we don't read this payload tp.nextPosition(); assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); byte[] payload = tp.getPayload(null, 0); assertEquals(payload[0], payloadData[numTerms]); tp.nextPosition(); // we don't read this payload and skip to a different document tp.skipTo(5); tp.nextPosition(); assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); payload = tp.getPayload(null, 0); assertEquals(payload[0], payloadData[5 * numTerms]); /* * Test different lengths at skip points */ tp.seek(terms[1]); tp.next(); tp.nextPosition(); assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); tp.skipTo(skipInterval - 1); tp.nextPosition(); assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); tp.skipTo(2 * skipInterval - 1); tp.nextPosition(); assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); tp.skipTo(3 * skipInterval - 1); tp.nextPosition(); assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength()); /* * Test multiple call of getPayload() */ tp.getPayload(null, 0); try { // it is forbidden to call getPayload() more than once // without calling nextPosition() tp.getPayload(null, 0); fail("Expected exception not thrown"); } catch (Exception expected) { // expected exception } reader.close(); // test long payload analyzer = new PayloadAnalyzer(); writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer).setOpenMode(OpenMode.CREATE)); String singleTerm = "lucene"; d = new Document(); d.add(newField(fieldName, singleTerm, Field.Store.NO, Field.Index.ANALYZED)); // add a payload whose length is greater than the buffer size of BufferedIndexOutput payloadData = generateRandomData(2000); analyzer.setPayloadData(fieldName, payloadData, 100, 1500); writer.addDocument(d); writer.optimize(); // flush writer.close(); reader = IndexReader.open(dir, true); tp = reader.termPositions(new Term(fieldName, singleTerm)); tp.next(); tp.nextPosition(); verifyPayloadData = new byte[tp.getPayloadLength()]; tp.getPayload(verifyPayloadData, 0); byte[] portion = new byte[1500]; System.arraycopy(payloadData, 100, portion, 0, 1500); assertByteArrayEquals(portion, verifyPayloadData); reader.close(); } private void generateRandomData(byte[] data) { random.nextBytes(data); } private byte[] generateRandomData(int n) { byte[] data = new byte[n]; generateRandomData(data); return data; } private Term[] generateTerms(String fieldName, int n) { int maxDigits = (int) (Math.log(n) / Math.log(10)); Term[] terms = new Term[n]; StringBuilder sb = new StringBuilder(); for (int i = 0; i < n; i++) { sb.setLength(0); sb.append("t"); int zeros = maxDigits - (int) (Math.log(i) / Math.log(10)); for (int j = 0; j < zeros; j++) { sb.append("0"); } sb.append(i); terms[i] = new Term(fieldName, sb.toString()); } return terms; } void assertByteArrayEquals(byte[] b1, byte[] b2) { if (b1.length != b2.length) { fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length); } for (int i = 0; i < b1.length; i++) { if (b1[i] != b2[i]) { fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]); } } } /** * This Analyzer uses an WhitespaceTokenizer and PayloadFilter. */ private static class PayloadAnalyzer extends Analyzer { Map<String,PayloadData> fieldToData = new HashMap Other Lucene examples (source code examples)Here is a short list of links related to this Lucene TestPayloads.java source code file: |
... this post is sponsored by my books ... | |
#1 New Release! |
FP Best Seller |
Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.
A percentage of advertising revenue from
pages under the /java/jwarehouse
URI on this website is
paid back to open source projects.