alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (TestPayloadSpans.java)

This example Lucene source code file (TestPayloadSpans.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

document, document, exception, indexreader, indexsearcher, io, randomindexwriter, spannearquery, spannearquery, spanquery, spanquery, spans, spantermquery, spantermquery, string, util

The Lucene TestPayloadSpans.java source code

package org.apache.lucene.search.spans;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.payloads.PayloadHelper;
import org.apache.lucene.search.payloads.PayloadSpanUtil;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.LuceneTestCase;

public class TestPayloadSpans extends LuceneTestCase {
  private IndexSearcher searcher;
  private Similarity similarity = new DefaultSimilarity();
  protected IndexReader indexReader;
  private IndexReader closeIndexReader;
  private Directory directory;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    PayloadHelper helper = new PayloadHelper();
    searcher = helper.setUp(random, similarity, 1000);
    indexReader = searcher.getIndexReader();
  }

  public void testSpanTermQuery() throws Exception {
    SpanTermQuery stq;
    Spans spans;
    stq = new SpanTermQuery(new Term(PayloadHelper.FIELD, "seventy"));
    spans = stq.getSpans(indexReader);
    assertTrue("spans is null and it shouldn't be", spans != null);
    checkSpans(spans, 100, 1, 1, 1);

    stq = new SpanTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "seventy"));  
    spans = stq.getSpans(indexReader);
    assertTrue("spans is null and it shouldn't be", spans != null);
    checkSpans(spans, 100, 0, 0, 0);
  }

  public void testSpanFirst() throws IOException {

    SpanQuery match;
    SpanFirstQuery sfq;
    match = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
    sfq = new SpanFirstQuery(match, 2);
    Spans spans = sfq.getSpans(indexReader);
    checkSpans(spans, 109, 1, 1, 1);
    //Test more complicated subclause
    SpanQuery[] clauses = new SpanQuery[2];
    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "hundred"));
    match = new SpanNearQuery(clauses, 0, true);
    sfq = new SpanFirstQuery(match, 2);
    checkSpans(sfq.getSpans(indexReader), 100, 2, 1, 1);

    match = new SpanNearQuery(clauses, 0, false);
    sfq = new SpanFirstQuery(match, 2);
    checkSpans(sfq.getSpans(indexReader), 100, 2, 1, 1);
    
  }
  
  public void testSpanNot() throws Exception {
    SpanQuery[] clauses = new SpanQuery[2];
    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "three"));
    SpanQuery spq = new SpanNearQuery(clauses, 5, true);
    SpanNotQuery snq = new SpanNotQuery(spq, new SpanTermQuery(new Term(PayloadHelper.FIELD, "two")));



    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
                                                     newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));

    Document doc = new Document();
    doc.add(newField(PayloadHelper.FIELD, "one two three one four three",
        Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    writer.close();

    checkSpans(snq.getSpans(reader), 1,new int[]{2});
    reader.close();
    directory.close();
  }
  
  public void testNestedSpans() throws Exception {
    SpanTermQuery stq;
    Spans spans;
    IndexSearcher searcher = getSearcher();
    stq = new SpanTermQuery(new Term(PayloadHelper.FIELD, "mark"));
    spans = stq.getSpans(searcher.getIndexReader());
    assertTrue("spans is null and it shouldn't be", spans != null);
    checkSpans(spans, 0, null);


    SpanQuery[] clauses = new SpanQuery[3];
    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr"));
    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy"));
    clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx"));
    SpanNearQuery spanNearQuery = new SpanNearQuery(clauses, 12, false);

    spans = spanNearQuery.getSpans(searcher.getIndexReader());
    assertTrue("spans is null and it shouldn't be", spans != null);
    checkSpans(spans, 2, new int[]{3,3});

     
    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx"));
    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr"));
    clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy"));

    spanNearQuery = new SpanNearQuery(clauses, 6, true);
   
    
    spans = spanNearQuery.getSpans(searcher.getIndexReader());
    assertTrue("spans is null and it shouldn't be", spans != null);
    checkSpans(spans, 1, new int[]{3});
     
    clauses = new SpanQuery[2];
     
    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx"));
    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "rr"));

    spanNearQuery = new SpanNearQuery(clauses, 6, true);
     
    // xx within 6 of rr
    
    SpanQuery[] clauses2 = new SpanQuery[2];
     
    clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "yy"));
    clauses2[1] = spanNearQuery;
     
    SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses2, 6, false);
    
    // yy within 6 of xx within 6 of rr

    spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader());
    assertTrue("spans is null and it shouldn't be", spans != null);
    checkSpans(spans, 2, new int[]{3,3});
    searcher.close();
    closeIndexReader.close();
    directory.close();
  }
  
  public void testFirstClauseWithoutPayload() throws Exception {
    Spans spans;
    IndexSearcher searcher = getSearcher();

    SpanQuery[] clauses = new SpanQuery[3];
    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "nopayload"));
    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "qq"));
    clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "ss"));

    SpanNearQuery spanNearQuery = new SpanNearQuery(clauses, 6, true);
    
    SpanQuery[] clauses2 = new SpanQuery[2];
     
    clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "pp"));
    clauses2[1] = spanNearQuery;

    SpanNearQuery snq = new SpanNearQuery(clauses2, 6, false);
    
    SpanQuery[] clauses3 = new SpanQuery[2];
     
    clauses3[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "np"));
    clauses3[1] = snq;
     
    SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses3, 6, false);

    spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader());
    assertTrue("spans is null and it shouldn't be", spans != null);
    checkSpans(spans, 1, new int[]{3});
    searcher.close();
    closeIndexReader.close();
    directory.close();
  }
  
  public void testHeavilyNestedSpanQuery() throws Exception {
    Spans spans;
    IndexSearcher searcher = getSearcher();

    SpanQuery[] clauses = new SpanQuery[3];
    clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "two"));
    clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "three"));

    SpanNearQuery spanNearQuery = new SpanNearQuery(clauses, 5, true);
   
    clauses = new SpanQuery[3];
    clauses[0] = spanNearQuery; 
    clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "five"));
    clauses[2] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "six"));

    SpanNearQuery spanNearQuery2 = new SpanNearQuery(clauses, 6, true);
     
    SpanQuery[] clauses2 = new SpanQuery[2];
    clauses2[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "eleven"));
    clauses2[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "ten"));
    SpanNearQuery spanNearQuery3 = new SpanNearQuery(clauses2, 2, false);
    
    SpanQuery[] clauses3 = new SpanQuery[3];
    clauses3[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "nine"));
    clauses3[1] = spanNearQuery2;
    clauses3[2] = spanNearQuery3;
     
    SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses3, 6, false);

    spans = nestedSpanNearQuery.getSpans(searcher.getIndexReader());
    assertTrue("spans is null and it shouldn't be", spans != null);
    checkSpans(spans, 2, new int[]{8, 8});
    searcher.close();
    closeIndexReader.close();
    directory.close();
  }
  
  public void testShrinkToAfterShortestMatch() throws CorruptIndexException,
      LockObtainFailedException, IOException {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
                                                     newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));

    Document doc = new Document();
    doc.add(new Field("content", new StringReader("a b c d e f g h i j a k")));
    writer.addDocument(doc);

    IndexReader reader = writer.getReader();
    IndexSearcher is = newSearcher(reader);
    writer.close();

    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
    SpanQuery[] sqs = { stq1, stq2 };
    SpanNearQuery snq = new SpanNearQuery(sqs, 1, true);
    Spans spans = snq.getSpans(is.getIndexReader());

    TopDocs topDocs = is.search(snq, 1);
    Set<String> payloadSet = new HashSet();
    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
      while (spans.next()) {
        Collection<byte[]> payloads = spans.getPayload();

        for (final byte [] payload : payloads) {
          payloadSet.add(new String(payload));
        }
      }
    }
    assertEquals(2, payloadSet.size());
    assertTrue(payloadSet.contains("a:Noise:10"));
    assertTrue(payloadSet.contains("k:Noise:11"));
    is.close();
    reader.close();
    directory.close();
  }
  
  public void testShrinkToAfterShortestMatch2() throws CorruptIndexException,
      LockObtainFailedException, IOException {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
                                                     newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));

    Document doc = new Document();
    doc.add(new Field("content", new StringReader("a b a d k f a h i k a k")));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    IndexSearcher is = newSearcher(reader);
    writer.close();

    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
    SpanQuery[] sqs = { stq1, stq2 };
    SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
    Spans spans = snq.getSpans(is.getIndexReader());

    TopDocs topDocs = is.search(snq, 1);
    Set<String> payloadSet = new HashSet();
    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
      while (spans.next()) {
        Collection<byte[]> payloads = spans.getPayload();
        for (final byte[] payload : payloads) {
          payloadSet.add(new String(payload));
        }
      }
    }
    assertEquals(2, payloadSet.size());
    assertTrue(payloadSet.contains("a:Noise:10"));
    assertTrue(payloadSet.contains("k:Noise:11"));
    is.close();
    reader.close();
    directory.close();
  }
  
  public void testShrinkToAfterShortestMatch3() throws CorruptIndexException,
      LockObtainFailedException, IOException {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
                                                     newIndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer()));

    Document doc = new Document();
    doc.add(new Field("content", new StringReader("j k a l f k k p a t a k l k t a")));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    IndexSearcher is = newSearcher(reader);
    writer.close();

    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
    SpanQuery[] sqs = { stq1, stq2 };
    SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
    Spans spans = snq.getSpans(is.getIndexReader());

    TopDocs topDocs = is.search(snq, 1);
    Set<String> payloadSet = new HashSet();
    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
      while (spans.next()) {
        Collection<byte[]> payloads = spans.getPayload();

        for (final byte [] payload : payloads) {
          payloadSet.add(new String(payload));
        }
      }
    }
    assertEquals(2, payloadSet.size());
    if(VERBOSE) {
      for (final String payload : payloadSet)
        System.out.println("match:" +  payload);
      
    }
    assertTrue(payloadSet.contains("a:Noise:10"));
    assertTrue(payloadSet.contains("k:Noise:11"));
    is.close();
    reader.close();
    directory.close();
  }
  
  public void testPayloadSpanUtil() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
                                                     newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));

    Document doc = new Document();
    doc.add(newField(PayloadHelper.FIELD,"xx rr yy mm  pp", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
  
    IndexReader reader = writer.getReader();
    writer.close();
    IndexSearcher searcher = newSearcher(reader);

    PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getIndexReader());
    
    Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(PayloadHelper.FIELD, "rr")));
    if(VERBOSE)
      System.out.println("Num payloads:" + payloads.size());
    for (final byte [] bytes : payloads) {
      if(VERBOSE)
        System.out.println(new String(bytes));
    }
    searcher.close();
    reader.close();
    directory.close();
  }

  private void checkSpans(Spans spans, int expectedNumSpans, int expectedNumPayloads,
                          int expectedPayloadLength, int expectedFirstByte) throws IOException {
    assertTrue("spans is null and it shouldn't be", spans != null);
    //each position match should have a span associated with it, since there is just one underlying term query, there should
    //only be one entry in the span
    int seen = 0;
    while (spans.next() == true)
    {
      //if we expect payloads, then isPayloadAvailable should be true
      if (expectedNumPayloads > 0) {
        assertTrue("isPayloadAvailable is not returning the correct value: " + spans.isPayloadAvailable()
                + " and it should be: " + (expectedNumPayloads >  0),
                spans.isPayloadAvailable() == true);
      } else {
        assertTrue("isPayloadAvailable should be false", spans.isPayloadAvailable() == false);
      }
      //See payload helper, for the PayloadHelper.FIELD field, there is a single byte payload at every token
      if (spans.isPayloadAvailable()) {
        Collection<byte[]> payload = spans.getPayload();
        assertTrue("payload Size: " + payload.size() + " is not: " + expectedNumPayloads, payload.size() == expectedNumPayloads);
        for (final byte [] thePayload : payload) {
          assertTrue("payload[0] Size: " + thePayload.length + " is not: " + expectedPayloadLength,
                  thePayload.length == expectedPayloadLength);
          assertTrue(thePayload[0] + " does not equal: " + expectedFirstByte, thePayload[0] == expectedFirstByte);

        }

      }
      seen++;
    }
    assertTrue(seen + " does not equal: " + expectedNumSpans, seen == expectedNumSpans);
  }
  
  private IndexSearcher getSearcher() throws Exception {
    directory = newDirectory();
    String[] docs = new String[]{"xx rr yy mm  pp","xx yy mm rr pp", "nopayload qq ss pp np", "one two three four five six seven eight nine ten eleven", "nine one two three four five six seven eight eleven ten"};
    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
                                                     newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer()).setSimilarity(similarity));

    Document doc = null;
    for(int i = 0; i < docs.length; i++) {
      doc = new Document();
      String docText = docs[i];
      doc.add(newField(PayloadHelper.FIELD,docText, Field.Store.YES, Field.Index.ANALYZED));
      writer.addDocument(doc);
    }

    closeIndexReader = writer.getReader();
    writer.close();

    IndexSearcher searcher = newSearcher(closeIndexReader);
    return searcher;
  }
  
  private void checkSpans(Spans spans, int numSpans, int[] numPayloads) throws IOException {
    int cnt = 0;

    while (spans.next() == true) {
      if(VERBOSE)
        System.out.println("\nSpans Dump --");
      if (spans.isPayloadAvailable()) {
        Collection<byte[]> payload = spans.getPayload();
        if(VERBOSE)
          System.out.println("payloads for span:" + payload.size());
        for (final byte [] bytes : payload) {
          if(VERBOSE)
            System.out.println("doc:" + spans.doc() + " s:" + spans.start() + " e:" + spans.end() + " "
              + new String(bytes));
        }

        assertEquals(numPayloads[cnt],payload.size());
      } else {
        assertFalse("Expected spans:" + numPayloads[cnt] + " found: 0",numPayloads.length > 0 && numPayloads[cnt] > 0 );
      }
      cnt++;
    }

    assertEquals(numSpans, cnt);
  }

  final class PayloadAnalyzer extends Analyzer {

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      TokenStream result = new LowerCaseTokenizer(TEST_VERSION_CURRENT, reader);
      result = new PayloadFilter(result, fieldName);
      return result;
    }
  }

  final class PayloadFilter extends TokenFilter {
    String fieldName;
    int numSeen = 0;
    Set<String> entities = new HashSet();
    Set<String> nopayload = new HashSet();
    int pos;
    PayloadAttribute payloadAtt;
    CharTermAttribute termAtt;
    PositionIncrementAttribute posIncrAtt;

    public PayloadFilter(TokenStream input, String fieldName) {
      super(input);
      this.fieldName = fieldName;
      pos = 0;
      entities.add("xx");
      entities.add("one");
      nopayload.add("nopayload");
      nopayload.add("np");
      termAtt = addAttribute(CharTermAttribute.class);
      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      payloadAtt = addAttribute(PayloadAttribute.class);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        String token = termAtt.toString();

        if (!nopayload.contains(token)) {
          if (entities.contains(token)) {
            payloadAtt.setPayload(new Payload((token + ":Entity:"+ pos ).getBytes()));
          } else {
            payloadAtt.setPayload(new Payload((token + ":Noise:" + pos ).getBytes()));
          }
        }
        pos += posIncrAtt.getPositionIncrement();
        return true;
      }
      return false;
    }
  }
  
  public final class TestPayloadAnalyzer extends Analyzer {

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      TokenStream result = new LowerCaseTokenizer(TEST_VERSION_CURRENT, reader);
      result = new PayloadFilter(result, fieldName);
      return result;
    }
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene TestPayloadSpans.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.