alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (NearSpansOrdered.java)

This example Lucene source code file (NearSpansOrdered.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

advance, check, collection, collection, comparator, indexreader, io, ioexception, ioexception, nearspansordered, override, override, set, spans, spans, util

The Lucene NearSpansOrdered.java source code

package org.apache.lucene.search.spans;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.ArrayUtil;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Collection;
import java.util.Set;

/** A Spans that is formed from the ordered subspans of a SpanNearQuery
 * where the subspans do not overlap and have a maximum slop between them.
 * <p>
 * The formed spans only contains minimum slop matches.<br>
 * The matching slop is computed from the distance(s) between
 * the non overlapping matching Spans.<br>
 * Successive matches are always formed from the successive Spans
 * of the SpanNearQuery.
 * <p>
 * The formed spans may contain overlaps when the slop is at least 1.
 * For example, when querying using
 * <pre>t1 t2 t3
* with slop at least 1, the fragment: * <pre>t1 t2 t1 t3 t2 t3 * matches twice: * <pre>t1 t2 .. t3 * <pre> t1 .. t2 t3 * * * Expert: * Only public for subclassing. Most implementations should not need this class */ public class NearSpansOrdered extends Spans { private final int allowedSlop; private boolean firstTime = true; private boolean more = false; /** The spans in the same order as the SpanNearQuery */ private final Spans[] subSpans; /** Indicates that all subSpans have same doc() */ private boolean inSameDoc = false; private int matchDoc = -1; private int matchStart = -1; private int matchEnd = -1; private List<byte[]> matchPayload; private final Spans[] subSpansByDoc; private final Comparator<Spans> spanDocComparator = new Comparator() { public int compare(Spans o1, Spans o2) { return o1.doc() - o2.doc(); } }; private SpanNearQuery query; private boolean collectPayloads = true; public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader) throws IOException { this(spanNearQuery, reader, true); } public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader, boolean collectPayloads) throws IOException { if (spanNearQuery.getClauses().length < 2) { throw new IllegalArgumentException("Less than 2 clauses: " + spanNearQuery); } this.collectPayloads = collectPayloads; allowedSlop = spanNearQuery.getSlop(); SpanQuery[] clauses = spanNearQuery.getClauses(); subSpans = new Spans[clauses.length]; matchPayload = new LinkedList<byte[]>(); subSpansByDoc = new Spans[clauses.length]; for (int i = 0; i < clauses.length; i++) { subSpans[i] = clauses[i].getSpans(reader); subSpansByDoc[i] = subSpans[i]; // used in toSameDoc() } query = spanNearQuery; // kept for toString() only. } // inherit javadocs @Override public int doc() { return matchDoc; } // inherit javadocs @Override public int start() { return matchStart; } // inherit javadocs @Override public int end() { return matchEnd; } public Spans[] getSubSpans() { return subSpans; } // TODO: Remove warning after API has been finalized // TODO: Would be nice to be able to lazy load payloads @Override public Collection<byte[]> getPayload() throws IOException { return matchPayload; } // TODO: Remove warning after API has been finalized @Override public boolean isPayloadAvailable() { return matchPayload.isEmpty() == false; } // inherit javadocs @Override public boolean next() throws IOException { if (firstTime) { firstTime = false; for (int i = 0; i < subSpans.length; i++) { if (! subSpans[i].next()) { more = false; return false; } } more = true; } if(collectPayloads) { matchPayload.clear(); } return advanceAfterOrdered(); } // inherit javadocs @Override public boolean skipTo(int target) throws IOException { if (firstTime) { firstTime = false; for (int i = 0; i < subSpans.length; i++) { if (! subSpans[i].skipTo(target)) { more = false; return false; } } more = true; } else if (more && (subSpans[0].doc() < target)) { if (subSpans[0].skipTo(target)) { inSameDoc = false; } else { more = false; return false; } } if(collectPayloads) { matchPayload.clear(); } return advanceAfterOrdered(); } /** Advances the subSpans to just after an ordered match with a minimum slop * that is smaller than the slop allowed by the SpanNearQuery. * @return true iff there is such a match. */ private boolean advanceAfterOrdered() throws IOException { while (more && (inSameDoc || toSameDoc())) { if (stretchToOrder() && shrinkToAfterShortestMatch()) { return true; } } return false; // no more matches } /** Advance the subSpans to the same document */ private boolean toSameDoc() throws IOException { ArrayUtil.mergeSort(subSpansByDoc, spanDocComparator); int firstIndex = 0; int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc(); while (subSpansByDoc[firstIndex].doc() != maxDoc) { if (! subSpansByDoc[firstIndex].skipTo(maxDoc)) { more = false; inSameDoc = false; return false; } maxDoc = subSpansByDoc[firstIndex].doc(); if (++firstIndex == subSpansByDoc.length) { firstIndex = 0; } } for (int i = 0; i < subSpansByDoc.length; i++) { assert (subSpansByDoc[i].doc() == maxDoc) : " NearSpansOrdered.toSameDoc() spans " + subSpansByDoc[0] + "\n at doc " + subSpansByDoc[i].doc() + ", but should be at " + maxDoc; } inSameDoc = true; return true; } /** Check whether two Spans in the same document are ordered. * @param spans1 * @param spans2 * @return true iff spans1 starts before spans2 * or the spans start at the same position, * and spans1 ends before spans2. */ static final boolean docSpansOrdered(Spans spans1, Spans spans2) { assert spans1.doc() == spans2.doc() : "doc1 " + spans1.doc() + " != doc2 " + spans2.doc(); int start1 = spans1.start(); int start2 = spans2.start(); /* Do not call docSpansOrdered(int,int,int,int) to avoid invoking .end() : */ return (start1 == start2) ? (spans1.end() < spans2.end()) : (start1 < start2); } /** Like {@link #docSpansOrdered(Spans,Spans)}, but use the spans * starts and ends as parameters. */ private static final boolean docSpansOrdered(int start1, int end1, int start2, int end2) { return (start1 == start2) ? (end1 < end2) : (start1 < start2); } /** Order the subSpans within the same document by advancing all later spans * after the previous one. */ private boolean stretchToOrder() throws IOException { matchDoc = subSpans[0].doc(); for (int i = 1; inSameDoc && (i < subSpans.length); i++) { while (! docSpansOrdered(subSpans[i-1], subSpans[i])) { if (! subSpans[i].next()) { inSameDoc = false; more = false; break; } else if (matchDoc != subSpans[i].doc()) { inSameDoc = false; break; } } } return inSameDoc; } /** The subSpans are ordered in the same doc, so there is a possible match. * Compute the slop while making the match as short as possible by advancing * all subSpans except the last one in reverse order. */ private boolean shrinkToAfterShortestMatch() throws IOException { matchStart = subSpans[subSpans.length - 1].start(); matchEnd = subSpans[subSpans.length - 1].end(); Set<byte[]> possibleMatchPayloads = new HashSet(); if (subSpans[subSpans.length - 1].isPayloadAvailable()) { possibleMatchPayloads.addAll(subSpans[subSpans.length - 1].getPayload()); } Collection<byte[]> possiblePayload = null; int matchSlop = 0; int lastStart = matchStart; int lastEnd = matchEnd; for (int i = subSpans.length - 2; i >= 0; i--) { Spans prevSpans = subSpans[i]; if (collectPayloads && prevSpans.isPayloadAvailable()) { Collection<byte[]> payload = prevSpans.getPayload(); possiblePayload = new ArrayList<byte[]>(payload.size()); possiblePayload.addAll(payload); } int prevStart = prevSpans.start(); int prevEnd = prevSpans.end(); while (true) { // Advance prevSpans until after (lastStart, lastEnd) if (! prevSpans.next()) { inSameDoc = false; more = false; break; // Check remaining subSpans for final match. } else if (matchDoc != prevSpans.doc()) { inSameDoc = false; // The last subSpans is not advanced here. break; // Check remaining subSpans for last match in this document. } else { int ppStart = prevSpans.start(); int ppEnd = prevSpans.end(); // Cannot avoid invoking .end() if (! docSpansOrdered(ppStart, ppEnd, lastStart, lastEnd)) { break; // Check remaining subSpans. } else { // prevSpans still before (lastStart, lastEnd) prevStart = ppStart; prevEnd = ppEnd; if (collectPayloads && prevSpans.isPayloadAvailable()) { Collection<byte[]> payload = prevSpans.getPayload(); possiblePayload = new ArrayList<byte[]>(payload.size()); possiblePayload.addAll(payload); } } } } if (collectPayloads && possiblePayload != null) { possibleMatchPayloads.addAll(possiblePayload); } assert prevStart <= matchStart; if (matchStart > prevEnd) { // Only non overlapping spans add to slop. matchSlop += (matchStart - prevEnd); } /* Do not break on (matchSlop > allowedSlop) here to make sure * that subSpans[0] is advanced after the match, if any. */ matchStart = prevStart; lastStart = prevStart; lastEnd = prevEnd; } boolean match = matchSlop <= allowedSlop; if(collectPayloads && match && possibleMatchPayloads.size() > 0) { matchPayload.addAll(possibleMatchPayloads); } return match; // ordered and allowed slop } @Override public String toString() { return getClass().getName() + "("+query.toString()+")@"+ (firstTime?"START":(more?(doc()+":"+start()+"-"+end()):"END")); } }

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene NearSpansOrdered.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.