|
Lucene example source code file (FreqProxTermsWriter.java)
This example Lucene source code file (FreqProxTermsWriter.java) is included in the DevDaily.com
"Java Source Code
Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.
The Lucene FreqProxTermsWriter.java source code
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.UnicodeUtil;
import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.BitVector;
import org.apache.lucene.util.CollectionUtil;
final class FreqProxTermsWriter extends TermsHashConsumer {
@Override
public TermsHashConsumerPerThread addThread(TermsHashPerThread perThread) {
return new FreqProxTermsWriterPerThread(perThread);
}
private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) {
while(true) {
final char c1 = text1[pos1++];
final char c2 = text2[pos2++];
if (c1 != c2) {
if (0xffff == c2)
return 1;
else if (0xffff == c1)
return -1;
else
return c1-c2;
} else if (0xffff == c1)
return 0;
}
}
@Override
void abort() {}
// TODO: would be nice to factor out more of this, eg the
// FreqProxFieldMergeState, and code to visit all Fields
// under the same FieldInfo together, up into TermsHash*.
// Other writers would presumably share alot of this...
@Override
public void flush(Map<TermsHashConsumerPerThread,Collection threadsAndFields, final SegmentWriteState state) throws IOException {
// Gather all FieldData's that have postings, across all
// ThreadStates
List<FreqProxTermsWriterPerField> allFields = new ArrayList();
for (Map.Entry<TermsHashConsumerPerThread,Collection entry : threadsAndFields.entrySet()) {
Collection<TermsHashConsumerPerField> fields = entry.getValue();
for (final TermsHashConsumerPerField i : fields) {
final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) i;
if (perField.termsHashPerField.numPostings > 0)
allFields.add(perField);
}
}
// Sort by field name
CollectionUtil.quickSort(allFields);
final int numAllFields = allFields.size();
// TODO: allow Lucene user to customize this consumer:
final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);
/*
Current writer chain:
FormatPostingsFieldsConsumer
-> IMPL: FormatPostingsFieldsWriter
-> FormatPostingsTermsConsumer
-> IMPL: FormatPostingsTermsWriter
-> FormatPostingsDocConsumer
-> IMPL: FormatPostingsDocWriter
-> FormatPostingsPositionsConsumer
-> IMPL: FormatPostingsPositionsWriter
*/
try {
int start = 0;
while(start < numAllFields) {
final FieldInfo fieldInfo = allFields.get(start).fieldInfo;
final String fieldName = fieldInfo.name;
int end = start+1;
while(end < numAllFields && allFields.get(end).fieldInfo.name.equals(fieldName))
end++;
FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end-start];
for(int i=start;i<end;i++) {
fields[i-start] = allFields.get(i);
// Aggregate the storePayload as seen by the same
// field across multiple threads
if (!fieldInfo.omitTermFreqAndPositions) {
fieldInfo.storePayloads |= fields[i-start].hasPayloads;
}
}
// If this field has postings then add them to the
// segment
appendPostings(fieldName, state, fields, consumer);
for(int i=0;i<fields.length;i++) {
TermsHashPerField perField = fields[i].termsHashPerField;
int numPostings = perField.numPostings;
perField.reset();
perField.shrinkHash(numPostings);
fields[i].reset();
}
start = end;
}
for (Map.Entry<TermsHashConsumerPerThread,Collection entry : threadsAndFields.entrySet()) {
FreqProxTermsWriterPerThread perThread = (FreqProxTermsWriterPerThread) entry.getKey();
perThread.termsHashPerThread.reset(true);
}
} finally {
consumer.finish();
}
}
private byte[] payloadBuffer;
/* Walk through all unique text tokens (Posting
* instances) found in this field and serialize them
* into a single RAM segment. */
void appendPostings(String fieldName, SegmentWriteState state,
FreqProxTermsWriterPerField[] fields,
FormatPostingsFieldsConsumer consumer)
throws CorruptIndexException, IOException {
int numFields = fields.length;
final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];
for(int i=0;i<numFields;i++) {
FreqProxFieldMergeState fms = mergeStates[i] = new FreqProxFieldMergeState(fields[i]);
assert fms.field.fieldInfo == fields[0].fieldInfo;
// Should always be true
boolean result = fms.nextTerm();
assert result;
}
final FormatPostingsTermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);
final Term protoTerm = new Term(fieldName);
FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];
final boolean currentFieldOmitTermFreqAndPositions = fields[0].fieldInfo.omitTermFreqAndPositions;
final Map<Term,Integer> segDeletes;
if (state.segDeletes != null && state.segDeletes.terms.size() > 0) {
segDeletes = state.segDeletes.terms;
} else {
segDeletes = null;
}
try {
// TODO: really TermsHashPerField should take over most
// of this loop, including merge sort of terms from
// multiple threads and interacting with the
// TermsConsumer, only calling out to us (passing us the
// DocsConsumer) to handle delivery of docs/positions
while(numFields > 0) {
// Get the next term to merge
termStates[0] = mergeStates[0];
int numToMerge = 1;
// TODO: pqueue
for(int i=1;i<numFields;i++) {
final char[] text = mergeStates[i].text;
final int textOffset = mergeStates[i].textOffset;
final int cmp = compareText(text, textOffset, termStates[0].text, termStates[0].textOffset);
if (cmp < 0) {
termStates[0] = mergeStates[i];
numToMerge = 1;
} else if (cmp == 0)
termStates[numToMerge++] = mergeStates[i];
}
final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(termStates[0].text, termStates[0].textOffset);
final int delDocLimit;
if (segDeletes != null) {
final Integer docIDUpto = segDeletes.get(protoTerm.createTerm(termStates[0].termText()));
if (docIDUpto != null) {
delDocLimit = docIDUpto;
} else {
delDocLimit = 0;
}
} else {
delDocLimit = 0;
}
try {
// Now termStates has numToMerge FieldMergeStates
// which all share the same term. Now we must
// interleave the docID streams.
while(numToMerge > 0) {
FreqProxFieldMergeState minState = termStates[0];
for(int i=1;i<numToMerge;i++)
if (termStates[i].docID < minState.docID)
minState = termStates[i];
final int termDocFreq = minState.termFreq;
final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq);
// NOTE: we could check here if the docID was
// deleted, and skip it. However, this is somewhat
// dangerous because it can yield non-deterministic
// behavior since we may see the docID before we see
// the term that caused it to be deleted. This
// would mean some (but not all) of its postings may
// make it into the index, which'd alter the docFreq
// for those terms. We could fix this by doing two
// passes, ie first sweep marks all del docs, and
// 2nd sweep does the real flush, but I suspect
// that'd add too much time to flush.
if (minState.docID < delDocLimit) {
// Mark it deleted. TODO: we could also skip
// writing its postings; this would be
// deterministic (just for this Term's docs).
if (state.deletedDocs == null) {
state.deletedDocs = new BitVector(state.numDocs);
}
state.deletedDocs.set(minState.docID);
}
final ByteSliceReader prox = minState.prox;
// Carefully copy over the prox + payload info,
// changing the format to match Lucene's segment
// format.
if (!currentFieldOmitTermFreqAndPositions) {
// omitTermFreqAndPositions == false so we do write positions &
// payload
try {
int position = 0;
for(int j=0;j<termDocFreq;j++) {
final int code = prox.readVInt();
position += code >> 1;
final int payloadLength;
if ((code & 1) != 0) {
// This position has a payload
payloadLength = prox.readVInt();
if (payloadBuffer == null || payloadBuffer.length < payloadLength)
payloadBuffer = new byte[payloadLength];
prox.readBytes(payloadBuffer, 0, payloadLength);
} else
payloadLength = 0;
posConsumer.addPosition(position, payloadBuffer, 0, payloadLength);
} //End for
} finally {
posConsumer.finish();
}
}
if (!minState.nextDoc()) {
// Remove from termStates
int upto = 0;
for(int i=0;i<numToMerge;i++)
if (termStates[i] != minState)
termStates[upto++] = termStates[i];
numToMerge--;
assert upto == numToMerge;
// Advance this state to the next term
if (!minState.nextTerm()) {
// OK, no more terms, so remove from mergeStates
// as well
upto = 0;
for(int i=0;i<numFields;i++)
if (mergeStates[i] != minState)
mergeStates[upto++] = mergeStates[i];
numFields--;
assert upto == numFields;
}
}
}
} finally {
docConsumer.finish();
}
}
} finally {
termsConsumer.finish();
}
}
final UnicodeUtil.UTF8Result termsUTF8 = new UnicodeUtil.UTF8Result();
}
Other Lucene examples (source code examples)
Here is a short list of links related to this Lucene FreqProxTermsWriter.java source code file:
|