Lucene example source code file (FieldsReader.java)

This example Lucene source code file (FieldsReader.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" (TM).

The Lucene FieldsReader.java source code
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.zip.DataFormatException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.AbstractField;
import org.apache.lucene.document.CompressionTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.CloseableThreadLocal;
/**
* Class responsible for access to stored document fields.
* <p/>
* It uses the &lt;segment&gt;.fdt and &lt;segment&gt;.fdx files.
*/
final class FieldsReader implements Cloneable {
private final FieldInfos fieldInfos;
// The main fieldStream, used only for cloning.
private final IndexInput cloneableFieldsStream;
// This is a clone of cloneableFieldsStream used for reading documents.
// It should not be cloned outside of a synchronized context.
private final IndexInput fieldsStream;
private final IndexInput cloneableIndexStream;
private final IndexInput indexStream;
private int numTotalDocs;
private int size;
private boolean closed;
private final int format;
private final int formatSize;
// The docID offset where our docs begin in the index
// file. This will be 0 if we have our own private file.
private int docStoreOffset;
private CloseableThreadLocal<IndexInput> fieldsStreamTL = new CloseableThreadLocal<IndexInput>();
private boolean isOriginal = false;
/** Returns a cloned FieldsReader that shares open
* IndexInputs with the original one. It is the caller's
* job not to close the original FieldsReader until all
* clones are closed (eg, currently SegmentReader manages
* this logic). */
@Override
public Object clone() {
ensureOpen();
return new FieldsReader(fieldInfos, numTotalDocs, size, format, formatSize, docStoreOffset, cloneableFieldsStream, cloneableIndexStream);
}
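// Illustrative usage of clone() above (not part of the original file):
// SegmentReader-style code hands each consumer its own clone, so the shared
// IndexInputs are never repositioned concurrently:
//
//   FieldsReader perThread = (FieldsReader) original.clone();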
/**
* Detects the code version this segment was written with. Returns either
* "2.x" for all pre-3.0 segments, or "3.0" for 3.0 segments. This method
* should not be called for 3.1+ segments since they already record their code
* version.
*/
static String detectCodeVersion(Directory dir, String segment) throws IOException {
IndexInput idxStream = dir.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.FIELDS_INDEX_EXTENSION), 1024);
try {
int format = idxStream.readInt();
if (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) {
return "2.x";
} else {
return "3.0";
}
} finally {
idxStream.close();
}
}
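// Illustrative call to detectCodeVersion above (hypothetical Directory and
// segment name):
//
//   String version = FieldsReader.detectCodeVersion(dir, "_0"); // "2.x" or "3.0"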
// Used only by clone
private FieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, int format, int formatSize,
int docStoreOffset, IndexInput cloneableFieldsStream, IndexInput cloneableIndexStream) {
this.fieldInfos = fieldInfos;
this.numTotalDocs = numTotalDocs;
this.size = size;
this.format = format;
this.formatSize = formatSize;
this.docStoreOffset = docStoreOffset;
this.cloneableFieldsStream = cloneableFieldsStream;
this.cloneableIndexStream = cloneableIndexStream;
fieldsStream = (IndexInput) cloneableFieldsStream.clone();
indexStream = (IndexInput) cloneableIndexStream.clone();
}
FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {
this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0);
}
FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize) throws IOException {
this(d, segment, fn, readBufferSize, -1, 0);
}
FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) throws IOException {
boolean success = false;
isOriginal = true;
try {
fieldInfos = fn;
cloneableFieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.FIELDS_EXTENSION), readBufferSize);
cloneableIndexStream = d.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.FIELDS_INDEX_EXTENSION), readBufferSize);
// First version of fdx did not include a format
// header, but, the first int will always be 0 in that
// case
format = cloneableIndexStream.readInt();
if (format > FieldsWriter.FORMAT_CURRENT)
throw new CorruptIndexException("Incompatible format version: " + format + " expected "
+ FieldsWriter.FORMAT_CURRENT + " or lower");
if (format > FieldsWriter.FORMAT)
formatSize = 4;
else
formatSize = 0;
if (format < FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
cloneableFieldsStream.setModifiedUTF8StringsMode();
fieldsStream = (IndexInput) cloneableFieldsStream.clone();
final long indexSize = cloneableIndexStream.length()-formatSize;
if (docStoreOffset != -1) {
// We read only a slice out of this shared fields file
this.docStoreOffset = docStoreOffset;
this.size = size;
// Verify the file is long enough to hold all of our
// docs
assert ((int) (indexSize / 8)) >= size + this.docStoreOffset: "indexSize=" + indexSize + " size=" + size + " docStoreOffset=" + docStoreOffset;
} else {
this.docStoreOffset = 0;
this.size = (int) (indexSize >> 3);
}
indexStream = (IndexInput) cloneableIndexStream.clone();
numTotalDocs = (int) (indexSize >> 3);
success = true;
} finally {
// With lock-less commits, it's entirely possible (and
// fine) to hit a FileNotFound exception above. In
// this case, we want to explicitly close any subset
// of things that were opened so that we don't have to
// wait for a GC to do so.
if (!success) {
close();
}
}
}
/**
* @throws AlreadyClosedException if this FieldsReader is closed
*/
private void ensureOpen() throws AlreadyClosedException {
if (closed) {
throw new AlreadyClosedException("this FieldsReader is closed");
}
}
/**
* Closes the underlying {@link org.apache.lucene.store.IndexInput} streams, including any ones associated with a
* lazy implementation of a Field. This means that the Fields values will not be accessible.
*
* @throws IOException
*/
final void close() throws IOException {
if (!closed) {
if (fieldsStream != null) {
fieldsStream.close();
}
if (isOriginal) {
if (cloneableFieldsStream != null) {
cloneableFieldsStream.close();
}
if (cloneableIndexStream != null) {
cloneableIndexStream.close();
}
}
if (indexStream != null) {
indexStream.close();
}
fieldsStreamTL.close();
closed = true;
}
}
final int size() {
return size;
}
private final void seekIndex(int docID) throws IOException {
indexStream.seek(formatSize + (docID + docStoreOffset) * 8L);
}
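// Worked example for seekIndex above: the .fdx file is a formatSize-byte
// header followed by one 8-byte pointer per document. With formatSize=4 and
// docStoreOffset=0, the pointer for docID 2 starts at byte 4 + 2*8 = 20, and
// the long read there is that document's start offset in the .fdt file.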
boolean canReadRawDocs() {
// Disable reading raw docs in 2.x format, because of the removal of compressed
// fields in 3.0. We don't want rawDocs() to decode field bits to figure out
// if a field was compressed, hence we enforce ordinary (non-raw) stored field merges
// for <3.0 indexes.
return format >= FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
}
final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
seekIndex(n);
long position = indexStream.readLong();
fieldsStream.seek(position);
Document doc = new Document();
int numFields = fieldsStream.readVInt();
out: for (int i = 0; i < numFields; i++) {
int fieldNumber = fieldsStream.readVInt();
FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);
int bits = fieldsStream.readByte() & 0xFF;
assert bits <= (FieldsWriter.FIELD_IS_NUMERIC_MASK | FieldsWriter.FIELD_IS_COMPRESSED | FieldsWriter.FIELD_IS_TOKENIZED | FieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits);
boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true)
: "compressed fields are only allowed in indexes of version <= 2.9";
boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
final int numeric = bits & FieldsWriter.FIELD_IS_NUMERIC_MASK;
switch (acceptField) {
case LOAD:
addField(doc, fi, binary, compressed, tokenize, numeric);
break;
case LOAD_AND_BREAK:
addField(doc, fi, binary, compressed, tokenize, numeric);
break out; //Get out of this loop
case LAZY_LOAD:
addFieldLazy(doc, fi, binary, compressed, tokenize, true, numeric);
break;
case LATENT:
addFieldLazy(doc, fi, binary, compressed, tokenize, false, numeric);
break;
case SIZE:
skipFieldBytes(binary, compressed, addFieldSize(doc, fi, binary, compressed, numeric));
break;
case SIZE_AND_BREAK:
addFieldSize(doc, fi, binary, compressed, numeric);
break out; //Get out of this loop
default:
skipField(binary, compressed, numeric);
}
}
return doc;
}
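// Illustrative usage of doc() above (field name and reader variable are
// hypothetical): load only the "title" field eagerly and stop scanning the
// remaining fields:
//
//   Document d = reader.doc(42, new FieldSelector() {
//     public FieldSelectorResult accept(String fieldName) {
//       return "title".equals(fieldName)
//           ? FieldSelectorResult.LOAD_AND_BREAK
//           : FieldSelectorResult.NO_LOAD;
//     }
//   });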
/** Returns the length in bytes of each raw document in a
* contiguous range of length numDocs starting with
* startDocID. Returns the IndexInput (the fieldStream),
* already seeked to the starting point for startDocID.*/
final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs) throws IOException {
seekIndex(startDocID);
long startOffset = indexStream.readLong();
long lastOffset = startOffset;
int count = 0;
while (count < numDocs) {
final long offset;
final int docID = docStoreOffset + startDocID + count + 1;
assert docID <= numTotalDocs;
if (docID < numTotalDocs)
offset = indexStream.readLong();
else
offset = fieldsStream.length();
lengths[count++] = (int) (offset-lastOffset);
lastOffset = offset;
}
fieldsStream.seek(startOffset);
return fieldsStream;
}
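// Worked example for rawDocs above: if the .fdx pointers for docs 5, 6 and 7
// are 100, 160 and 220, and doc 7 is the last document (so the final boundary
// is fieldsStream.length(), say 300), then rawDocs(lengths, 5, 3) fills
// lengths with {60, 60, 80} and leaves fieldsStream seeked to offset 100.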
/**
* Skip the field. We still have to read some of the information about the field, but can skip past the actual content.
* This will have the most payoff on large fields.
*/
private void skipField(boolean binary, boolean compressed, int numeric) throws IOException {
final int numBytes;
switch(numeric) {
case 0:
numBytes = fieldsStream.readVInt();
break;
case FieldsWriter.FIELD_IS_NUMERIC_INT:
case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
numBytes = 4;
break;
case FieldsWriter.FIELD_IS_NUMERIC_LONG:
case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
numBytes = 8;
break;
default:
throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
}
skipFieldBytes(binary, compressed, numBytes);
}
private void skipFieldBytes(boolean binary, boolean compressed, int toRead) throws IOException {
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) {
fieldsStream.seek(fieldsStream.getFilePointer() + toRead);
} else {
// The stored length is in chars, not bytes, so we must decode and skip
// char by char. Slower than a seek, but cheaper than materializing the value.
fieldsStream.skipChars(toRead);
}
}
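// Example for skipFieldBytes above: in pre-UTF8 formats a stored length of 5
// means 5 chars of modified UTF-8 whose byte width is unknown until decoded,
// so skipChars(5) must walk them one by one; newer formats record the length
// in bytes, so a single seek suffices.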
private NumericField loadNumericField(FieldInfo fi, int numeric) throws IOException {
assert numeric != 0;
switch(numeric) {
case FieldsWriter.FIELD_IS_NUMERIC_INT:
return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setIntValue(fieldsStream.readInt());
case FieldsWriter.FIELD_IS_NUMERIC_LONG:
return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setLongValue(fieldsStream.readLong());
case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setFloatValue(Float.intBitsToFloat(fieldsStream.readInt()));
case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setDoubleValue(Double.longBitsToDouble(fieldsStream.readLong()));
default:
throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
}
}
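// Note: numeric values round-trip through their raw bits: a float stored by
// FieldsWriter as Float.floatToIntBits(value) is restored above via
// Float.intBitsToFloat(fieldsStream.readInt()), so no precision is lost.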
private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, boolean cacheResult, int numeric) throws IOException {
final AbstractField f;
if (binary) {
int toRead = fieldsStream.readVInt();
long pointer = fieldsStream.getFilePointer();
f = new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed, cacheResult);
//Need to move the pointer ahead by toRead positions
fieldsStream.seek(pointer + toRead);
} else if (numeric != 0) {
f = loadNumericField(fi, numeric);
} else {
Field.Store store = Field.Store.YES;
Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
if (compressed) {
int toRead = fieldsStream.readVInt();
long pointer = fieldsStream.getFilePointer();
f = new LazyField(fi.name, store, toRead, pointer, binary, compressed, cacheResult);
//skip over the part that we aren't loading
fieldsStream.seek(pointer + toRead);
} else {
int length = fieldsStream.readVInt();
long pointer = fieldsStream.getFilePointer();
//Skip ahead of where we are by the length of what is stored
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
fieldsStream.seek(pointer+length);
} else {
fieldsStream.skipChars(length);
}
f = new LazyField(fi.name, store, index, termVector, length, pointer, binary, compressed, cacheResult);
}
}
f.setOmitNorms(fi.omitNorms);
f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
doc.add(f);
}
private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, int numeric) throws CorruptIndexException, IOException {
final AbstractField f;
//we have a binary stored field, and it may be compressed
if (binary) {
int toRead = fieldsStream.readVInt();
final byte[] b = new byte[toRead];
fieldsStream.readBytes(b, 0, b.length);
if (compressed) {
f = new Field(fi.name, uncompress(b));
} else {
f = new Field(fi.name, b);
}
} else if (numeric != 0) {
f = loadNumericField(fi, numeric);
} else {
Field.Store store = Field.Store.YES;
Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
if (compressed) {
int toRead = fieldsStream.readVInt();
final byte[] b = new byte[toRead];
fieldsStream.readBytes(b, 0, b.length);
f = new Field(fi.name, // field name
false,
new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
store,
index,
termVector);
} else {
f = new Field(fi.name, // name
false,
fieldsStream.readString(), // read value
store,
index,
termVector);
}
}
f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
f.setOmitNorms(fi.omitNorms);
doc.add(f);
}
// Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes)
// Read just the size -- caller must skip the field content to continue reading fields
// Return the size in bytes or chars, depending on field type
private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed, int numeric) throws IOException {
final int bytesize, size;
switch(numeric) {
case 0:
size = fieldsStream.readVInt();
bytesize = (binary || compressed) ? size : 2*size;
break;
case FieldsWriter.FIELD_IS_NUMERIC_INT:
case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
size = bytesize = 4;
break;
case FieldsWriter.FIELD_IS_NUMERIC_LONG:
case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
size = bytesize = 8;
break;
default:
throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
}
byte[] sizebytes = new byte[4];
sizebytes[0] = (byte) (bytesize>>>24);
sizebytes[1] = (byte) (bytesize>>>16);
sizebytes[2] = (byte) (bytesize>>> 8);
sizebytes[3] = (byte) bytesize;
doc.add(new Field(fi.name, sizebytes));
return size;
}
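// Worked example for addFieldSize above: for a plain text field whose stored
// length is 12, bytesize = 2*12 = 24 (each char counted as two bytes), so the
// synthetic size field added to the document holds the big-endian bytes
// {0x00, 0x00, 0x00, 0x18}.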
/**
* A Lazy implementation of Fieldable that defers loading of fields until asked for, instead of when the Document is
* loaded.
*/
private class LazyField extends AbstractField implements Fieldable {
private int toRead;
private long pointer;
/** @deprecated Only kept for backward-compatibility with &lt;3.0 indexes. Will be removed in 4.0. */
@Deprecated
private boolean isCompressed;
private boolean cacheResult;
public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary, boolean isCompressed, boolean cacheResult) {
super(name, store, Field.Index.NO, Field.TermVector.NO);
this.toRead = toRead;
this.pointer = pointer;
this.isBinary = isBinary;
this.cacheResult = cacheResult;
if (isBinary)
binaryLength = toRead;
lazy = true;
this.isCompressed = isCompressed;
}
public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary, boolean isCompressed, boolean cacheResult) {
super(name, store, index, termVector);
this.toRead = toRead;
this.pointer = pointer;
this.isBinary = isBinary;
this.cacheResult = cacheResult;
if (isBinary)
binaryLength = toRead;
lazy = true;
this.isCompressed = isCompressed;
}
private IndexInput getFieldStream() {
IndexInput localFieldsStream = fieldsStreamTL.get();
if (localFieldsStream == null) {
localFieldsStream = (IndexInput) cloneableFieldsStream.clone();
fieldsStreamTL.set(localFieldsStream);
}
return localFieldsStream;
}
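// getFieldStream above exists because an IndexInput carries a file position
// and must not be shared across threads: each thread works on its own clone,
// cached in fieldsStreamTL and released when the reader's close() is called.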
/** The value of the field as a Reader, or null. If null, the String value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
public Reader readerValue() {
ensureOpen();
return null;
}
/** The value of the field as a TokenStream, or null. If null, the Reader value,
* String value, or binary value is used. Exactly one of stringValue(),
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
public TokenStream tokenStreamValue() {
ensureOpen();
return null;
}
/** The value of the field as a String, or null. If null, the Reader value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
public String stringValue() {
ensureOpen();
if (isBinary)
return null;
else {
if (fieldsData == null) {
IndexInput localFieldsStream = getFieldStream();
String value;
try {
localFieldsStream.seek(pointer);
if (isCompressed) {
final byte[] b = new byte[toRead];
localFieldsStream.readBytes(b, 0, b.length);
value = new String(uncompress(b), "UTF-8");
} else {
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
byte[] bytes = new byte[toRead];
localFieldsStream.readBytes(bytes, 0, toRead);
value = new String(bytes, "UTF-8");
} else {
//read in chars b/c we already know the length we need to read
char[] chars = new char[toRead];
localFieldsStream.readChars(chars, 0, toRead);
value = new String(chars);
}
}
} catch (IOException e) {
throw new FieldReaderException(e);
}
if (cacheResult){
fieldsData = value;
}
return value;
} else{
return (String) fieldsData;
}
}
}
public long getPointer() {
ensureOpen();
return pointer;
}
public void setPointer(long pointer) {
ensureOpen();
this.pointer = pointer;
}
public int getToRead() {
ensureOpen();
return toRead;
}
public void setToRead(int toRead) {
ensureOpen();
this.toRead = toRead;
}
@Override
public byte[] getBinaryValue(byte[] result) {
ensureOpen();
if (isBinary) {
if (fieldsData == null) {
// Allocate new buffer if result is null or too small
final byte[] b;
byte[] value;
if (result == null || result.length < toRead)
b = new byte[toRead];
else
b = result;
IndexInput localFieldsStream = getFieldStream();
// Throw this IOException since IndexReader.document does so anyway, so probably not that big of a change for people
// since they are already handling this exception when getting the document
try {
localFieldsStream.seek(pointer);
localFieldsStream.readBytes(b, 0, toRead);
if (isCompressed) {
value = uncompress(b);
} else {
value = b;
}
} catch (IOException e) {
throw new FieldReaderException(e);
}
binaryOffset = 0;
binaryLength = toRead;
if (cacheResult) {
fieldsData = value;
}
return value;
} else{
return (byte[]) fieldsData;
}
} else {
return null;
}
}
}
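// Illustrative flow for LazyField above (hypothetical field name): a document
// loaded with FieldSelectorResult.LAZY_LOAD holds LazyField instances, and
// nothing is read from the .fdt file until a value accessor is called:
//
//   Fieldable body = d.getFieldable("body"); // cheap: only name and pointer
//   String text = body.stringValue();        // the bytes are read here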
private byte[] uncompress(byte[] b)
throws CorruptIndexException {
try {
return CompressionTools.decompress(b);
} catch (DataFormatException e) {
// this will happen if the field is not compressed
CorruptIndexException newException = new CorruptIndexException("field data are in wrong format: " + e.toString());
newException.initCause(e);
throw newException;
}
}
}