|
What this is
This file is included in the DevDaily.com "Java Source Code Warehouse" project.
The intent of this project is to help you "Learn Java by Example" (TM).
Other links
The source code
package org.apache.lucene.index;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.util.StringHelper;
import java.io.IOException;
import java.util.Vector;
/**
 * Writes the term vectors of the documents of one segment.
 *
 * <p>Three files are produced per segment:
 * <ul>
 *   <li><code>.tvx</code> - one long per document: the position of that
 *       document's record in the <code>.tvd</code> file;</li>
 *   <li><code>.tvd</code> - per document: the number of fields, the field
 *       numbers (delta encoded) and the position of each field's data in the
 *       <code>.tvf</code> file (delta encoded);</li>
 *   <li><code>.tvf</code> - per field: the terms (prefix compressed against the
 *       previous term) and their frequencies.</li>
 * </ul>
 * Each file starts with {@link #FORMAT_VERSION} written as an int.
 *
 * <p>The writer works by opening a document, then opening the fields within
 * the document and writing out the vectors for each field.
 *
 * <p>Rough usage:
 * <pre>
 * for each document
 * {
 *   writer.openDocument();
 *   for each field on the document
 *   {
 *     writer.openField(field);
 *     for all of the terms
 *     {
 *       writer.addTerm(...)
 *     }
 *     writer.closeField();
 *   }
 *   writer.closeDocument();
 * }
 * writer.close();
 * </pre>
 */
final class TermVectorsWriter {
  public static final int FORMAT_VERSION = 1;
  // The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
  public static final int FORMAT_SIZE = 4;

  //TODO: Figure out how to write with or w/o position information and read back in
  public static final String TVX_EXTENSION = ".tvx";
  public static final String TVD_EXTENSION = ".tvd";
  public static final String TVF_EXTENSION = ".tvf";

  private OutputStream tvx = null, tvd = null, tvf = null;
  /** TVField entries of the fields already closed in the current document. */
  private Vector fields = null;
  /** TVTerm entries buffered for the currently open field. */
  private Vector terms = null;
  private FieldInfos fieldInfos;

  /** The field currently being filled, or null when no field is open. */
  private TVField currentField = null;
  /** Position in the tvd file of the open document's record, or -1 when no document is open. */
  private long currentDocPointer = -1;

  /**
   * Create a term vectors writer for the specified segment in the specified
   * directory. A new TermVectorsWriter should be created for each segment.
   *
   * @param directory  directory in which the three term vector files are created
   * @param segment    segment name; the file names are this name plus the
   *                   TVX/TVD/TVF extension
   * @param fieldInfos used to map field names to field numbers; its size is
   *                   used as the initial capacity of the per-document field list
   * @throws IOException if a file cannot be created or its format header cannot
   *                     be written; streams opened so far are closed before the
   *                     exception propagates
   */
  public TermVectorsWriter(Directory directory, String segment,
                           FieldInfos fieldInfos)
    throws IOException {
    // Set up the in-memory state first so that nothing but I/O can fail once
    // files have been opened.
    this.fieldInfos = fieldInfos;
    fields = new Vector(fieldInfos.size());
    terms = new Vector();

    // Open files for TermVector storage
    boolean success = false;
    try {
      tvx = directory.createFile(segment + TVX_EXTENSION);
      tvx.writeInt(FORMAT_VERSION);
      tvd = directory.createFile(segment + TVD_EXTENSION);
      tvd.writeInt(FORMAT_VERSION);
      tvf = directory.createFile(segment + TVF_EXTENSION);
      tvf.writeInt(FORMAT_VERSION);
      success = true;
    } finally {
      if (!success) {
        // Do not leak the streams that were opened before the failure. The
        // results are ignored on purpose: the exception that got us here is
        // the one the caller needs to see.
        closeStream(tvx, null);
        closeStream(tvd, null);
        closeStream(tvf, null);
      }
    }
  }

  /**
   * Start processing a document. If a document was previously open, it is
   * closed (and written) automatically.
   */
  public final void openDocument()
    throws IOException {
    closeDocument();
    currentDocPointer = tvd.getFilePointer();
  }

  /**
   * Finish the current document: closes any open field, writes the document
   * record and resets the per-document state. Does nothing when no document
   * is open.
   */
  public final void closeDocument()
    throws IOException {
    if (isDocumentOpen()) {
      closeField();
      writeDoc();
      fields.clear();
      currentDocPointer = -1;
    }
  }

  /** Return true if a document is currently open. */
  public final boolean isDocumentOpen() {
    return currentDocPointer != -1;
  }

  /**
   * Start processing a field. This can be followed by a number of calls to
   * addTerm, and a final call to closeField to indicate the end of
   * processing of this field. If a field was previously open, it is
   * closed automatically.
   *
   * @param field name of the field; it is translated to a field number through
   *              the FieldInfos given to the constructor
   * @throws IllegalStateException if no document is open
   */
  public final void openField(String field)
    throws IOException {
    if (!isDocumentOpen()) throw new IllegalStateException("Cannot open field when no document is open.");

    closeField();
    currentField = new TVField(fieldInfos.fieldNumber(field));
  }

  /**
   * Finished processing current field. This should be followed by a call to
   * openField before future calls to addTerm. Does nothing when no field is
   * open.
   */
  public final void closeField()
    throws IOException {
    if (isFieldOpen()) {
      // save field and terms
      writeField();
      fields.add(currentField);
      terms.clear();
      currentField = null;
    }
  }

  /** Return true if a field is currently open. */
  public final boolean isFieldOpen() {
    return currentField != null;
  }

  /**
   * Add a term to the open field's term vector. Terms should be added in
   * increasing order of terms, one call per unique term.
   *
   * @param termText text of the term
   * @param freq     number of times this term appears in this field, in this
   *                 document
   * @throws IllegalStateException if no document or no field is open
   */
  public final void addTerm(String termText, int freq) {
    if (!isDocumentOpen()) throw new IllegalStateException("Cannot add terms when document is not open");
    if (!isFieldOpen()) throw new IllegalStateException("Cannot add terms when field is not open");

    addTermInternal(termText, freq);
  }

  /** Buffers one term for the open field and adds its frequency to the field total. */
  private void addTermInternal(String termText, int freq) {
    currentField.length += freq;
    TVTerm term = new TVTerm();
    term.termText = termText;
    term.freq = freq;
    terms.add(term);
  }

  /**
   * Add the specified vectors to the document, one field per vector.
   *
   * @throws IllegalStateException if no document is open or a field is open
   */
  public final void addVectors(TermFreqVector[] vectors)
    throws IOException {
    if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vectors when document is not open");
    if (isFieldOpen()) throw new IllegalStateException("Cannot add term vectors when field is open");

    for (int i = 0; i < vectors.length; i++) {
      addTermFreqVector(vectors[i]);
    }
  }

  /**
   * Add the specified vector to the document. Document must be open but no
   * field should be open or an exception is thrown. The same document can have
   * addTerm and addVectors calls mixed, however a given field must either be
   * populated with addTerm or with addVector.
   *
   * @throws IllegalStateException if no document is open or a field is open
   */
  public final void addTermFreqVector(TermFreqVector vector)
    throws IOException {
    if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vector when document is not open");
    if (isFieldOpen()) throw new IllegalStateException("Cannot add term vector when field is open");
    addTermFreqVectorInternal(vector);
  }

  /** Opens the vector's field, copies all of its terms and frequencies, and closes the field. */
  private void addTermFreqVectorInternal(TermFreqVector vector)
    throws IOException {
    openField(vector.getField());

    // Fetch the arrays once instead of once per term.
    final int count = vector.size();
    final String[] termTexts = vector.getTerms();
    final int[] freqs = vector.getTermFrequencies();
    for (int i = 0; i < count; i++) {
      addTermInternal(termTexts[i], freqs[i]);
    }

    closeField();
  }

  /**
   * Close all streams. An open document is written first. An effort is made to
   * close every stream even when something fails along the way; the first
   * IOException encountered (from writing the pending document or from closing
   * a stream) is the one that is thrown, with its original stack trace.
   */
  final void close() throws IOException {
    IOException first = null;
    try {
      closeDocument();
    } catch (IOException e) {
      first = e;
    } finally {
      // Runs for unchecked exceptions too, so the streams are always closed.
      first = closeStream(tvx, first);
      first = closeStream(tvd, first);
      first = closeStream(tvf, first);
    }
    // Thrown outside the finally block so that an exception already
    // propagating from closeDocument() is never replaced.
    if (first != null) throw first;
  }

  /**
   * Closes the stream if it is not null.
   *
   * @param stream stream to close, may be null
   * @param first  the first IOException seen so far, or null
   * @return <code>first</code> if it is not null, otherwise the IOException
   *         thrown while closing, otherwise null
   */
  private static IOException closeStream(OutputStream stream, IOException first) {
    if (stream != null) {
      try {
        stream.close();
      } catch (IOException e) {
        if (first == null) first = e;
      }
    }
    return first;
  }

  /**
   * Writes the buffered terms of the current field to the tvf file and records
   * in the field where its data starts.
   */
  private void writeField() throws IOException {
    // remember where this field is written
    currentField.tvfPointer = tvf.getFilePointer();

    final int size = terms.size();
    tvf.writeVInt(size);
    // sum of all term frequencies minus the number of terms
    tvf.writeVInt(currentField.length - size);

    String lastTermText = "";
    // each term is written as a delta against the previous term, followed by its frequency
    for (int i = 0; i < size; i++) {
      TVTerm term = (TVTerm) terms.elementAt(i);
      int start = StringHelper.stringDifference(lastTermText, term.termText);
      int length = term.termText.length() - start;
      tvf.writeVInt(start);                         // write shared prefix length
      tvf.writeVInt(length);                        // write delta length
      tvf.writeChars(term.termText, start, length); // write delta chars
      tvf.writeVInt(term.freq);
      lastTermText = term.termText;
    }
  }

  /**
   * Writes the index entry (tvx) and the data record (tvd) of the current
   * document.
   *
   * @throws IllegalStateException if a field is still open
   */
  private void writeDoc() throws IOException {
    if (isFieldOpen()) throw new IllegalStateException("Field is still open while writing document");

    // write document index record
    tvx.writeLong(currentDocPointer);

    // write document data record, starting with the number of fields
    final int size = fields.size();
    tvd.writeVInt(size);

    // write field numbers, each as the difference to the previous one
    int lastFieldNumber = 0;
    for (int i = 0; i < size; i++) {
      TVField field = (TVField) fields.elementAt(i);
      tvd.writeVInt(field.number - lastFieldNumber);
      lastFieldNumber = field.number;
    }

    // write field pointers, each as the difference to the previous one
    long lastFieldPointer = 0;
    for (int i = 0; i < size; i++) {
      TVField field = (TVField) fields.elementAt(i);
      tvd.writeVLong(field.tvfPointer - lastFieldPointer);
      lastFieldPointer = field.tvfPointer;
    }
  }

  /** Per-field bookkeeping collected while a document is open. */
  private static class TVField {
    int number;          // field number as returned by FieldInfos.fieldNumber
    long tvfPointer = 0; // where this field's data starts in the tvf file
    int length = 0;      // sum of the frequencies of the terms added to this field

    TVField(int number) {
      this.number = number;
    }
  }

  /** One buffered term of the open field. */
  private static class TVTerm {
    String termText;
    int freq = 0;
    //int positions[] = null;
  }
}
|