alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (EnwikiContentSource.java)

This example Lucene source code file (EnwikiContentSource.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

body, date, id, io, ioexception, ioexception, nomoredataexception, nomoredataexception, override, override, page, sax, string, string, stringbuilder, title, util

The Lucene EnwikiContentSource.java source code

package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.ThreadInterruptedException;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * A {@link ContentSource} which reads the English Wikipedia dump. You can read
 * the .bz2 file directly (it will be decompressed on the fly). Config
 * properties:
 * <ul>
 * <li>keep.image.only.docs=false|true (default true).
 * <li>docs.file=<path to the file>
 * </ul>
 */
public class EnwikiContentSource extends ContentSource {

  private class Parser extends DefaultHandler implements Runnable {
    private Thread t;
    private boolean threadDone;
    private String[] tuple;
    private NoMoreDataException nmde;
    private StringBuilder contents = new StringBuilder();
    private String title;
    private String body;
    private String time;
    private String id;
    
    String[] next() throws NoMoreDataException {
      if (t == null) {
        threadDone = false;
        t = new Thread(this);
        t.setDaemon(true);
        t.start();
      }
      String[] result;
      synchronized(this){
        while(tuple == null && nmde == null && !threadDone) {
          try {
            wait();
          } catch (InterruptedException ie) {
            throw new ThreadInterruptedException(ie);
          }
        }
        if (nmde != null) {
          // Set to null so we will re-start thread in case
          // we are re-used:
          t = null;
          throw nmde;
        }
        if (t != null && threadDone) {
          // The thread has exited yet did not hit end of
          // data, so this means it hit an exception.  We
          // throw NoMorDataException here to force
          // benchmark to stop the current alg:
          throw new NoMoreDataException();
        }
        result = tuple;
        tuple = null;
        notify();
      }
      return result;
    }
    
    String time(String original) {
      StringBuilder buffer = new StringBuilder();

      buffer.append(original.substring(8, 10));
      buffer.append('-');
      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
      buffer.append('-');
      buffer.append(original.substring(0, 4));
      buffer.append(' ');
      buffer.append(original.substring(11, 19));
      buffer.append(".000");

      return buffer.toString();
    }
    
    @Override
    public void characters(char[] ch, int start, int length) {
      contents.append(ch, start, length);
    }

    @Override
    public void endElement(String namespace, String simple, String qualified)
      throws SAXException {
      int elemType = getElementType(qualified);
      switch (elemType) {
        case PAGE:
          // the body must be null and we either are keeping image docs or the
          // title does not start with Image:
          if (body != null && (keepImages || !title.startsWith("Image:"))) {
            String[] tmpTuple = new String[LENGTH];
            tmpTuple[TITLE] = title.replace('\t', ' ');
            tmpTuple[DATE] = time.replace('\t', ' ');
            tmpTuple[BODY] = body.replaceAll("[\t\n]", " ");
            tmpTuple[ID] = id;
            synchronized(this) {
              while (tuple != null) {
                try {
                  wait();
                } catch (InterruptedException ie) {
                  throw new ThreadInterruptedException(ie);
                }
              }
              tuple = tmpTuple;
              notify();
            }
          }
          break;
        case BODY:
          body = contents.toString();
          //workaround that startswith doesn't have an ignore case option, get at least 20 chars.
          String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
          if (startsWith.startsWith("#redirect")) {
            body = null;
          }
          break;
        case DATE:
          time = time(contents.toString());
          break;
        case TITLE:
          title = contents.toString();
          break;
        case ID:
          //the doc id is the first one in the page.  All other ids after that one can be ignored according to the schema
          if (id == null) {
            id = contents.toString();
          }
          break;
        default:
          // this element should be discarded.
      }
    }

    public void run() {

      try {
        XMLReader reader = XMLReaderFactory.createXMLReader();
        reader.setContentHandler(this);
        reader.setErrorHandler(this);
        while(true){
          final InputStream localFileIS = is;
          try {
            reader.parse(new InputSource(localFileIS));
          } catch (IOException ioe) {
            synchronized(EnwikiContentSource.this) {
              if (localFileIS != is) {
                // fileIS was closed on us, so, just fall
                // through
              } else
                // Exception is real
                throw ioe;
            }
          }
          synchronized(this) {
            if (!forever) {
              nmde = new NoMoreDataException();
              notify();
              return;
            } else if (localFileIS == is) {
              // If file is not already re-opened then re-open it now
              is = StreamUtils.inputStream(file);
            }
          }
        }
      } catch (SAXException sae) {
        throw new RuntimeException(sae);
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      } finally {
        synchronized(this) {
          threadDone = true;
          notify();
        }
      }
    }

    @Override
    public void startElement(String namespace, String simple, String qualified,
                             Attributes attributes) {
      int elemType = getElementType(qualified);
      switch (elemType) {
        case PAGE:
          title = null;
          body = null;
          time = null;
          id = null;
          break;
        // intentional fall-through.
        case BODY:
        case DATE:
        case TITLE:
        case ID:
          contents.setLength(0);
          break;
        default:
          // this element should be discarded.
      }
    }
  }

  private static final Map<String,Integer> ELEMENTS = new HashMap();
  private static final int TITLE = 0;
  private static final int DATE = TITLE + 1;
  private static final int BODY = DATE + 1;
  private static final int ID = BODY + 1;
  private static final int LENGTH = ID + 1;
  // LENGTH is used as the size of the tuple, so whatever constants we need that
  // should not be part of the tuple, we should define them after LENGTH.
  private static final int PAGE = LENGTH + 1;

  private static final String[] months = {"JAN", "FEB", "MAR", "APR",
                                  "MAY", "JUN", "JUL", "AUG",
                                  "SEP", "OCT", "NOV", "DEC"};

  static {
    ELEMENTS.put("page", Integer.valueOf(PAGE));
    ELEMENTS.put("text", Integer.valueOf(BODY));
    ELEMENTS.put("timestamp", Integer.valueOf(DATE));
    ELEMENTS.put("title", Integer.valueOf(TITLE));
    ELEMENTS.put("id", Integer.valueOf(ID));
  }
  
  /**
   * Returns the type of the element if defined, otherwise returns -1. This
   * method is useful in startElement and endElement, by not needing to compare
   * the element qualified name over and over.
   */
  private final static int getElementType(String elem) {
    Integer val = ELEMENTS.get(elem);
    return val == null ? -1 : val.intValue();
  }
  
  private File file;
  private boolean keepImages = true;
  private InputStream is;
  private Parser parser = new Parser();
  
  @Override
  public void close() throws IOException {
    synchronized (EnwikiContentSource.this) {
      if (is != null) {
        is.close();
        is = null;
      }
    }
  }
  
  @Override
  public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    String[] tuple = parser.next();
    docData.clear();
    docData.setName(tuple[ID]);
    docData.setBody(tuple[BODY]);
    docData.setDate(tuple[DATE]);
    docData.setTitle(tuple[TITLE]);
    return docData;
  }

  @Override
  public void resetInputs() throws IOException {
    super.resetInputs();
    is = StreamUtils.inputStream(file);
  }
  
  @Override
  public void setConfig(Config config) {
    super.setConfig(config);
    keepImages = config.get("keep.image.only.docs", true);
    String fileName = config.get("docs.file", null);
    if (fileName == null) {
      throw new IllegalArgumentException("docs.file must be set");
    }
    file = new File(fileName).getAbsoluteFile();
  }
  
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene EnwikiContentSource.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.