alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (HTMLParser.jj)

This example Lucene source code file (HTMLParser.jj) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

a, argquote1, io, ioexception, ioexception, let, mypipedinputstream, num, string, string, summary_length, token, token, util, withintag, withintag

The Lucene HTMLParser.jj source code

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// HTMLParser.jj

// JavaCC generator options for this grammar.
options {
  // Generate instance (non-static) parser members; the parser class below
  // keeps all of its per-document state in instance fields.
  STATIC = false;
  //DEBUG_LOOKAHEAD = true;
  //DEBUG_TOKEN_MANAGER = true;
  // Treat the input as Unicode characters instead of assuming ASCII
  // (see the JavaCC option reference for the exact effect).
  UNICODE_INPUT = true;
}

PARSER_BEGIN(HTMLParser)

package org.apache.lucene.benchmark.byTask.feeds.demohtml;

import java.io.*;
import java.util.Locale;
import java.util.Properties;

/**
 * Hand-written part of the HTML parser generated from this grammar.
 *
 * <p>{@link #getReader()} starts a {@code ParserThread} for this parser and
 * returns the reading end of a pipe. The grammar actions call
 * {@link #addText(String)} and {@link #addSpace()}, which write the plain text
 * of the document to the writing end of that pipe and, on the side, collect the
 * title, a summary and the meta tags. The {@code get*} methods block until the
 * value they return is available or the pipe buffer has filled up.
 */
public class HTMLParser {
  /** Number of characters after which the summary is considered complete. */
  public static int SUMMARY_LENGTH = 200;

  // StringBuffer rather than StringBuilder: these buffers are appended to by
  // the grammar actions and read by the getters below.
  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  Properties metaTags = new Properties();
  String currentMetaTag = null;      // pending meta key, consumed by addMetaTag()
  String currentMetaContent = null;  // pending meta value, consumed by addMetaTag()
  int length = 0;                    // number of characters written to pipeOut
  boolean titleComplete = false;     // set once text arrives after a non-empty title
  boolean inTitle = false;
  boolean inMetaTag = false;
  boolean inStyle = false;           // addText() drops text while this is set
  boolean afterTag = false;          // makes addSpace() emit eol instead of " "
  boolean afterSpace = false;        // a space was just emitted; collapse further ones
  String eol = System.getProperty("line.separator");
  Reader pipeIn = null;
  Writer pipeOut;
  private MyPipedInputStream pipeInStream = null;
  private PipedOutputStream pipeOutStream = null;

  /**
   * Piped input stream that can report whether its buffer has filled up.
   * Static because it never touches the enclosing parser instance.
   */
  private static class MyPipedInputStream extends PipedInputStream {

    public MyPipedInputStream() {
      super();
    }

    public MyPipedInputStream(PipedOutputStream src) throws IOException {
      super(src);
    }

    /** Returns true when at least {@code PIPE_SIZE} bytes are waiting to be read. */
    public boolean full() throws IOException {
      return this.available() >= PipedInputStream.PIPE_SIZE;
    }
  }

  /**
   * Starts parsing if necessary, then blocks until the title is complete or the
   * pipe buffer is full (in which case nobody is draining the reader and
   * waiting any longer would not make progress).
   */
  private void awaitTitle() throws IOException, InterruptedException {
    if (pipeIn == null) {
      getReader();  // spawn parsing thread
    }
    synchronized (this) {
      // Timed wait: the pipe filling up is not signalled with notifyAll(),
      // so the condition has to be polled.
      while (!titleComplete && !pipeInStream.full()) {
        wait(10);
      }
    }
  }

  /**
   * Returns the trimmed document title, starting the parser if needed.
   *
   * @throws IOException if the pipe cannot be set up or queried
   * @throws InterruptedException if interrupted while waiting for the title
   */
  public String getTitle() throws IOException, InterruptedException {
    awaitTitle();
    return title.toString().trim();
  }

  /**
   * Returns the meta tags collected so far, starting the parser if needed.
   * Waits on the same condition as {@link #getTitle()}.
   *
   * @throws IOException if the pipe cannot be set up or queried
   * @throws InterruptedException if interrupted while waiting
   */
  public Properties getMetaTags() throws IOException, InterruptedException {
    awaitTitle();
    return metaTags;
  }

  /**
   * Returns the trimmed summary, cut to at most {@link #SUMMARY_LENGTH}
   * characters before trimming, or the title when the summary is empty.
   *
   * @throws IOException if the pipe cannot be set up or queried
   * @throws InterruptedException if interrupted while waiting
   */
  public String getSummary() throws IOException, InterruptedException {
    if (pipeIn == null) {
      getReader();  // spawn parsing thread
    }
    synchronized (this) {
      while (summary.length() < SUMMARY_LENGTH && !pipeInStream.full()) {
        wait(10);
      }
    }
    if (summary.length() > SUMMARY_LENGTH) {
      summary.setLength(SUMMARY_LENGTH);
    }

    String sum = summary.toString().trim();
    // getTitle() is called unconditionally, so this method also waits for
    // the title even when the summary is non-empty.
    String tit = getTitle();
    return sum.equals("") ? tit : sum;
  }

  /**
   * Returns the reader for the parsed text. The first call creates the pipe
   * and starts the parsing thread; later calls return the same reader.
   *
   * @throws IOException if the pipe or its reader/writer cannot be created
   */
  public Reader getReader() throws IOException {
    if (pipeIn == null) {
      pipeInStream = new MyPipedInputStream();
      pipeOutStream = new PipedOutputStream(pipeInStream);
      // Both ends use the same fixed-width encoding.
      pipeIn = new InputStreamReader(pipeInStream, "UTF-16BE");
      pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-16BE");

      Thread thread = new ParserThread(this);
      thread.start();  // start parsing
    }

    return pipeIn;
  }

  /**
   * Appends text to the summary while it is shorter than
   * {@link #SUMMARY_LENGTH}, and wakes up waiting threads once that length is
   * reached.
   */
  void addToSummary(String text) {
    if (summary.length() >= SUMMARY_LENGTH) {
      return;
    }
    summary.append(text);
    if (summary.length() >= SUMMARY_LENGTH) {
      synchronized (this) {
        notifyAll();
      }
    }
  }

  /**
   * Handles a piece of document text: dropped inside a style element, appended
   * to the title inside a title element, otherwise added to the summary. Text
   * that is not dropped is also written to the pipe.
   *
   * @throws IOException if writing to the pipe fails
   */
  void addText(String text) throws IOException {
    if (inStyle) {
      return;
    }
    if (inTitle) {
      title.append(text);
    } else {
      addToSummary(text);
      if (!titleComplete && title.length() != 0) {  // finished title
        synchronized (this) {
          titleComplete = true;  // tell waiting threads
          notifyAll();
        }
      }
    }

    length += text.length();
    pipeOut.write(text);

    afterSpace = false;
  }

  /**
   * Stores the pending meta tag name/content pair and clears both fields.
   * Does nothing while either half is still missing.
   */
  void addMetaTag() {
    if (currentMetaTag == null || currentMetaContent == null) {
      // Properties.setProperty() throws NullPointerException on a null key
      // or value; leave the pending fields untouched instead.
      return;
    }
    metaTags.setProperty(currentMetaTag, currentMetaContent);
    currentMetaTag = null;
    currentMetaContent = null;
  }

  /**
   * Emits a single separator unless one was just emitted: a space goes to the
   * title or summary, and either the line separator (after a tag) or a space
   * goes to the pipe.
   *
   * @throws IOException if writing to the pipe fails
   */
  void addSpace() throws IOException {
    if (afterSpace) {
      return;
    }
    if (inTitle) {
      title.append(" ");
    } else {
      addToSummary(" ");
    }

    String space = afterTag ? eol : " ";
    length += space.length();
    pipeOut.write(space);
    afterSpace = true;
  }

//    void handleException(Exception e) {
//      System.out.println(e.toString());  // print the error message
//      System.out.println("Skipping...");
//      Token t;
//      do {
//        t = getNextToken();
//      } while (t.kind != TagEnd);
//    }
}

PARSER_END(HTMLParser)


// Top-level production: consumes the whole document up to <EOF>.
//
// Markup (tags, declarations, comments, scripts) produces no text; it only
// sets afterTag so that the next addSpace() writes a line separator to the
// pipe instead of a blank. Word, Entity and Punct tokens are forwarded to
// addText() (entities are first passed through Entities.decode), and runs of
// whitespace go to addSpace(); each of these clears afterTag again.
void HTMLDocument() throws IOException :
{
  Token t;
}
{
//  try {
    ( Tag()         { afterTag = true; }
    | t=Decl()      { afterTag = true; }  // returned token is not used here
    | CommentTag()  { afterTag = true; }
    | ScriptTag()  { afterTag = true; }
    | t=<Word>      { addText(t.image); afterTag = false; }
    | t=<Entity>    { addText(Entities.decode(t.image)); afterTag = false; }
    | t=<Punct>     { addText(t.image); afterTag = false; }
    | <Space>       { addSpace(); afterTag = false; }
    )* <EOF>
//  } catch (ParseException e) {
//    handleException(e);
//  }
}

// Tag production: starts at a <TagName> token. The visible action lower-cases
// the tag name (Locale.ENGLISH), calls addSpace() when Tags.WS_ELEMS contains
// it, and records in inTitle whether the tag name is "<title".
//
// NOTE(review): this production is incomplete as it stands. The action block
// opened after t1=<TagName> is never closed, t2 and inImg are declared but
// never used, and the alternatives that follow return a Token named t that
// Tag() does not declare. They presumably belong to a separate ArgValue()
// production (Decl() calls one) whose header is missing. It looks like
// everything between an opening and a closing angle bracket was stripped when
// this listing was extracted, including the rest of this line's comment, the
// remainder of Tag() and the start of ArgValue(), and likewise the token
// references after "t=" below. TODO: restore from the original grammar file.
void Tag() throws IOException :
{
  Token t1, t2;
  boolean inImg = false;
}
{
  t1=<TagName> {
   String tagName = t1.image.toLowerCase(Locale.ENGLISH);
   if(Tags.WS_ELEMS.contains(tagName) ) {
      addSpace();
    }
    inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in                  { return t; }
| <ArgQuote1> t=   { return t; }
| LOOKAHEAD(2)
  <ArgQuote2>                  { return t; }
| <ArgQuote2> t=   { return t; }
}


// Declaration production: a <DeclName> token followed by any mix of argument
// names, argument values and equals signs, up to the closing <TagEnd>.
// Returns the <DeclName> token.
//
// NOTE(review): the listing had two empty alternatives around ArgValue() and
// no terminator, which is not a valid expansion (the loop body could match
// the empty string). The <ArgName>, <ArgEquals> and <TagEnd> references are
// restored here on the assumption that they were stripped with the other
// angle-bracketed text; confirm the names against the WithinTag token
// declarations.
Token Decl() :
{
  Token t;
}
{
  t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
  { return t; }
}


// Comment production: either comment form, i.e. its opening token, any
// amount of comment text, and the matching end token. Produces no output.
//
// NOTE(review): the listing had empty "( )*" loops and no end tokens, which
// is not a valid expansion. The <CommentText1>/<CommentEnd1> and
// <CommentText2>/<CommentEnd2> references are restored on the assumption that
// they were stripped with the other angle-bracketed text; confirm the names
// against the comment lexical-state token declarations.
void CommentTag() :
{}
{
  (<Comment1> ( <CommentText1> )* <CommentEnd1>)
 |
  (<Comment2> ( <CommentText2> )* <CommentEnd2>)
}

// Script production: <ScriptStart>, any amount of script text, and the script
// end token. The script body is skipped; nothing is passed to addText().
//
// NOTE(review): the listing had an empty "( )*" loop and no end token, which
// is not a valid expansion. The <ScriptText> and <ScriptEnd> references are
// restored on the assumption that they were stripped with the other
// angle-bracketed text; confirm the names against the script lexical-state
// token declarations.
void ScriptTag() :
{}
{
  <ScriptStart> ( <ScriptText> )* <ScriptEnd>
}


TOKEN :
{
  < ScriptStart: " : WithinTag
| < DeclName: "<"  "!"   ["A"-"Z","a"-"z"] ( : WithinTag

| < Comment1:  "