alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (PatternParser.java)

This example Lucene source code file (PatternParser.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

arraylist, arraylist, elem_classes, elem_exceptions, elem_hyphen, elem_patterns, hyphenationexception, hyphenationexception, io, net, network, override, parser, patternparser, sax, string, string, stringbuilder, stringbuilder, xml

The Lucene PatternParser.java source code

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.compound.hyphenation;

// SAX
import org.xml.sax.XMLReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.Attributes;

// Java
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;

import javax.xml.parsers.SAXParserFactory;

/**
 * A SAX document handler to read and parse hyphenation patterns from an XML
 * file.
 * 
 * Character data is split into whitespace-separated words; depending on the
 * element currently being read, each word is reported to the configured
 * {@link PatternConsumer} as a character class, a hyphenation exception or a
 * pattern.
 * 
 * This class has been taken from the Apache FOP project
 * (http://xmlgraphics.apache.org/fop/). It has been slightly modified.
 */
public class PatternParser extends DefaultHandler implements PatternConsumer {

  /** SAX reader that drives this handler; created in the constructor. */
  XMLReader parser;

  /** One of the {@code ELEM_*} constants, or 0 outside the known elements. */
  int currElement;

  /** Receiver of the parsed classes, exceptions and patterns. */
  PatternConsumer consumer;

  /** Characters of the word currently being assembled across SAX callbacks. */
  StringBuilder token;

  /** Parts (Strings and {@link Hyphen}s) of the exception being assembled. */
  ArrayList<Object> exception;

  /** Character that marks a hyphenation point inside exception words. */
  char hyphenChar;

  /**
   * Optional pre-built error message. This class never assigns it, but when
   * it is non-null it is used verbatim as the message of the
   * {@link HyphenationException} raised for a SAX failure.
   */
  String errMsg;

  static final int ELEM_CLASSES = 1;

  static final int ELEM_EXCEPTIONS = 2;

  static final int ELEM_PATTERNS = 3;

  static final int ELEM_HYPHEN = 4;

  /**
   * Creates a parser without a consumer; call
   * {@link #setConsumer(PatternConsumer)} before parsing.
   * 
   * @throws HyphenationException declared for API compatibility
   */
  public PatternParser() throws HyphenationException {
    token = new StringBuilder();
    parser = createParser();
    parser.setContentHandler(this);
    parser.setErrorHandler(this);
    parser.setEntityResolver(this);
    hyphenChar = '-'; // default

  }

  /**
   * Creates a parser that reports everything it reads to the given consumer.
   * 
   * @param consumer receiver of the parsed classes, exceptions and patterns
   * @throws HyphenationException declared for API compatibility
   */
  public PatternParser(PatternConsumer consumer) throws HyphenationException {
    this();
    this.consumer = consumer;
  }

  /**
   * Replaces the consumer that receives the parsed data.
   * 
   * @param consumer receiver of the parsed classes, exceptions and patterns
   */
  public void setConsumer(PatternConsumer consumer) {
    this.consumer = consumer;
  }

  /**
   * Parses a hyphenation pattern file.
   * 
   * @param filename the filename
   * @throws HyphenationException In case of an exception while parsing
   */
  public void parse(String filename) throws HyphenationException {
    parse(new InputSource(filename));
  }

  /**
   * Parses a hyphenation pattern file.
   * 
   * @param file the pattern file
   * @throws HyphenationException In case of an exception while parsing
   */
  public void parse(File file) throws HyphenationException {
    try {
      // File.toURL() is deprecated because it does not escape characters that
      // are illegal in URLs (spaces, '#', '%'); going through toURI() does.
      InputSource src = new InputSource(file.toURI().toURL().toExternalForm());
      parse(src);
    } catch (MalformedURLException e) {
      throw new HyphenationException("Error converting the File '" + file
          + "' to a URL: " + e.getMessage());
    }
  }

  /**
   * Parses a hyphenation pattern file.
   * 
   * @param source the InputSource for the file
   * @throws HyphenationException In case of an exception while parsing; for
   *         SAX failures the message carries the parser's description of the
   *         problem, prefixed with its location when that is known
   */
  public void parse(InputSource source) throws HyphenationException {
    try {
      parser.parse(source);
    } catch (FileNotFoundException fnfe) {
      throw new HyphenationException("File not found: " + fnfe.getMessage());
    } catch (IOException ioe) {
      throw new HyphenationException(ioe.getMessage());
    } catch (SAXException e) {
      throw new HyphenationException(describeSaxFailure(e));
    }
  }

  /**
   * Builds the message reported for a SAX failure. A non-null {@link #errMsg}
   * wins; otherwise the exception itself is described so that the cause of
   * the failure is never reported as {@code null}.
   */
  private String describeSaxFailure(SAXException e) {
    if (errMsg != null) {
      return errMsg;
    }
    if (e instanceof SAXParseException) {
      return getLocationString((SAXParseException) e) + ": " + e.getMessage();
    }
    return e.getMessage();
  }

  /**
   * Creates a SAX parser using JAXP
   * 
   * @return the created SAX parser
   * @throws RuntimeException if no namespace-aware parser can be created; the
   *         original failure is attached as the cause
   */
  static XMLReader createParser() {
    try {
      SAXParserFactory factory = SAXParserFactory.newInstance();
      factory.setNamespaceAware(true);
      // NOTE(review): DTD/external-entity handling is left at the factory
      // defaults; confirm that pattern files come from trusted sources.
      return factory.newSAXParser().getXMLReader();
    } catch (Exception e) {
      throw new RuntimeException("Couldn't create XMLReader: " + e.getMessage(), e);
    }
  }

  /**
   * Extracts the next whitespace-delimited word from {@code chars}, taking
   * into account characters left over in {@link #token} by earlier calls.
   * Everything consumed is removed from the front of {@code chars}.
   * 
   * @param chars buffer to read from; modified in place
   * @return the next complete word, or {@code null} when the buffer ran out
   *         before a delimiter was seen (the partial word stays in
   *         {@link #token})
   */
  protected String readToken(StringBuffer chars) {
    // Skip leading whitespace.
    int i = 0;
    while (i < chars.length() && Character.isWhitespace(chars.charAt(i))) {
      i++;
    }
    if (i > 0) {
      chars.delete(0, i);
      // Leading whitespace terminates a word carried over from a previous
      // call.
      if (token.length() > 0) {
        String word = token.toString();
        token.setLength(0);
        return word;
      }
    }

    // Find the end of the next run of non-whitespace characters.
    i = 0;
    while (i < chars.length() && !Character.isWhitespace(chars.charAt(i))) {
      i++;
    }
    boolean delimited = i < chars.length();
    token.append(chars.substring(0, i));
    chars.delete(0, i);
    if (delimited) {
      String word = token.toString();
      token.setLength(0);
      return word;
    }
    // No delimiter yet: the buffer is now empty and the partial word waits in
    // token for more characters or for the end of the element.
    return null;
  }

  /**
   * Strips the digits from a pattern word.
   * 
   * @param word pattern as written in the file, letters mixed with digits
   * @return the word with every digit character removed
   */
  protected static String getPattern(String word) {
    StringBuilder pat = new StringBuilder();
    int len = word.length();
    for (int i = 0; i < len; i++) {
      char c = word.charAt(i);
      if (!Character.isDigit(c)) {
        pat.append(c);
      }
    }
    return pat.toString();
  }

  /**
   * Splits the String items of an exception at {@link #hyphenChar}, inserting
   * a {@link Hyphen} for every occurrence; non-String items are copied as
   * they are.
   * 
   * @param ex exception parts, Strings and other objects
   * @return a new list with the normalized parts
   */
  protected ArrayList<Object> normalizeException(ArrayList<?> ex) {
    ArrayList<Object> res = new ArrayList<Object>();
    for (Object item : ex) {
      if (!(item instanceof String)) {
        res.add(item);
        continue;
      }
      String str = (String) item;
      StringBuilder buf = new StringBuilder();
      for (int j = 0; j < str.length(); j++) {
        char c = str.charAt(j);
        if (c != hyphenChar) {
          buf.append(c);
        } else {
          res.add(buf.toString());
          buf.setLength(0);
          // we use here hyphenChar which is not necessarily
          // the one to be printed
          res.add(new Hyphen(String.valueOf(hyphenChar), null, null));
        }
      }
      if (buf.length() > 0) {
        res.add(buf.toString());
      }
    }
    return res;
  }

  /**
   * Reassembles the word an exception stands for: String parts are
   * concatenated, and a {@link Hyphen} contributes its {@code noBreak} text
   * when that is non-null.
   * 
   * @param ex exception parts, Strings and {@link Hyphen}s
   * @return the concatenated word
   */
  protected String getExceptionWord(ArrayList<?> ex) {
    StringBuilder res = new StringBuilder();
    for (Object item : ex) {
      if (item instanceof String) {
        res.append((String) item);
      } else {
        String noBreak = ((Hyphen) item).noBreak;
        if (noBreak != null) {
          res.append(noBreak);
        }
      }
    }
    return res.toString();
  }

  /**
   * Derives the string of inter-letter values from a pattern word: a digit
   * is copied (and the character following it is skipped), any other
   * character contributes {@code '0'}.
   * 
   * @param pat pattern as written in the file, letters mixed with digits
   * @return the inter-letter values, one character per position
   */
  protected static String getInterletterValues(String pat) {
    StringBuilder il = new StringBuilder();
    String word = pat + "a"; // add dummy letter to serve as sentinel
    int len = word.length();
    for (int i = 0; i < len; i++) {
      char c = word.charAt(i);
      if (Character.isDigit(c)) {
        il.append(c);
        i++; // the character after a digit gets no value of its own
      } else {
        il.append('0');
      }
    }
    return il.toString();
  }

  //
  // EntityResolver methods
  //

  /**
   * Redirects references to the hyphenation DTD to the copy found via
   * {@code getResource("hyphenation.dtd")}.
   * 
   * @return an InputSource for that copy, or {@code null} (default
   *         resolution) when the ids do not match or no such resource exists
   */
  @Override
  public InputSource resolveEntity(String publicId, String systemId) {
    // supply the internal hyphenation.dtd if possible
    if (
      (systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*")) ||
      ("hyphenation-info".equals(publicId))
    ) {
      java.net.URL dtd = this.getClass().getResource("hyphenation.dtd");
      // Without this check a missing resource surfaced as a
      // NullPointerException in the middle of parsing.
      if (dtd != null) {
        return new InputSource(dtd.toExternalForm());
      }
    }
    return null;
  }

  //
  // ContentHandler methods
  //

  /**
   * @see org.xml.sax.ContentHandler#startElement(java.lang.String,
   *      java.lang.String, java.lang.String, org.xml.sax.Attributes)
   */
  @Override
  public void startElement(String uri, String local, String raw,
      Attributes attrs) {
    if (local.equals("hyphen-char")) {
      String h = attrs.getValue("value");
      if (h != null && h.length() == 1) {
        hyphenChar = h.charAt(0);
      }
    } else if (local.equals("classes")) {
      currElement = ELEM_CLASSES;
    } else if (local.equals("patterns")) {
      currElement = ELEM_PATTERNS;
    } else if (local.equals("exceptions")) {
      currElement = ELEM_EXCEPTIONS;
      exception = new ArrayList<Object>();
    } else if (local.equals("hyphen")) {
      // Text read so far belongs to the exception in front of this hyphen.
      if (token.length() > 0) {
        exception.add(token.toString());
      }
      exception.add(new Hyphen(attrs.getValue("pre"), attrs.getValue("no"),
          attrs.getValue("post")));
      currElement = ELEM_HYPHEN;
    }
    token.setLength(0);
  }

  /**
   * @see org.xml.sax.ContentHandler#endElement(java.lang.String,
   *      java.lang.String, java.lang.String)
   */
  @Override
  public void endElement(String uri, String local, String raw) {

    // Flush a word that was not terminated by whitespace.
    if (token.length() > 0) {
      dispatchWord(token.toString(), false);
      if (currElement != ELEM_HYPHEN) {
        token.setLength(0);
      }
    }
    // Leaving a hyphen returns to the enclosing exceptions element.
    if (currElement == ELEM_HYPHEN) {
      currElement = ELEM_EXCEPTIONS;
    } else {
      currElement = 0;
    }

  }

  /**
   * @see org.xml.sax.ContentHandler#characters(char[], int, int)
   */
  @Override
  public void characters(char ch[], int start, int length) {
    StringBuffer chars = new StringBuffer(length);
    chars.append(ch, start, length);
    for (String word = readToken(chars); word != null; word = readToken(chars)) {
      dispatchWord(word, true);
    }

  }

  /**
   * Reports a complete word to the consumer according to the element that is
   * currently being read; words outside classes, exceptions and patterns are
   * dropped.
   * 
   * @param word the word to report
   * @param resetException whether to empty the exception list after an
   *        exception has been reported
   */
  private void dispatchWord(String word, boolean resetException) {
    switch (currElement) {
      case ELEM_CLASSES:
        consumer.addClass(word);
        break;
      case ELEM_EXCEPTIONS:
        exception.add(word);
        exception = normalizeException(exception);
        // Hand out a copy so that later changes here do not affect the
        // consumer.
        consumer.addException(getExceptionWord(exception),
            new ArrayList<Object>(exception));
        if (resetException) {
          exception.clear();
        }
        break;
      case ELEM_PATTERNS:
        consumer.addPattern(getPattern(word), getInterletterValues(word));
        break;
      default:
        // ELEM_HYPHEN or no known element: nothing to do
        break;
    }
  }

  /**
   * Returns a string of the location, formatted as
   * {@code <name>:<line>:<column>} where the name is the part of the system
   * id after its last {@code '/'} (omitted when there is no system id).
   */
  private String getLocationString(SAXParseException ex) {
    StringBuilder str = new StringBuilder();

    String systemId = ex.getSystemId();
    if (systemId != null) {
      int index = systemId.lastIndexOf('/');
      if (index != -1) {
        systemId = systemId.substring(index + 1);
      }
      str.append(systemId);
    }
    str.append(':');
    str.append(ex.getLineNumber());
    str.append(':');
    str.append(ex.getColumnNumber());

    return str.toString();

  }

  // PatternConsumer implementation for testing purposes

  /** Prints the character class to standard output. */
  public void addClass(String c) {
    System.out.println("class: " + c);
  }

  /** Prints the exception word and its parts to standard output. */
  public void addException(String w, ArrayList<Object> e) {
    System.out.println("exception: " + w + " : " + e.toString());
  }

  /** Prints the pattern and its inter-letter values to standard output. */
  public void addPattern(String p, String v) {
    System.out.println("pattern: " + p + " : " + v);
  }

  /**
   * Parses the pattern file named by the first argument and prints what was
   * read; does nothing without arguments.
   */
  public static void main(String[] args) throws Exception {
    if (args.length > 0) {
      PatternParser pp = new PatternParser();
      pp.setConsumer(pp);
      pp.parse(args[0]);
    }
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene PatternParser.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.