alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (RSLPStemmerBase.java)

This example Lucene source code file (RSLPStemmerBase.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

io, ioexception, ioexception, linenumberreader, map, override, pattern, regex, rule, rule, rulewithsetexceptions, runtimeexception, step, step, string, string, util

The Lucene RSLPStemmerBase.java source code

package org.apache.lucene.analysis.pt;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;

import static org.apache.lucene.analysis.util.StemmerUtil.*;

/**
 * Base class for stemmers that use a set of RSLP-like stemming steps.
 * <p>
 * RSLP (Removedor de Sufixos da Lingua Portuguesa) is an algorithm designed
 * originally for stemming the Portuguese language, described in the paper
 * <i>A Stemming Algorithm for the Portuguese Language, Orengo et. al.
 * <p>
 * Since this time a plural-only modification (RSLP-S) as well as a modification
 * for the Galician language have been implemented. This class parses a configuration
 * file that describes {@link Step}s, where each Step contains a set of {@link Rule}s.
 * <p>
 * The general rule format is: 
 * <blockquote>{ "suffix", N, "replacement", { "exception1", "exception2", ...}}
 * where:
 * <ul>
 *   <li>suffix is the suffix to be removed (such as "inho").
 *   <li>N is the min stem size, where stem is defined as the candidate stem 
 *       after removing the suffix (but before appending the replacement!)
 *   <li>replacement is an optimal string to append after removing the suffix.
 *       This can be the empty string.
 *   <li>exceptions is an optional list of exceptions, patterns that should 
 *       not be stemmed. These patterns can be specified as whole word or suffix (ends-with) 
 *       patterns, depending upon the exceptions format flag in the step header.
 * </ul>
 * <p>
 * A step is an ordered list of rules, with a structure in this format:
 * <blockquote>{ "name", N, B, { "cond1", "cond2", ... }
 *               ... rules ... };
 * </blockquote>
 * where:
 * <ul>
 *   <li>name is a name for the step (such as "Plural").
 *   <li>N is the min word size. Words that are less than this length bypass
 *       the step completely, as an optimization. Note: N can be zero, in this case this 
 *       implementation will automatically calculate the appropriate value from the underlying 
 *       rules.
 *   <li>B is a "boolean" flag specifying how exceptions in the rules are matched.
 *       A value of 1 indicates whole-word pattern matching, a value of 0 indicates that 
 *       exceptions are actually suffixes and should be matched with ends-with.
 *   <li>conds are an optional list of conditions to enter the step at all. If
 *       the list is non-empty, then a word must end with one of these conditions or it will
 *       bypass the step completely as an optimization.
 * </ul>
 * <p>
 * @see <a href="http://www.inf.ufrgs.br/~viviane/rslp/index.htm">RSLP description
 * @lucene.internal
 */
public abstract class RSLPStemmerBase {
  
  /**
   * A basic rule, with no exceptions.
   */
  protected static class Rule {
    protected final char suffix[];
    protected final char replacement[];
    protected final int min;
    
    /**
     * Create a rule.
     * @param suffix suffix to remove
     * @param min minimum stem length
     * @param replacement replacement string
     */
    public Rule(String suffix, int min, String replacement) {
      this.suffix = suffix.toCharArray();
      this.replacement = replacement.toCharArray();
      this.min = min;
    }
    
    /**
     * @return true if the word matches this rule.
     */
    public boolean matches(char s[], int len) {
      return (len - suffix.length >= min && endsWith(s, len, suffix));
    }
    
    /**
     * @return new valid length of the string after firing this rule.
     */
    public int replace(char s[], int len) {
      if (replacement.length > 0) {
        System.arraycopy(replacement, 0, s, len - suffix.length, replacement.length);
      }
      return len - suffix.length + replacement.length;
    }
  }
  
  /**
   * A rule with a set of whole-word exceptions.
   */
  protected static class RuleWithSetExceptions extends Rule {
    protected final CharArraySet exceptions;
    
    public RuleWithSetExceptions(String suffix, int min, String replacement,
        String[] exceptions) {
      super(suffix, min, replacement);
      for (int i = 0; i < exceptions.length; i++) {
        if (!exceptions[i].endsWith(suffix))
          System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
      }
      this.exceptions = new CharArraySet(Version.LUCENE_31,
           Arrays.asList(exceptions), false);
    }

    @Override
    public boolean matches(char s[], int len) {
      return super.matches(s, len) && !exceptions.contains(s, 0, len);
    }
  }
  
  /**
   * A rule with a set of exceptional suffixes.
   */
  protected static class RuleWithSuffixExceptions extends Rule {
    // TODO: use a more efficient datastructure: automaton?
    protected final char[][] exceptions;
    
    public RuleWithSuffixExceptions(String suffix, int min, String replacement,
        String[] exceptions) {
      super(suffix, min, replacement);
      for (int i = 0; i < exceptions.length; i++) {
        if (!exceptions[i].endsWith(suffix))
          System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
      }
      this.exceptions = new char[exceptions.length][];
      for (int i = 0; i < exceptions.length; i++)
        this.exceptions[i] = exceptions[i].toCharArray();
    }
    
    @Override
    public boolean matches(char s[], int len) {
      if (!super.matches(s, len))
        return false;
      
      for (int i = 0; i < exceptions.length; i++)
        if (endsWith(s, len, exceptions[i]))
          return false;

      return true;
    }
  }
  
  /**
   * A step containing a list of rules.
   */
  protected static class Step {
    protected final String name;
    protected final Rule rules[];
    protected final int min;
    protected final char[][] suffixes;
    
    /**
     * Create a new step
     * @param name Step's name.
     * @param rules an ordered list of rules.
     * @param min minimum word size. if this is 0 it is automatically calculated.
     * @param suffixes optional list of conditional suffixes. may be null.
     */
    public Step(String name, Rule rules[], int min, String suffixes[]) {
      this.name = name;
      this.rules = rules;
      if (min == 0) {
        min = Integer.MAX_VALUE;
        for (Rule r : rules)
          min = Math.min(min, r.min + r.suffix.length);
      }
      this.min = min;
      
      if (suffixes == null || suffixes.length == 0) {
        this.suffixes = null;
      } else {
        this.suffixes = new char[suffixes.length][];
        for (int i = 0; i < suffixes.length; i++)
          this.suffixes[i] = suffixes[i].toCharArray();
      }
    }
    
    /**
     * @return new valid length of the string after applying the entire step.
     */
    public int apply(char s[], int len) {
      if (len < min)
        return len;
      
      if (suffixes != null) {
        boolean found = false;
        
        for (int i = 0; i < suffixes.length; i++)
          if (endsWith(s, len, suffixes[i])) {
            found = true;
            break;
          }
        
        if (!found) return len;
      }
      
      for (int i = 0; i < rules.length; i++) {
        if (rules[i].matches(s, len))
          return rules[i].replace(s, len);
      }
      
      return len;
    }
  }
  
  /**
   * Parse a resource file into an RSLP stemmer description.
   * @return a Map containing the named Steps in this description.
   */
  protected static Map<String,Step> parse(Class clazz, String resource) {
    // TODO: this parser is ugly, but works. use a jflex grammar instead.
    try {
      InputStream is = clazz.getResourceAsStream(resource);
      LineNumberReader r = new LineNumberReader(new InputStreamReader(is, "UTF-8"));
      Map<String,Step> steps = new HashMap();
      String step;
      while ((step = readLine(r)) != null) {
        Step s = parseStep(r, step);
        steps.put(s.name, s);
      }
      r.close();
      return steps;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
  
  private static final Pattern headerPattern = 
    Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*(0|1),\\s*\\{(.*)\\},\\s*$");
  private static final Pattern stripPattern = 
    Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+)\\s*\\}\\s*(,|(\\}\\s*;))$");
  private static final Pattern repPattern = 
    Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\"\\}\\s*(,|(\\}\\s*;))$");
  private static final Pattern excPattern = 
    Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\",\\s*\\{(.*)\\}\\s*\\}\\s*(,|(\\}\\s*;))$");
  
  private static Step parseStep(LineNumberReader r, String header) throws IOException {
    Matcher matcher = headerPattern.matcher(header);
    if (!matcher.find()) {
      throw new RuntimeException("Illegal Step header specified at line " + r.getLineNumber());
    }
    assert matcher.groupCount() == 4;
    String name = matcher.group(1);
    int min = Integer.parseInt(matcher.group(2));
    int type = Integer.parseInt(matcher.group(3));
    String suffixes[] = parseList(matcher.group(4));
    Rule rules[] = parseRules(r, type);
    return new Step(name, rules, min, suffixes);
  }
  
  private static Rule[] parseRules(LineNumberReader r, int type) throws IOException {
    List<Rule> rules = new ArrayList();
    String line;
    while ((line = readLine(r)) != null) {
      Matcher matcher = stripPattern.matcher(line);
      if (matcher.matches()) {
        rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), ""));
      } else {
        matcher = repPattern.matcher(line);
        if (matcher.matches()) {
          rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3)));
        } else {
          matcher = excPattern.matcher(line);
          if (matcher.matches()) {
            if (type == 0) {
              rules.add(new RuleWithSuffixExceptions(matcher.group(1), 
                        Integer.parseInt(matcher.group(2)), 
                        matcher.group(3), 
                        parseList(matcher.group(4))));
            } else {
              rules.add(new RuleWithSetExceptions(matcher.group(1), 
                        Integer.parseInt(matcher.group(2)), 
                        matcher.group(3), 
                        parseList(matcher.group(4))));
            }
          } else {
            throw new RuntimeException("Illegal Step rule specified at line " + r.getLineNumber());
          }
        }
      }
      if (line.endsWith(";"))
        return rules.toArray(new Rule[rules.size()]);
    }
    return null;
  }
  
  private static String[] parseList(String s) {
    if (s.length() == 0)
      return null;
    String list[] = s.split(",");
    for (int i = 0; i < list.length; i++)
      list[i] = parseString(list[i].trim());
    return list;
  }
  
  private static String parseString(String s) {
    return s.substring(1, s.length()-1);
  }
  
  private static String readLine(LineNumberReader r) throws IOException {
    String line = null;
    while ((line = r.readLine()) != null) {
      line = line.trim();
      if (line.length() > 0 && line.charAt(0) != '#')
        return line;
    }
    return line;
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene RSLPStemmerBase.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.