alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (DutchStemmer.java)

This example Lucene source code file (DutchStemmer.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

deprecated, dutchstemmer, i, i, map, string, string, stringbuilder, stringbuilder, util, y, y

The Lucene DutchStemmer.java source code

package org.apache.lucene.analysis.nl;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Locale;
import java.util.Map;

/**
 * A stemmer for Dutch words.
 * <p>
 * The algorithm is an implementation of
 * the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
 * algorithm in Martin Porter's snowball project.
 * </p>
 * <p>
 * An instance keeps its working buffer and the R1/R2 region offsets in fields
 * that are overwritten by every call to {@link #stem(String)}, so a single
 * instance must not be used by several threads at the same time.
 * </p>
 * @deprecated Use {@link org.tartarus.snowball.ext.DutchStemmer} instead,
 * which has the same functionality. This filter will be removed in Lucene 5.0
 */
@Deprecated
public class DutchStemmer {
  /** Suffixes tried by {@link #enEnding(StringBuilder)}, longest first. */
  private static final String[] EN_ENDINGS = {"ene", "en"};

  /**
   * Buffer for the terms while stemming them.
   */
  private final StringBuilder sb = new StringBuilder();
  /** Set by {@link #step2(StringBuilder)} when it removed a trailing "e"; read by step 3b ("bar" rule). */
  private boolean _removedE;
  /** Optional exception dictionary (term -> stem); {@code null} when none was set. */
  private Map<?, ?> _stemDict;

  /** Start offset of region R1 in the buffer (never less than 3). */
  private int _R1;
  /** Start offset of region R2 in the buffer (computed after steps 1 and 2). */
  private int _R2;

  //TODO convert to internal
  /**
   * Stems the given term to an unique <tt>discriminator</tt>.
   * <p>
   * The term is lower-cased first. It is returned (lower-cased) without further
   * processing when it is empty or contains a character that is not a letter.
   * When a stem dictionary has been set and contains the lower-cased term, the
   * dictionary value is returned instead of running the algorithm.
   * </p>
   *
   * @param term The term that should be stemmed.
   * @return Discriminator for <tt>term</tt>; {@code null} only when the stem
   *         dictionary maps the term to a value that is not a {@link String}.
   */
  public String stem(String term) {
    // Locale-independent lower-casing: the default-locale variant would, for example,
    // map 'I' to a dotless i under a Turkish default locale.
    term = term.toLowerCase(Locale.ROOT);
    // An empty term has nothing to stem; without this guard storeYandI would read charAt(0).
    if (term.isEmpty() || !isStemmable(term)) {
      return term;
    }
    if (_stemDict != null && _stemDict.containsKey(term)) {
      Object stemmed = _stemDict.get(term);
      return (stemmed instanceof String) ? (String) stemmed : null;
    }

    // Reset the StringBuilder.
    sb.setLength(0);
    sb.append(term);
    // Stemming starts here...
    substitute(sb);
    storeYandI(sb);
    _R1 = getRIndex(sb, 0);
    _R1 = Math.max(3, _R1);
    step1(sb);
    step2(sb);
    // R2 is located on the buffer as it looks after steps 1 and 2.
    _R2 = getRIndex(sb, _R1);
    step3a(sb);
    step3b(sb);
    step4(sb);
    reStoreYandI(sb);
    return sb.toString();
  }

  /**
   * Removes a trailing "ene" or "en" when it lies in R1 and is preceded by a
   * valid en-ending (see {@link #isValidEnEnding(StringBuilder, int)}), then
   * undoubles the new ending.
   *
   * @param sb String being stemmed
   * @return true if a suffix was removed
   */
  private boolean enEnding(StringBuilder sb) {
    // The buffer is only modified right before returning, so one snapshot is enough.
    String s = sb.toString();
    for (String end : EN_ENDINGS) {
      int index = s.length() - end.length();
      // index >= _R1 (>= 3) is checked first, so index - 1 is always a valid offset.
      if (s.endsWith(end) && index >= _R1 && isValidEnEnding(sb, index - 1)) {
        sb.delete(index, index + end.length());
        unDouble(sb, index);
        return true;
      }
    }
    return false;
  }


  /**
   * Step 1: handles the suffixes "heden", "ene"/"en" and "se"/"s".
   * <ul>
   * <li>"heden" is replaced by "heid" if it lies in R1; a word ending in
   *     "heden" is not examined for the other suffixes either way.</li>
   * <li>"ene"/"en" is handled by {@link #enEnding(StringBuilder)}.</li>
   * <li>"se"/"s" is deleted if it lies in R1 and is preceded by a valid
   *     s-ending.</li>
   * </ul>
   *
   * @param sb String being stemmed
   */
  private void step1(StringBuilder sb) {
    if (_R1 >= sb.length()) {
      return;
    }

    String s = sb.toString();

    if (s.endsWith("heden")) {
      int index = s.length() - 5;
      // Replace only the suffix itself; earlier occurrences of "heden" inside R1 stay untouched.
      if (index >= _R1) {
        sb.replace(index, s.length(), "heid");
      }
      return;
    }

    if (enEnding(sb)) {
      return;
    }

    if (s.endsWith("se")) {
      int index = s.length() - 2;
      if (index >= _R1 && isValidSEnding(sb, index - 1)) {
        sb.delete(index, index + 2);
        return;
      }
    }
    if (s.endsWith("s")) {
      int index = s.length() - 1;
      if (index >= _R1 && isValidSEnding(sb, index - 1)) {
        sb.delete(index, index + 1);
      }
    }
  }

  /**
   * Delete suffix e if in R1 and
   * preceded by a non-vowel, and then undouble the ending.
   * Records in {@code _removedE} whether the "e" was removed.
   *
   * @param sb String being stemmed
   */
  private void step2(StringBuilder sb) {
    _removedE = false;
    if (_R1 >= sb.length()) {
      return;
    }
    String s = sb.toString();
    int index = s.length() - 1;
    // index >= _R1 >= 3 guarantees that index - 1 is a valid offset.
    if (index >= _R1 &&
        s.endsWith("e") &&
        !isVowel(sb.charAt(index - 1))) {
      sb.delete(index, index + 1);
      unDouble(sb);
      _removedE = true;
    }
  }

  /**
   * Delete "heid" if it lies in R2 and is not preceded by "c", then try to
   * remove an "ene"/"en" ending from what is left.
   *
   * @param sb String being stemmed
   */
  private void step3a(StringBuilder sb) {
    if (_R2 >= sb.length()) {
      return;
    }
    String s = sb.toString();
    int index = s.length() - 4;
    if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
      sb.delete(index, index + 4); //remove heid
      enEnding(sb);
    }
  }

  /**
   * <p>A d-suffix, or derivational suffix, enables a new word,
   * often with a different grammatical category, or with a different
   * sense, to be built from another word. Whether a d-suffix can be
   * attached is discovered not from the rules of grammar, but by
   * referring to a dictionary. So in English, ness can be added to
   * certain adjectives to form corresponding nouns (littleness,
   * kindness, foolishness ...) but not to all adjectives
   * (not for example, to big, cruel, wise ...) d-suffixes can be
   * used to change meaning, often in rather exotic ways.</p>
   * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
   * <p>Only the first suffix that matches and lies in R2 is handled.</p>
   *
   * @param sb String being stemmed
   */
  private void step3b(StringBuilder sb) {
    if (_R2 >= sb.length()) {
      return;
    }
    String s = sb.toString();
    final int len = s.length();

    if (s.endsWith("end") || s.endsWith("ing")) {
      int index = len - 3;
      if (index >= _R2) {
        sb.delete(index, index + 3);
        if (sb.charAt(index - 2) == 'i' &&
            sb.charAt(index - 1) == 'g') {
          // A preceding "ig" is removed as well, unless it is preceded by "e" or not in R2.
          if (sb.charAt(index - 3) != 'e' && index - 2 >= _R2) {
            index -= 2;
            sb.delete(index, index + 2);
          }
        } else {
          unDouble(sb, index);
        }
        return;
      }
    }
    if (s.endsWith("ig")) {
      int index = len - 2;
      if (index >= _R2) {
        if (sb.charAt(index - 1) != 'e') {
          sb.delete(index, index + 2);
        }
        return;
      }
    }
    if (s.endsWith("lijk")) {
      int index = len - 4;
      if (index >= _R2) {
        sb.delete(index, index + 4);
        step2(sb);
        return;
      }
    }
    if (s.endsWith("baar")) {
      int index = len - 4;
      if (index >= _R2) {
        sb.delete(index, index + 4);
        return;
      }
    }
    if (s.endsWith("bar")) {
      int index = len - 3;
      if (index >= _R2) {
        // "bar" is only removed when the last step 2 run removed an "e".
        if (_removedE) {
          sb.delete(index, index + 3);
        }
        return;
      }
    }
  }

  /**
   * undouble vowel
   * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
   *
   * @param sb String being stemmed
   */
  private void step4(StringBuilder sb) {
    if (sb.length() < 4) {
      return;
    }
    String end = sb.substring(sb.length() - 4, sb.length());
    char c = end.charAt(0);
    char v1 = end.charAt(1);
    char v2 = end.charAt(2);
    char d = end.charAt(3);
    if (v1 == v2 &&
        d != 'I' &&
        v1 != 'i' &&
        isVowel(v1) &&
        !isVowel(d) &&
        !isVowel(c)) {
      // Drop the second of the two identical vowels.
      sb.delete(sb.length() - 2, sb.length() - 1);
    }
  }

  /**
   * Checks if a term could be stemmed.
   *
   * @return true if, and only if, the given term consists in letters.
   */
  private boolean isStemmable(String term) {
    for (int c = 0; c < term.length(); c++) {
      if (!Character.isLetter(term.charAt(c))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Replaces a, e, i, o and u carrying a diaeresis or an acute accent by the
   * plain vowel. The accented letters are written as Unicode escapes so that
   * the source compiles identically under any file encoding.
   */
  private void substitute(StringBuilder buffer) {
    for (int i = 0; i < buffer.length(); i++) {
      switch (buffer.charAt(i)) {
        case '\u00e4': // a with diaeresis
        case '\u00e1': // a with acute
          buffer.setCharAt(i, 'a');
          break;
        case '\u00eb': // e with diaeresis
        case '\u00e9': // e with acute
          buffer.setCharAt(i, 'e');
          break;
        case '\u00fc': // u with diaeresis
        case '\u00fa': // u with acute
          buffer.setCharAt(i, 'u');
          break;
        case '\u00ef': // i with diaeresis
        case '\u00ed': // i with acute
          buffer.setCharAt(i, 'i');
          break;
        case '\u00f6': // o with diaeresis
        case '\u00f3': // o with acute
          buffer.setCharAt(i, 'o');
          break;
        default:
          break;
      }
    }
  }

  /**
   * Tells whether the character at {@code index} may precede an "s"/"se"
   * suffix that is to be removed: any character that is neither a vowel nor 'j'.
   */
  private boolean isValidSEnding(StringBuilder sb, int index) {
    char c = sb.charAt(index);
    return !(isVowel(c) || c == 'j');
  }

  /**
   * Tells whether the character at {@code index} may precede an "en"/"ene"
   * suffix that is to be removed: it must be a non-vowel, and the buffer must
   * not read "gem" up to and including {@code index}.
   */
  private boolean isValidEnEnding(StringBuilder sb, int index) {
    char c = sb.charAt(index);
    if (isVowel(c)) {
      return false;
    }
    // ends with "gem"?
    if (c == 'm' && index >= 2 && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e') {
      return false;
    }
    return true;
  }

  /** Undoubles the ending of the whole buffer. */
  private void unDouble(StringBuilder sb) {
    unDouble(sb, sb.length());
  }

  /**
   * If the first {@code endIndex} characters of the buffer end in kk, tt, dd,
   * nn, mm or ff, removes the second letter of that pair.
   */
  private void unDouble(StringBuilder sb, int endIndex) {
    String s = sb.substring(0, endIndex);
    if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
      sb.delete(endIndex - 1, endIndex);
    }
  }

  /**
   * Finds the start of the region that follows the first non-vowel preceded
   * by a vowel, scanning from {@code start} (a start of 0 is treated as 1).
   *
   * @return the offset just after that non-vowel, or one past the position
   *         where the scan stopped when there is no such non-vowel
   */
  private int getRIndex(StringBuilder sb, int start) {
    if (start == 0) {
      start = 1;
    }
    int i = start;
    for (; i < sb.length(); i++) {
      //first non-vowel preceded by a vowel
      if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
        return i + 1;
      }
    }
    return i + 1;
  }

  /**
   * Marks letters that act as consonants by upper-casing them: an initial
   * 'y', a 'y' after a vowel, and an 'i' between two vowels. Upper-case
   * letters are not vowels for {@link #isVowel(char)}.
   * The buffer must not be empty.
   */
  private void storeYandI(StringBuilder sb) {
    if (sb.charAt(0) == 'y') {
      sb.setCharAt(0, 'Y');
    }

    int last = sb.length() - 1;

    for (int i = 1; i < last; i++) {
      switch (sb.charAt(i)) {
        case 'i':
          if (isVowel(sb.charAt(i - 1)) &&
              isVowel(sb.charAt(i + 1))) {
            sb.setCharAt(i, 'I');
          }
          break;
        case 'y':
          if (isVowel(sb.charAt(i - 1))) {
            sb.setCharAt(i, 'Y');
          }
          break;
        default:
          break;
      }
    }
    if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1))) {
      sb.setCharAt(last, 'Y');
    }
  }

  /** Turns the 'I' and 'Y' markers set by {@link #storeYandI(StringBuilder)} back into lower case. */
  private void reStoreYandI(StringBuilder sb) {
    for (int i = 0; i < sb.length(); i++) {
      char c = sb.charAt(i);
      if (c == 'I') {
        sb.setCharAt(i, 'i');
      } else if (c == 'Y') {
        sb.setCharAt(i, 'y');
      }
    }
  }

  /**
   * Tells whether {@code c} is one of the vowels a, e, i, o, u, y or
   * e with grave accent (U+00E8).
   */
  private boolean isVowel(char c) {
    switch (c) {
      case 'e':
      case 'a':
      case 'o':
      case 'i':
      case 'u':
      case 'y':
      case '\u00e8': // e with grave
        return true;
      default:
        return false;
    }
  }

  /**
   * Sets the exception dictionary consulted by {@link #stem(String)} before
   * the algorithm runs. The map is stored by reference, not copied.
   *
   * @param dict map from lower-cased term to its stem, or {@code null} for none
   */
  void setStemDictionary(Map<?, ?> dict) {
    _stemDict = dict;
  }

}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene DutchStemmer.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.