Java example - PorterStemmer.java

What this is

This file is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.
The source code

package org.apache.lucene.analysis;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*

   Porter stemmer in Java. The original paper is in

       Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
       no. 3, pp 130-137,

   See also http://www.tartarus.org/~martin/PorterStemmer/index.html

   Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
   The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
   is then outside the bounds of b.

   Similarly,

   Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
   'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
   b[j] is then outside the bounds of b.

   Release 3.

   [ This version is derived from Release 3, modified by Brian Goetz to
     optimize for fewer object creations.  ]

*/


import java.io.*;

/**
 *
 * Stemmer, implementing the Porter Stemming Algorithm
 *
 * The Stemmer class transforms a word into its root form.  The input
 * word can be provided a character at a time (by calling add()), or at once
 * by calling one of the various stem(something) methods.
 */

class PorterStemmer
{
  /* Word buffer; the word being stemmed occupies b[0..i-1]. */
  private char[] b;
  private int i,    /* offset into b: number of characters currently held */
    j,              /* general offset; ends() sets it to the index just before the matched suffix */
    k,              /* index of the last character of the current stem */
    k0;             /* index of the first character of the word */
  private boolean dirty = false; /* true once the buffered word has been changed */
  private static final int INC = 50; /* unit of size whereby b is increased */
  private static final int EXTRA = 1; /* slack kept beyond the word when sizing b */

  /** Creates a stemmer with an empty buffer of INC characters. */
  public PorterStemmer() {
    b = new char[INC];
    i = 0;
  }

  /**
   * reset() resets the stemmer so it can stem another word.  If you invoke
   * the stemmer by calling add(char) and then stem(), you must call reset()
   * before starting another word.
   */
  public void reset() { i = 0; dirty = false; }

  /**
   * Add a character to the word being stemmed.  When you are finished
   * adding characters, you can call stem() to process the word.
   * The buffer grows by INC characters whenever it is about to fill up.
   */
  public void add(char ch) {
    if (b.length <= i + EXTRA) {
      char[] new_b = new char[b.length + INC];
      System.arraycopy(b, 0, new_b, 0, b.length);
      b = new_b;
    }
    b[i++] = ch;
  }

  /**
   * After a word has been stemmed, it can be retrieved by toString(),
   * or a reference to the internal buffer can be retrieved by getResultBuffer
   * and getResultLength (which is generally more efficient.)
   */
  public String toString() { return new String(b,0,i); }

  /**
   * Returns the length of the word resulting from the stemming process.
   */
  public int getResultLength() { return i; }

  /**
   * Returns a reference to a character buffer containing the results of
   * the stemming process.  You also need to consult getResultLength()
   * to determine the length of the result.  The returned array is the
   * stemmer's own buffer, not a copy.
   */
  public char[] getResultBuffer() { return b; }

  /* cons(i) is true <=> b[i] is a consonant.  A 'y' counts as a consonant
     at the start of the word (i == k0) or when it follows a vowel. */

  private final boolean cons(int i) {
    switch (b[i]) {
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return false;
    case 'y':
      return (i == k0) ? true : !cons(i-1);
    default:
      return true;
    }
  }

  /* m() measures the number of consonant sequences between k0 and j. if c is
     a consonant sequence and v a vowel sequence, and [..] indicates arbitrary
     presence,

          [c][v]       gives 0
          [c]vc[v]     gives 1
          [c]vcvc[v]   gives 2
          [c]vcvcvc[v] gives 3
          ....
  */

  private final int m() {
    int n = 0;
    int i = k0;
    /* skip the optional leading consonant sequence */
    while (true) {
      if (i > j)
        return n;
      if (! cons(i))
        break;
      i++;
    }
    i++;
    /* each pass consumes one vowel sequence followed by one consonant sequence */
    while (true) {
      while (true) {
        if (i > j)
          return n;
        if (cons(i))
          break;
        i++;
      }
      i++;
      n++;
      while (true) {
        if (i > j)
          return n;
        if (! cons(i))
          break;
        i++;
      }
      i++;
    }
  }

  /* vowelinstem() is true <=> k0,...j contains a vowel */

  private final boolean vowelinstem() {
    for (int i = k0; i <= j; i++) {
      if (! cons(i))
        return true;
    }
    return false;
  }

  /* doublec(j) is true <=> j,(j-1) contain a double consonant. */

  private final boolean doublec(int j) {
    if (j < k0+1)
      return false;
    if (b[j] != b[j-1])
      return false;
    return cons(j);
  }

  /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
     and also if the second c is not w,x or y. this is used when trying to
     restore an e at the end of a short word. e.g.

          cav(e), lov(e), hop(e), crim(e), but
          snow, box, tray.

  */

  private final boolean cvc(int i) {
    if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2))
      return false;
    int ch = b[i];
    return !(ch == 'w' || ch == 'x' || ch == 'y');
  }

  /* ends(s) is true <=> k0,...k ends with the string s.  On a match j is
     set to the index just before the suffix; on a mismatch j is untouched. */

  private final boolean ends(String s) {
    int l = s.length();
    int o = k-l+1;
    if (o < k0)
      return false;
    for (int c = 0; c < l; c++) {
      if (b[o+c] != s.charAt(c))
        return false;
    }
    j = k-l;
    return true;
  }

  /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
     k, and marks the word as dirty. */

  void setto(String s) {
    int l = s.length();
    int o = j+1;
    for (int c = 0; c < l; c++)
      b[o+c] = s.charAt(c);
    k = j+l;
    dirty = true;
  }

  /* r(s) is used further down: it replaces the suffix matched by the last
     ends() call with s, but only when the remaining stem has m() > 0. */

  void r(String s) { if (m() > 0) setto(s); }

  /* step1() gets rid of plurals and -ed or -ing. e.g.

           caresses  ->  caress
           ponies    ->  poni
           ties      ->  ti
           caress    ->  caress
           cats      ->  cat

           feed      ->  feed
           agreed    ->  agree
           disabled  ->  disable

           matting   ->  mat
           mating    ->  mate
           meeting   ->  meet
           milling   ->  mill
           messing   ->  mess

           meetings  ->  meet

  */

  private final void step1() {
    if (b[k] == 's') {
      if (ends("sses")) k -= 2;
      else if (ends("ies")) setto("i");
      else if (b[k-1] != 's') k--;
    }
    if (ends("eed")) {
      if (m() > 0)
        k--;
    }
    else if ((ends("ed") || ends("ing")) && vowelinstem()) {
      k = j;
      if (ends("at")) setto("ate");
      else if (ends("bl")) setto("ble");
      else if (ends("iz")) setto("ize");
      else if (doublec(k)) {
        /* drop one letter of the double consonant, except for l, s and z */
        int ch = b[k--];
        if (ch == 'l' || ch == 's' || ch == 'z')
          k++;
      }
      else if (m() == 1 && cvc(k))
        setto("e");
    }
  }

  /* step2() turns terminal y to i when there is another vowel in the stem. */

  private final void step2() {
    if (ends("y") && vowelinstem()) {
      b[k] = 'i';
      dirty = true;
    }
  }

  /* step3() maps double suffices to single ones. so -ization ( = -ize plus
     -ation) maps to -ize etc. note that the string before the suffix must give
     m() > 0.  The switch on the penultimate letter only narrows down which
     suffixes need to be compared. */

  private final void step3() {
    if (k == k0) return; /* For Bug 1 */
    switch (b[k-1]) {
    case 'a':
      if (ends("ational")) { r("ate"); break; }
      if (ends("tional")) { r("tion"); break; }
      break;
    case 'c':
      if (ends("enci")) { r("ence"); break; }
      if (ends("anci")) { r("ance"); break; }
      break;
    case 'e':
      if (ends("izer")) { r("ize"); break; }
      break;
    case 'l':
      if (ends("bli")) { r("ble"); break; }
      if (ends("alli")) { r("al"); break; }
      if (ends("entli")) { r("ent"); break; }
      if (ends("eli")) { r("e"); break; }
      if (ends("ousli")) { r("ous"); break; }
      break;
    case 'o':
      if (ends("ization")) { r("ize"); break; }
      if (ends("ation")) { r("ate"); break; }
      if (ends("ator")) { r("ate"); break; }
      break;
    case 's':
      if (ends("alism")) { r("al"); break; }
      if (ends("iveness")) { r("ive"); break; }
      if (ends("fulness")) { r("ful"); break; }
      if (ends("ousness")) { r("ous"); break; }
      break;
    case 't':
      if (ends("aliti")) { r("al"); break; }
      if (ends("iviti")) { r("ive"); break; }
      if (ends("biliti")) { r("ble"); break; }
      break;
    case 'g':
      if (ends("logi")) { r("log"); break; }
      break;
    }
  }

  /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */

  private final void step4() {
    switch (b[k]) {
    case 'e':
      if (ends("icate")) { r("ic"); break; }
      if (ends("ative")) { r(""); break; }
      if (ends("alize")) { r("al"); break; }
      break;
    case 'i':
      if (ends("iciti")) { r("ic"); break; }
      break;
    case 'l':
      if (ends("ical")) { r("ic"); break; }
      if (ends("ful")) { r(""); break; }
      break;
    case 's':
      if (ends("ness")) { r(""); break; }
      break;
    }
  }

  /* step5() takes off -ant, -ence etc., in context vcvc.  A 'break' out of
     the switch means a suffix matched (j marks the end of the stem); a
     'return' means nothing matched and the word is left alone. */

  private final void step5() {
    if (k == k0) return; /* for Bug 1 */
    switch (b[k-1]) {
    case 'a':
      if (ends("al")) break;
      return;
    case 'c':
      if (ends("ance")) break;
      if (ends("ence")) break;
      return;
    case 'e':
      if (ends("er")) break;
      return;
    case 'i':
      if (ends("ic")) break;
      return;
    case 'l':
      if (ends("able")) break;
      if (ends("ible")) break;
      return;
    case 'n':
      if (ends("ant")) break;
      if (ends("ement")) break;
      if (ends("ment")) break;
      /* element etc. not stripped before the m */
      if (ends("ent")) break;
      return;
    case 'o':
      if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
      /* j >= 0 fixes Bug 2 */
      if (ends("ou")) break;
      return;
      /* takes care of -ous */
    case 's':
      if (ends("ism")) break;
      return;
    case 't':
      if (ends("ate")) break;
      if (ends("iti")) break;
      return;
    case 'u':
      if (ends("ous")) break;
      return;
    case 'v':
      if (ends("ive")) break;
      return;
    case 'z':
      if (ends("ize")) break;
      return;
    default:
      return;
    }
    if (m() > 1)
      k = j;
  }

  /* step6() removes a final -e if m() > 1, or if m() == 1 and the letters
     before the e are not of the cvc form; it also reduces a final double l
     to a single l when m() > 1. */

  private final void step6() {
    j = k;
    if (b[k] == 'e') {
      int a = m();
      if (a > 1 || (a == 1 && !cvc(k-1)))
        k--;
    }
    if (b[k] == 'l' && doublec(k) && m() > 1)
      k--;
  }


  /**
   * Stem a word provided as a String.  Returns the result as a String.
   * If stemming does not change the word, the argument itself is returned.
   */
  public String stem(String s) {
    if (stem(s.toCharArray(), s.length()))
      return toString();
    else
      return s;
  }

  /** Stem a word contained in a char[].  Returns true if the stemming process
   * resulted in a word different from the input.  You can retrieve the
   * result with getResultLength()/getResultBuffer() or toString().
   */
  public boolean stem(char[] word) {
    return stem(word, word.length);
  }

  /** Stem a word contained in a portion of a char[] array.  Returns
   * true if the stemming process resulted in a word different from
   * the input.  You can retrieve the result with
   * getResultLength()/getResultBuffer() or toString().
   */
  public boolean stem(char[] wordBuffer, int offset, int wordLen) {
    reset();
    if (b.length < wordLen) {
      /* old contents are not needed, so allocate without copying */
      b = new char[wordLen + EXTRA];
    }
    System.arraycopy(wordBuffer, offset, b, 0, wordLen);
    i = wordLen;
    return stem(0);
  }

  /** Stem a word contained in a leading portion of a char[] array.
   * Returns true if the stemming process resulted in a word different
   * from the input.  You can retrieve the result with
   * getResultLength()/getResultBuffer() or toString().
   */
  public boolean stem(char[] word, int wordLen) {
    return stem(word, 0, wordLen);
  }

  /** Stem the word placed into the Stemmer buffer through calls to add().
   * Returns true if the stemming process resulted in a word different
   * from the input.  You can retrieve the result with
   * getResultLength()/getResultBuffer() or toString().
   */
  public boolean stem() {
    return stem(0);
  }

  /** Stem the buffered word, treating index i0 as its first character.
   * Words of fewer than three characters are left untouched.  Returns
   * true if the word was changed or shortened.
   */
  public boolean stem(int i0) {
    k = i - 1;
    k0 = i0;
    if (k > k0+1) {
      step1(); step2(); step3(); step4(); step5(); step6();
    }
    // Also, a word is considered dirty if we lopped off letters
    // Thanks to Ifigenia Vairelles for pointing this out.
    if (i != k+1)
      dirty = true;
    i = k+1;
    return dirty;
  }

  /** Test program for demonstrating the Stemmer.  It reads each named file,
   * lower-cases and stems every run of letters, and writes the result to
   * standard out; all other characters are copied through unchanged.
   * Bytes are read one at a time and treated as characters 0..255.
   * Usage: Stemmer file-name
   */
  public static void main(String[] args) {
    PorterStemmer s = new PorterStemmer();

    for (int i = 0; i < args.length; i++) {
      try {
        InputStream in = new BufferedInputStream(new FileInputStream(args[i]));
        try {
          s.reset();

          while (true) {
            /* read() yields 0..255, or -1 at end of file, so a byte with
               the high bit set can never be mistaken for end of file */
            int ch = in.read();

            if (ch >= 0 && Character.isLetter((char) ch)) {
              s.add(Character.toLowerCase((char) ch));
            }
            else {
              /* a non-letter (or end of file) terminates the current word */
              s.stem();
              System.out.print(s.toString());
              s.reset();
              if (ch < 0)
                break;
              System.out.print((char) ch);
            }
          }
        }
        finally {
          /* close the stream even when reading fails part-way through */
          in.close();
        }
      }
      catch (IOException e) {
        System.out.println("error reading " + args[i]);
      }
    }
  }
}
Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.
What this is

Other links

The source code

new blog posts

... this post is sponsored by my books ...
#1 New Release!	FP Best Seller