alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

What this is

This file is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Other links

The source code

package org.apache.lucene.analysis.ru;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
 *
 * @author  Boris Okner, b.okner@rogers.com
 * @version $Id: RussianStemmer.java,v 1.5 2004/03/29 22:48:01 cutting Exp $
 */
class RussianStemmer
{
    private char[] charset;

    // positions of RV, R1 and R2 respectively
    private int RV, R1, R2;

    // letters
    private static char A = 0;
    private static char B = 1;
    private static char V = 2;
    private static char G = 3;
    private static char D = 4;
    private static char E = 5;
    private static char ZH = 6;
    private static char Z = 7;
    private static char I = 8;
    private static char I_ = 9;
    private static char K = 10;
    private static char L = 11;
    private static char M = 12;
    private static char N = 13;
    private static char O = 14;
    private static char P = 15;
    private static char R = 16;
    private static char S = 17;
    private static char T = 18;
    private static char U = 19;
    private static char F = 20;
    private static char X = 21;
    private static char TS = 22;
    private static char CH = 23;
    private static char SH = 24;
    private static char SHCH = 25;
    private static char HARD = 26;
    private static char Y = 27;
    private static char SOFT = 28;
    private static char AE = 29;
    private static char IU = 30;
    private static char IA = 31;

    // stem definitions
    private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };

    private static char[][] perfectiveGerundEndings1 = {
        { V },
        { V, SH, I },
        { V, SH, I, S, SOFT }
    };

    private static char[][] perfectiveGerund1Predessors = {
        { A },
        { IA }
    };

    private static char[][] perfectiveGerundEndings2 = { { I, V }, {
        Y, V }, {
            I, V, SH, I }, {
                Y, V, SH, I }, {
                    I, V, SH, I, S, SOFT }, {
                        Y, V, SH, I, S, SOFT }
    };

    private static char[][] adjectiveEndings = {
        { E, E },
        { I, E },
        { Y, E },
        { O, E },
        { E, I_ },
        { I, I_ },
        { Y, I_ },
        { O, I_ },
        { E, M },
        { I, M },
        { Y, M },
        { O, M },
        { I, X },
        { Y, X },
        { U, IU },
        { IU, IU },
        { A, IA },
        { IA, IA },
        { O, IU },
        { E, IU },
        { I, M, I },
        { Y, M, I },
        { E, G, O },
        { O, G, O },
        { E, M, U },
        {O, M, U }
    };

    private static char[][] participleEndings1 = {
        { SHCH },
        { E, M },
        { N, N },
        { V, SH },
        { IU, SHCH }
    };

    private static char[][] participleEndings2 = {
        { I, V, SH },
        { Y, V, SH },
        { U, IU, SHCH }
    };

    private static char[][] participle1Predessors = {
        { A },
        { IA }
    };

    private static char[][] reflexiveEndings = {
        { S, IA },
        { S, SOFT }
    };

    private static char[][] verbEndings1 = {
        { I_ },
        { L },
        { N },
        { L, O },
        { N, O },
        { E, T },
        { IU, T },
        { L, A },
        { N, A },
        { L, I },
        { E, M },
        { N, Y },
        { E, T, E },
        { I_, T, E },
        { T, SOFT },
        { E, SH, SOFT },
        { N, N, O }
    };

    private static char[][] verbEndings2 = {
        { IU },
        { U, IU },
        { E, N },
        { E, I_ },
        { IA, T },
        { U, I_ },
        { I, L },
        { Y, L },
        { I, M },
        { Y, M },
        { I, T },
        { Y, T },
        { I, L, A },
        { Y, L, A },
        { E, N, A },
        { I, T, E },
        { I, L, I },
        { Y, L, I },
        { I, L, O },
        { Y, L, O },
        { E, N, O },
        { U, E, T },
        { U, IU, T },
        { E, N, Y },
        { I, T, SOFT },
        { Y, T, SOFT },
        { I, SH, SOFT },
        { E, I_, T, E },
        { U, I_, T, E }
    };

    private static char[][] verb1Predessors = {
        { A },
        { IA }
    };

    private static char[][] nounEndings = {
        { A },
        { U },
        { I_ },
        { O },
        { U },
        { E },
        { Y },
        { I },
        { SOFT },
        { IA },
        { E, V },
        { O, V },
        { I, E },
        { SOFT, E },
        { IA, X },
        { I, IU },
        { E, I },
        { I, I },
        { E, I_ },
        { O, I_ },
        { E, M },
        { A, M },
        { O, M },
        { A, X },
        { SOFT, IU },
        { I, IA },
        { SOFT, IA },
        { I, I_ },
        { IA, M },
        { IA, M, I },
        { A, M, I },
        { I, E, I_ },
        { I, IA, M },
        { I, E, M },
        { I, IA, X },
        { I, IA, M, I }
    };

    private static char[][] superlativeEndings = {
        { E, I_, SH },
        { E, I_, SH, E }
    };

    private static char[][] derivationalEndings = {
        { O, S, T },
        { O, S, T, SOFT }
    };

    /**
     * RussianStemmer constructor comment.
     */
    public RussianStemmer()
    {
        super();
    }

    /**
     * RussianStemmer constructor comment.
     */
    public RussianStemmer(char[] charset)
    {
        super();
        this.charset = charset;
    }

    /**
     * Adjectival ending is an adjective ending,
     * optionally preceded by participle ending.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean adjectival(StringBuffer stemmingZone)
    {
        // look for adjective ending in a stemming zone
        if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
            return false;
        // if adjective ending was found, try for participle ending
        boolean r =
            findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)
            ||
            findAndRemoveEnding(stemmingZone, participleEndings2);
        return true;
    }

    /**
     * Derivational endings
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean derivational(StringBuffer stemmingZone)
    {
        int endingLength = findEnding(stemmingZone, derivationalEndings);
        if (endingLength == 0)
             // no derivational ending found
            return false;
        else
        {
            // Ensure that the ending locates in R2
            if (R2 - RV <= stemmingZone.length() - endingLength)
            {
                stemmingZone.setLength(stemmingZone.length() - endingLength);
                return true;
            }
            else
            {
                return false;
            }
        }
    }

    /**
     * Finds ending among given ending class and returns the length of ending found(0, if not found).
     * Creation date: (17/03/2002 8:18:34 PM)
     */
    private int findEnding(StringBuffer stemmingZone, int startIndex, char[][] theEndingClass)
    {
        boolean match = false;
        for (int i = theEndingClass.length - 1; i >= 0; i--)
        {
            char[] theEnding = theEndingClass[i];
            // check if the ending is bigger than stemming zone
            if (startIndex < theEnding.length - 1)
            {
                match = false;
                continue;
            }
            match = true;
            int stemmingIndex = startIndex;
            for (int j = theEnding.length - 1; j >= 0; j--)
            {
                if (stemmingZone.charAt(stemmingIndex--) != charset[theEnding[j]])
                {
                    match = false;
                    break;
                }
            }
            // check if ending was found
            if (match)
            {
                return theEndingClass[i].length; // cut ending
            }
        }
        return 0;
    }

    private int findEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
    }

    /**
     * Finds the ending among the given class of endings and removes it from stemming zone.
     * Creation date: (17/03/2002 8:18:34 PM)
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone, char[][] theEndingClass)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
            // not found
            return false;
        else {
            stemmingZone.setLength(stemmingZone.length() - endingLength);
            // cut the ending found
            return true;
        }
    }

    /**
     * Finds the ending among the given class of endings, then checks if this ending was
     * preceded by any of given predessors, and if so, removes it from stemming zone.
     * Creation date: (17/03/2002 8:18:34 PM)
     */
    private boolean findAndRemoveEnding(StringBuffer stemmingZone,
        char[][] theEndingClass, char[][] thePredessors)
    {
        int endingLength = findEnding(stemmingZone, theEndingClass);
        if (endingLength == 0)
            // not found
            return false;
        else
        {
            int predessorLength =
                findEnding(stemmingZone,
                    stemmingZone.length() - endingLength - 1,
                    thePredessors);
            if (predessorLength == 0)
                return false;
            else {
                stemmingZone.setLength(stemmingZone.length() - endingLength);
                // cut the ending found
                return true;
            }
        }

    }

    /**
     * Marks positions of RV, R1 and R2 in a given word.
     * Creation date: (16/03/2002 3:40:11 PM)
     */
    private void markPositions(String word)
    {
        RV = 0;
        R1 = 0;
        R2 = 0;
        int i = 0;
        // find RV
        while (word.length() > i && !isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // RV zone is empty
        RV = i;
        // find R1
        while (word.length() > i && isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // R1 zone is empty
        R1 = i;
        // find R2
        while (word.length() > i && !isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // R2 zone is empty
        while (word.length() > i && isVowel(word.charAt(i)))
        {
            i++;
        }
        if (word.length() - 1 < ++i)
            return; // R2 zone is empty
        R2 = i;
    }

    /**
     * Checks if character is a vowel..
     * Creation date: (16/03/2002 10:47:03 PM)
     * @return boolean
     * @param letter char
     */
    private boolean isVowel(char letter)
    {
        for (int i = 0; i < vowels.length; i++)
        {
            if (letter == charset[vowels[i]])
                return true;
        }
        return false;
    }

    /**
     * Noun endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean noun(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, nounEndings);
    }

    /**
     * Perfective gerund endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean perfectiveGerund(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(
            stemmingZone,
            perfectiveGerundEndings1,
            perfectiveGerund1Predessors)
            || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
    }

    /**
     * Reflexive endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean reflexive(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, reflexiveEndings);
    }

    /**
     * Insert the method's description here.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean removeI(StringBuffer stemmingZone)
    {
        if (stemmingZone.length() > 0
            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[I])
        {
            stemmingZone.setLength(stemmingZone.length() - 1);
            return true;
        }
        else
        {
            return false;
        }
    }

    /**
     * Insert the method's description here.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean removeSoft(StringBuffer stemmingZone)
    {
        if (stemmingZone.length() > 0
            && stemmingZone.charAt(stemmingZone.length() - 1) == charset[SOFT])
        {
            stemmingZone.setLength(stemmingZone.length() - 1);
            return true;
        }
        else
        {
            return false;
        }
    }

    /**
     * Insert the method's description here.
     * Creation date: (16/03/2002 10:58:42 PM)
     * @param newCharset char[]
     */
    public void setCharset(char[] newCharset)
    {
        charset = newCharset;
    }

    /**
     * Set ending definition as in Russian stemming algorithm.
     * Creation date: (16/03/2002 11:16:36 PM)
     */
    private void setEndings()
    {
        vowels = new char[] { A, E, I, O, U, Y, AE, IU, IA };

        perfectiveGerundEndings1 = new char[][] {
            { V }, { V, SH, I }, { V, SH, I, S, SOFT }
        };

        perfectiveGerund1Predessors = new char[][] { { A }, { IA }
        };

        perfectiveGerundEndings2 = new char[][] {
            { I, V },
            { Y, V },
            { I, V, SH, I },
            { Y, V, SH, I },
            { I, V, SH, I, S, SOFT },
            { Y, V, SH, I, S, SOFT }
        };

        adjectiveEndings = new char[][] {
            { E, E },
            { I, E },
            { Y, E },
            { O, E },
            { E, I_ },
            { I, I_ },
            { Y, I_ },
            { O, I_ },
            { E, M },
            { I, M },
            { Y, M },
            { O, M },
            { I, X },
            { Y, X },
            { U, IU },
            { IU, IU },
            { A, IA },
            { IA, IA },
            { O, IU },
            { E, IU },
            { I, M, I },
            { Y, M, I },
            { E, G, O },
            { O, G, O },
            { E, M, U },
            { O, M, U }
        };

        participleEndings1 = new char[][] {
            { SHCH },
            { E, M },
            { N, N },
            { V, SH },
            { IU, SHCH }
        };

        participleEndings2 = new char[][] {
            { I, V, SH },
            { Y, V, SH },
            { U, IU, SHCH }
        };

        participle1Predessors = new char[][] {
            { A },
            { IA }
        };

        reflexiveEndings = new char[][] {
            { S, IA },
            { S, SOFT }
        };

        verbEndings1 = new char[][] {
            { I_ },
            { L },
            { N },
            { L, O },
            { N, O },
            { E, T },
            { IU, T },
            { L, A },
            { N, A },
            { L, I },
            { E, M },
            { N, Y },
            { E, T, E },
            { I_, T, E },
            { T, SOFT },
            { E, SH, SOFT },
            { N, N, O }
        };

        verbEndings2 = new char[][] {
            { IU },
            { U, IU },
            { E, N },
            { E, I_ },
            { IA, T },
            { U, I_ },
            { I, L },
            { Y, L },
            { I, M },
            { Y, M },
            { I, T },
            { Y, T },
            { I, L, A },
            { Y, L, A },
            { E, N, A },
            { I, T, E },
            { I, L, I },
            { Y, L, I },
            { I, L, O },
            { Y, L, O },
            { E, N, O },
            { U, E, T },
            { U, IU, T },
            { E, N, Y },
            { I, T, SOFT },
            { Y, T, SOFT },
            { I, SH, SOFT },
            { E, I_, T, E },
            { U, I_, T, E }
        };

        verb1Predessors = new char[][] {
            { A },
            { IA }
        };

        nounEndings = new char[][] {
            { A },
            { IU },
            { I_ },
            { O },
            { U },
            { E },
            { Y },
            { I },
            { SOFT },
            { IA },
            { E, V },
            { O, V },
            { I, E },
            { SOFT, E },
            { IA, X },
            { I, IU },
            { E, I },
            { I, I },
            { E, I_ },
            { O, I_ },
            { E, M },
            { A, M },
            { O, M },
            { A, X },
            { SOFT, IU },
            { I, IA },
            { SOFT, IA },
            { I, I_ },
            { IA, M },
            { IA, M, I },
            { A, M, I },
            { I, E, I_ },
            { I, IA, M },
            { I, E, M },
            { I, IA, X },
            { I, IA, M, I }
        };

        superlativeEndings = new char[][] {
            { E, I_, SH },
            { E, I_, SH, E }
        };

        derivationalEndings = new char[][] {
            { O, S, T },
            { O, S, T, SOFT }
        };
    }

    /**
     * Finds the stem for given Russian word.
     * Creation date: (16/03/2002 3:36:48 PM)
     * @return java.lang.String
     * @param input java.lang.String
     */
    public String stem(String input)
    {
        markPositions(input);
        if (RV == 0)
            return input; //RV wasn't detected, nothing to stem
        StringBuffer stemmingZone = new StringBuffer(input.substring(RV));
        // stemming goes on in RV
        // Step 1

        if (!perfectiveGerund(stemmingZone))
        {
            reflexive(stemmingZone);
            boolean r =
                adjectival(stemmingZone)
                || verb(stemmingZone)
                || noun(stemmingZone);
        }
        // Step 2
        removeI(stemmingZone);
        // Step 3
        derivational(stemmingZone);
        // Step 4
        superlative(stemmingZone);
        undoubleN(stemmingZone);
        removeSoft(stemmingZone);
        // return result
        return input.substring(0, RV) + stemmingZone.toString();
    }

    /**
     * Superlative endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean superlative(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(stemmingZone, superlativeEndings);
    }

    /**
     * Undoubles N.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean undoubleN(StringBuffer stemmingZone)
    {
        char[][] doubleN = {
            { N, N }
        };
        if (findEnding(stemmingZone, doubleN) != 0)
        {
            stemmingZone.setLength(stemmingZone.length() - 1);
            return true;
        }
        else
        {
            return false;
        }
    }

    /**
     * Verb endings.
     * Creation date: (17/03/2002 12:14:58 AM)
     * @param stemmingZone java.lang.StringBuffer
     */
    private boolean verb(StringBuffer stemmingZone)
    {
        return findAndRemoveEnding(
            stemmingZone,
            verbEndings1,
            verb1Predessors)
            || findAndRemoveEnding(stemmingZone, verbEndings2);
    }

    /**
     * Static method for stemming with different charsets
     */
    public static String stem(String theWord, char[] charset)
    {
        RussianStemmer stemmer = new RussianStemmer();
        stemmer.setCharset(charset);
        return stemmer.stem(theWord);
    }
}
... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2024 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.