alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (GreekStemmer.java)

This example Lucene source code file (GreekStemmer.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

chararrayset, chararrayset, greekstemmer, greekstemmer, string, string, util

The Lucene GreekStemmer.java source code

package org.apache.lucene.analysis.el;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;

import java.util.Arrays;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * A stemmer for Greek words, according to: <i>Development of a Stemmer for the
 * Greek Language.</i> Georgios Ntais
 * <p>
 * NOTE: Input is expected to be casefolded for Greek (including folding of final
 * sigma to sigma), and with diacritics removed. This can be achieved with 
 * either {@link GreekLowerCaseFilter} or ICUFoldingFilter.
 * <p>
 * The stemmer works in place on a {@code char[]} buffer: each rule inspects the
 * first {@code len} chars, returns the new logical length, and in a few cases
 * rewrites chars just past the shortened stem when it restores part of a suffix.
 * @lucene.experimental
 */
public class GreekStemmer {
  /**
   * Stems the word held in the first {@code len} chars of {@code s}.
   * <p>
   * Words shorter than 4 chars are returned unchanged. Otherwise rules 0-20 are
   * applied in order; rule 21 (the generic suffix list) runs only when none of
   * them changed the length; rule 22 always runs last.
   *
   * @param s buffer containing the word; its contents may be modified
   * @param len number of valid chars in {@code s}
   * @return length of the stem, i.e. the number of leading chars of {@code s} to keep
   */
  public int stem(char s[], int len) {
    if (len < 4) // too short
      return len;
    
    final int origLen = len;
    // "short rules": if it hits one of these, it skips the "long list"
    len = rule0(s, len);
    len = rule1(s, len);
    len = rule2(s, len);
    len = rule3(s, len);
    len = rule4(s, len);
    len = rule5(s, len);
    len = rule6(s, len);
    len = rule7(s, len);
    len = rule8(s, len);
    len = rule9(s, len);
    len = rule10(s, len);
    len = rule11(s, len);
    len = rule12(s, len);
    len = rule13(s, len);
    len = rule14(s, len);
    len = rule15(s, len);
    len = rule16(s, len);
    len = rule17(s, len);
    len = rule18(s, len);
    len = rule19(s, len);
    len = rule20(s, len);
    // "long list"
    if (len == origLen)
      len = rule21(s, len);
    
    return rule22(s, len);
  }

  /**
   * Handles a fixed list of irregular endings, longest first, trimming a
   * per-group number of trailing chars. Returns on the first match.
   */
  private int rule0(char s[], int len) {
    if (len > 9 && (endsWith(s, len, "καθεστωτοσ")
        || endsWith(s, len, "καθεστωτων")))
      return len - 4;
    
    if (len > 8 && (endsWith(s, len, "γεγονοτοσ")
        || endsWith(s, len, "γεγονοτων")))
      return len - 4;
    
    if (len > 8 && endsWith(s, len, "καθεστωτα"))
      return len - 3;
    
    if (len > 7 && (endsWith(s, len, "τατογιου")
        || endsWith(s, len, "τατογιων")))
      return len - 4;
    
    if (len > 7 && endsWith(s, len, "γεγονοτα"))
      return len - 3;
    
    if (len > 7 && endsWith(s, len, "καθεστωσ"))
      return len - 2;
    
    // The length guard covers the whole group, like every other group here.
    if (len > 6 && (endsWith(s, len, "σκαγιου")
        || endsWith(s, len, "σκαγιων")
        || endsWith(s, len, "ολογιου")
        || endsWith(s, len, "ολογιων")
        || endsWith(s, len, "κρεατοσ")
        || endsWith(s, len, "κρεατων")
        || endsWith(s, len, "περατοσ")
        || endsWith(s, len, "περατων")
        || endsWith(s, len, "τερατοσ")
        || endsWith(s, len, "τερατων")))
      return len - 4;
    
    if (len > 6 && endsWith(s, len, "τατογια"))
      return len - 3;
    
    if (len > 6 && endsWith(s, len, "γεγονοσ"))
      return len - 2;
    
    if (len > 5 && (endsWith(s, len, "φαγιου")
        || endsWith(s, len, "φαγιων")
        || endsWith(s, len, "σογιου")
        || endsWith(s, len, "σογιων")))
      return len - 4;
    
    if (len > 5 && (endsWith(s, len, "σκαγια")
        || endsWith(s, len, "ολογια")
        || endsWith(s, len, "κρεατα")
        || endsWith(s, len, "περατα")
        || endsWith(s, len, "τερατα")))
      return len - 3;
    
    if (len > 4 && (endsWith(s, len, "φαγια")
        || endsWith(s, len, "σογια")
        || endsWith(s, len, "φωτοσ")
        || endsWith(s, len, "φωτων")))
      return len - 3;
    
    if (len > 4 && (endsWith(s, len, "κρεασ")
        || endsWith(s, len, "περασ")
        || endsWith(s, len, "τερασ")))
      return len - 2;
    
    if (len > 3 && endsWith(s, len, "φωτα"))
      return len - 2;
    
    if (len > 2 && endsWith(s, len, "φωσ"))
      return len - 1;
    
    return len;
  }

  /**
   * Strips -αδεσ / -αδων, then restores -αδ unless the remaining stem ends
   * with one of the listed exceptions.
   */
  private int rule1(char s[], int len) {
    if (len > 4 && (endsWith(s, len, "αδεσ") || endsWith(s, len, "αδων"))) {
      len -= 4;
      if (!(endsWith(s, len, "οκ") ||
          endsWith(s, len, "μαμ") ||
          endsWith(s, len, "μαν") ||
          endsWith(s, len, "μπαμπ") ||
          endsWith(s, len, "πατερ") ||
          endsWith(s, len, "γιαγι") ||
          endsWith(s, len, "νταντ") ||
          endsWith(s, len, "κυρ") ||
          endsWith(s, len, "θει") ||
          endsWith(s, len, "πεθερ")))
        len += 2; // add back -αδ
    }
    return len;
  }
  
  /**
   * Strips -εδεσ / -εδων, then restores -εδ when the remaining stem ends with
   * one of the listed exceptions.
   */
  private int rule2(char s[], int len) {
    if (len > 4 && (endsWith(s, len, "εδεσ") || endsWith(s, len, "εδων"))) {
      len -= 4;
      if (endsWith(s, len, "οπ") ||
          endsWith(s, len, "ιπ") ||
          endsWith(s, len, "εμπ") ||
          endsWith(s, len, "υπ") ||
          endsWith(s, len, "γηπ") ||
          endsWith(s, len, "δαπ") ||
          endsWith(s, len, "κρασπ") ||
          endsWith(s, len, "μιλ"))
        len += 2; // add back -εδ
    }
    return len;
  }
  
  /**
   * Strips -ουδεσ / -ουδων, then restores -ουδ when the remaining stem ends
   * with one of the listed exceptions.
   */
  private int rule3(char s[], int len) {
    if (len > 5 && (endsWith(s, len, "ουδεσ") || endsWith(s, len, "ουδων"))) {
      len -= 5;
      if (endsWith(s, len, "αρκ") ||
          endsWith(s, len, "καλιακ") ||
          endsWith(s, len, "πεταλ") ||
          endsWith(s, len, "λιχ") ||
          endsWith(s, len, "πλεξ") ||
          endsWith(s, len, "σκ") ||
          endsWith(s, len, "σ") ||
          endsWith(s, len, "φλ") ||
          endsWith(s, len, "φρ") ||
          endsWith(s, len, "βελ") ||
          endsWith(s, len, "λουλ") ||
          endsWith(s, len, "χν") ||
          endsWith(s, len, "σπ") ||
          endsWith(s, len, "τραγ") ||
          endsWith(s, len, "φε"))
        len += 3; // add back -ουδ
    }
    return len;
  }
  
  /** Stems for which rule4 restores the leading -ε of the removed suffix. */
  private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("θ", "δ", "ελ", "γαλ", "ν", "π", "ιδ", "παρ"),
      false);
  
  /**
   * Strips -εωσ / -εων, then restores -ε when the remaining stem is in
   * {@link #exc4}.
   */
  private int rule4(char s[], int len) {   
    if (len > 3 && (endsWith(s, len, "εωσ") || endsWith(s, len, "εων"))) {
      len -= 3;
      if (exc4.contains(s, 0, len))
        len++; // add back -ε
    }
    return len;
  }
  
  /**
   * Strips -ια, or otherwise -ιου / -ιων, then restores -ι when the remaining
   * stem ends with a vowel.
   */
  private int rule5(char s[], int len) {
    if (len > 2 && endsWith(s, len, "ια")) {
      len -= 2;
      if (endsWithVowel(s, len))
        len++; // add back -ι
    } else if (len > 3 && (endsWith(s, len, "ιου") || endsWith(s, len, "ιων"))) {
      len -= 3;
      if (endsWithVowel(s, len))
        len++; // add back -ι
    }
    return len;
  }

  /** Stems for which rule6 restores -ικ. */
  private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("αλ", "αδ", "ενδ", "αμαν", "αμμοχαλ", "ηθ", "ανηθ",
          "αντιδ", "φυσ", "βρωμ", "γερ", "εξωδ", "καλπ", "καλλιν", "καταδ",
          "μουλ", "μπαν", "μπαγιατ", "μπολ", "μποσ", "νιτ", "ξικ", "συνομηλ",
          "πετσ", "πιτσ", "πικαντ", "πλιατσ", "ποστελν", "πρωτοδ", "σερτ",
          "συναδ", "τσαμ", "υποδ", "φιλον", "φυλοδ", "χασ"), 
       false);

  /**
   * Strips -ικα / -ικο, or otherwise -ικου / -ικων, then restores -ικ when the
   * remaining stem ends with a vowel or is in {@link #exc6}.
   */
  private int rule6(char s[], int len) {
    boolean removed = false;
    if (len > 3 && (endsWith(s, len, "ικα") || endsWith(s, len, "ικο"))) {
      len -= 3;
      removed = true;
    } else if (len > 4 && (endsWith(s, len, "ικου") || endsWith(s, len, "ικων"))) {
      len -= 4;
      removed = true;
    }
    
    if (removed) {
      if (endsWithVowel(s, len) || exc6.contains(s, 0, len))
        len += 2; // add back -ικ
    }
    return len;
  }
  
  /** Stems for which rule7 restores -αμ. */
  private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("αναπ", "αποθ", "αποκ", "αποστ", "βουβ", "ξεθ", "ουλ",
          "πεθ", "πικρ", "ποτ", "σιχ", "χ"), 
      false);
  
  /**
   * Handles the -αμε family. The exact word "αγαμε" loses only its last char.
   * Otherwise one of the long endings is stripped first, and then a remaining
   * -αμε is stripped, restoring -αμ when the stem is in {@link #exc7}.
   */
  private int rule7(char s[], int len) {
    if (len == 5 && endsWith(s, len, "αγαμε"))
      return len - 1;
    
    if (len > 7 && endsWith(s, len, "ηθηκαμε"))
      len -= 7;
    else if (len > 6 && endsWith(s, len, "ουσαμε"))
      len -= 6;
    else if (len > 5 && (endsWith(s, len, "αγαμε") ||
             endsWith(s, len, "ησαμε") ||
             endsWith(s, len, "ηκαμε")))
      len -= 5;
    
    if (len > 3 && endsWith(s, len, "αμε")) {
      len -= 3;
      if (exc7.contains(s, 0, len))
        len += 2; // add back -αμ
    }

    return len;
  }

  /** Stems to which rule8 appends -αγαν after removing a long -ανε ending. */
  private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("τρ", "τσ"),
      false);

  /** Stems for which rule8 restores -αν after removing -ανε. */
  private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("βετερ", "βουλκ", "βραχμ", "γ", "δραδουμ", "θ", "καλπουζ",
          "καστελ", "κορμορ", "λαοπλ", "μωαμεθ", "μ", "μουσουλμ", "ν", "ουλ",
          "π", "πελεκ", "πλ", "πολισ", "πορτολ", "σαρακατσ", "σουλτ",
          "τσαρλατ", "ορφ", "τσιγγ", "τσοπ", "φωτοστεφ", "χ", "ψυχοπλ", "αγ",
          "ορφ", "γαλ", "γερ", "δεκ", "διπλ", "αμερικαν", "ουρ", "πιθ",
          "πουριτ", "σ", "ζωντ", "ικ", "καστ", "κοπ", "λιχ", "λουθηρ", "μαιντ",
          "μελ", "σιγ", "σπ", "στεγ", "τραγ", "τσαγ", "φ", "ερ", "αδαπ",
          "αθιγγ", "αμηχ", "ανικ", "ανοργ", "απηγ", "απιθ", "ατσιγγ", "βασ",
          "βασκ", "βαθυγαλ", "βιομηχ", "βραχυκ", "διατ", "διαφ", "ενοργ",
          "θυσ", "καπνοβιομηχ", "καταγαλ", "κλιβ", "κοιλαρφ", "λιβ",
          "μεγλοβιομηχ", "μικροβιομηχ", "νταβ", "ξηροκλιβ", "ολιγοδαμ",
          "ολογαλ", "πενταρφ", "περηφ", "περιτρ", "πλατ", "πολυδαπ", "πολυμηχ",
          "στεφ", "ταβ", "τετ", "υπερηφ", "υποκοπ", "χαμηλοδαπ", "ψηλοταβ"),
      false);
  
  /**
   * Handles the -ανε family. A long ending is stripped first (appending -αγαν
   * when the stem is in {@link #exc8a}); then a remaining -ανε is stripped,
   * restoring -αν when the stem ends with a vowel other than υ or is in
   * {@link #exc8b}.
   */
  private int rule8(char s[], int len) {
    boolean removed = false;
    
    // Each length guard must cover every suffix in its group: && binds tighter
    // than ||, so without the parentheses a word consisting solely of one of
    // the later suffixes would be cut down to an empty stem.
    if (len > 8 && endsWith(s, len, "ιουντανε")) {
      len -= 8;
      removed = true;
    } else if (len > 7 && (endsWith(s, len, "ιοντανε") ||
        endsWith(s, len, "ουντανε") ||
        endsWith(s, len, "ηθηκανε"))) {
      len -= 7;
      removed = true;
    } else if (len > 6 && (endsWith(s, len, "ιοτανε") ||
        endsWith(s, len, "οντανε") ||
        endsWith(s, len, "ουσανε"))) {
      len -= 6;
      removed = true;
    } else if (len > 5 && (endsWith(s, len, "αγανε") ||
        endsWith(s, len, "ησανε") ||
        endsWith(s, len, "οτανε") ||
        endsWith(s, len, "ηκανε"))) {
      len -= 5;
      removed = true;
    }
    
    if (removed && exc8a.contains(s, 0, len)) {
      // add -αγαν (we removed > 4 chars so its safe)
      len += 4;
      s[len - 4] = 'α';
      s[len - 3] = 'γ';
      s[len - 2] = 'α';
      s[len - 1] = 'ν';
    }
    
    if (len > 3 && endsWith(s, len, "ανε")) {
      len -= 3;
      if (endsWithVowelNoY(s, len) || exc8b.contains(s, 0, len)) {
        len += 2; // add back -αν
      }
    }
    
    return len;
  }
  
  /** Stems for which rule9 restores -ετ. */
  private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("αβαρ", "βεν", "εναρ", "αβρ", "αδ", "αθ", "αν", "απλ",
          "βαρον", "ντρ", "σκ", "κοπ", "μπορ", "νιφ", "παγ", "παρακαλ", "σερπ",
          "σκελ", "συρφ", "τοκ", "υ", "δ", "εμ", "θαρρ", "θ"), 
      false);
  
  /**
   * Strips -ησετε; then strips a remaining -ετε, restoring -ετ when the stem is
   * in {@link #exc9}, ends with a vowel other than υ, or ends with one of the
   * listed exceptions.
   */
  private int rule9(char s[], int len) {
    if (len > 5 && endsWith(s, len, "ησετε"))
      len -= 5;
    
    if (len > 3 && endsWith(s, len, "ετε")) {
      len -= 3;
      if (exc9.contains(s, 0, len) ||
          endsWithVowelNoY(s, len) ||
          endsWith(s, len, "οδ") ||
          endsWith(s, len, "αιρ") ||
          endsWith(s, len, "φορ") ||
          endsWith(s, len, "ταθ") ||
          endsWith(s, len, "διαθ") ||
          endsWith(s, len, "σχ") ||
          endsWith(s, len, "ενδ") ||
          endsWith(s, len, "ευρ") ||
          endsWith(s, len, "τιθ") ||
          endsWith(s, len, "υπερθ") ||
          endsWith(s, len, "ραθ") ||
          endsWith(s, len, "ενθ") ||
          endsWith(s, len, "ροθ") ||
          endsWith(s, len, "σθ") ||
          endsWith(s, len, "πυρ") ||
          endsWith(s, len, "αιν") ||
          endsWith(s, len, "συνδ") ||
          endsWith(s, len, "συν") ||
          endsWith(s, len, "συνθ") ||
          endsWith(s, len, "χωρ") ||
          endsWith(s, len, "πον") ||
          endsWith(s, len, "βρ") ||
          endsWith(s, len, "καθ") ||
          endsWith(s, len, "ευθ") ||
          endsWith(s, len, "εκθ") ||
          endsWith(s, len, "νετ") ||
          endsWith(s, len, "ρον") ||
          endsWith(s, len, "αρκ") ||
          endsWith(s, len, "βαρ") ||
          endsWith(s, len, "βολ") ||
          endsWith(s, len, "ωφελ")) {
        len += 2; // add back -ετ
      }
    }
    
    return len;
  }

  /**
   * Strips -οντασ / -ωντασ. For the exact stem "αρχ" it restores -οντ, and for
   * a stem ending in "κρε" it restores -ωντ, overwriting the first restored
   * char because the removed suffix may have started with either ο or ω.
   */
  private int rule10(char s[], int len) {
    if (len > 5 && (endsWith(s, len, "οντασ") || endsWith(s, len, "ωντασ"))) {
      len -= 5;
      if (len == 3 && endsWith(s, len, "αρχ")) {
        len += 3; // add back *ντ
        s[len - 3] = 'ο';
      }
      if (endsWith(s, len, "κρε")) {
        len += 3; // add back *ντ
        s[len - 3] = 'ω';
      }
    }
    
    return len;
  }
  
  /**
   * Strips -ιομαστε / -ομαστε. When the remaining stem is exactly "ον" the
   * result is "ονομαστ".
   * <p>
   * The longer suffix is tested first: every word ending in -ιομαστε also ends
   * in -ομαστε, so testing the shorter one first would make the longer branch
   * unreachable and leave a stray trailing ι on the stem.
   */
  private int rule11(char s[], int len) {
    if (len > 7 && endsWith(s, len, "ιομαστε")) {
      len -= 7;
      if (len == 2 && endsWith(s, len, "ον")) {
        // the removed text started with ι, so -ομαστ has to be written out
        len += 5;
        s[len - 5] = 'ο';
        s[len - 4] = 'μ';
        s[len - 3] = 'α';
        s[len - 2] = 'σ';
        s[len - 1] = 'τ';
      }
    } else if (len > 6 && endsWith(s, len, "ομαστε")) {
      len -= 6;
      if (len == 2 && endsWith(s, len, "ον")) {
        len += 5; // add back -ομαστ
      }
    }
    return len;
  }

  /** Stems for which rule12 restores -ιεστ. */
  private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("π", "απ", "συμπ", "ασυμπ", "ακαταπ", "αμεταμφ"),
      false);

  /** Stems for which rule12 restores -εστ. */
  private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("αλ", "αρ", "εκτελ", "ζ", "μ", "ξ", "παρακαλ", "αρ", "προ", "νισ"),
      false);
  
  /**
   * Strips -ιεστε (restoring -ιεστ for stems in {@link #exc12a}); then strips a
   * remaining -εστε (restoring -εστ for stems in {@link #exc12b}).
   */
  private int rule12(char s[], int len) {
    if (len > 5 && endsWith(s, len, "ιεστε")) {
      len -= 5;
      if (exc12a.contains(s, 0, len))   
        len += 4; // add back -ιεστ
    }
    
    if (len > 4 && endsWith(s, len, "εστε")) {
      len -= 4;
      if (exc12b.contains(s, 0, len))
        len += 3; // add back -εστ
    }
    
    return len;
  }
  
  /** Stems for which rule13 restores -ηκ. */
  private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("διαθ", "θ", "παρακαταθ", "προσθ", "συνθ"),
      false);
  
  /**
   * Strips -ηθηκεσ / -ηθηκα / -ηθηκε; then strips a remaining -ηκεσ / -ηκα /
   * -ηκε, restoring -ηκ when the stem is in {@link #exc13} or ends with one of
   * the listed exceptions.
   */
  private int rule13(char s[], int len) {
    if (len > 6 && endsWith(s, len, "ηθηκεσ")) {
      len -= 6;
    } else if (len > 5 && (endsWith(s, len, "ηθηκα") || endsWith(s, len, "ηθηκε"))) {
      len -= 5;
    }
    
    boolean removed = false;
    
    if (len > 4 && endsWith(s, len, "ηκεσ")) {
      len -= 4;
      removed = true;
    } else if (len > 3 && (endsWith(s, len, "ηκα") || endsWith(s, len, "ηκε"))) {
      len -= 3;
      removed = true;
    }

    if (removed && (exc13.contains(s, 0, len) 
        || endsWith(s, len, "σκωλ")
        || endsWith(s, len, "σκουλ")
        || endsWith(s, len, "ναρθ")
        || endsWith(s, len, "σφ")
        || endsWith(s, len, "οθ")
        || endsWith(s, len, "πιθ"))) { 
      len += 2; // add back the -ηκ
    }
    
    return len;
  }
  
  /** Stems for which rule14 restores -ουσ. */
  private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("φαρμακ", "χαδ", "αγκ", "αναρρ", "βρομ", "εκλιπ", "λαμπιδ",
          "λεχ", "μ", "πατ", "ρ", "λ", "μεδ", "μεσαζ", "υποτειν", "αμ", "αιθ",
          "ανηκ", "δεσποζ", "ενδιαφερ", "δε", "δευτερευ", "καθαρευ", "πλε",
          "τσα"), 
      false);

  /**
   * Strips -ουσεσ / -ουσα / -ουσε, restoring -ουσ when the stem is in
   * {@link #exc14}, ends with a vowel, or ends with one of the listed
   * exceptions.
   */
  private int rule14(char s[], int len) {
    boolean removed = false;
    
    if (len > 5 && endsWith(s, len, "ουσεσ")) {
      len -= 5;
      removed = true;
    } else if (len > 4 && (endsWith(s, len, "ουσα") || endsWith(s, len, "ουσε"))) {
      len -= 4;
      removed = true;
    }
    
    if (removed && (exc14.contains(s, 0, len) 
        || endsWithVowel(s, len)
        || endsWith(s, len, "ποδαρ")
        || endsWith(s, len, "βλεπ")
        || endsWith(s, len, "πανταχ")
        || endsWith(s, len, "φρυδ") 
        || endsWith(s, len, "μαντιλ")
        || endsWith(s, len, "μαλλ")
        || endsWith(s, len, "κυματ")
        || endsWith(s, len, "λαχ")
        || endsWith(s, len, "ληγ")
        || endsWith(s, len, "φαγ")
        || endsWith(s, len, "ομ")
        || endsWith(s, len, "πρωτ"))) {
      len += 3; // add back -ουσ
    }

   return len;
  }
  
  /** Stems that make rule15 restore -αγ (unless vetoed, see {@link #exc15b}). */
  private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("αβαστ", "πολυφ", "αδηφ", "παμφ", "ρ", "ασπ", "αφ", "αμαλ",
          "αμαλλι", "ανυστ", "απερ", "ασπαρ", "αχαρ", "δερβεν", "δροσοπ",
          "ξεφ", "νεοπ", "νομοτ", "ολοπ", "ομοτ", "προστ", "προσωποπ", "συμπ",
          "συντ", "τ", "υποτ", "χαρ", "αειπ", "αιμοστ", "ανυπ", "αποτ",
          "αρτιπ", "διατ", "εν", "επιτ", "κροκαλοπ", "σιδηροπ", "λ", "ναυ",
          "ουλαμ", "ουρ", "π", "τρ", "μ"), 
      false);
  
  /** Stems that veto the -αγ restoration in rule15. */
  private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("ψοφ", "ναυλοχ"),
      false);
  
  /**
   * Strips -αγεσ / -αγα / -αγε, restoring -αγ when the stem satisfies the first
   * condition set ({@link #exc15a} or a listed ending) and does not satisfy the
   * second ({@link #exc15b} or ending in "κολλ").
   */
  private int rule15(char s[], int len) {
    boolean removed = false;
    if (len > 4 && endsWith(s, len, "αγεσ")) {
      len -= 4;
      removed = true;
    } else if (len > 3 && (endsWith(s, len, "αγα") || endsWith(s, len, "αγε"))) {
      len -= 3;
      removed = true;
    }
    
    if (removed) {
      final boolean cond1 = exc15a.contains(s, 0, len) 
        || endsWith(s, len, "οφ")
        || endsWith(s, len, "πελ")
        || endsWith(s, len, "χορτ")
        || endsWith(s, len, "λλ")
        || endsWith(s, len, "σφ")
        || endsWith(s, len, "ρπ")
        || endsWith(s, len, "φρ")
        || endsWith(s, len, "πρ")
        || endsWith(s, len, "λοχ")
        || endsWith(s, len, "σμην");
      
      final boolean cond2 = exc15b.contains(s, 0, len)
        || endsWith(s, len, "κολλ");
      
      if (cond1 && !cond2)
        len += 2; // add back -αγ  
    }
    
    return len;
  }
  
  /** Stems for which rule16 restores -ησ. */
  private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("ν", "χερσον", "δωδεκαν", "ερημον", "μεγαλον", "επταν"),
      false);
  
  /**
   * Strips -ησου / -ησε / -ησα, restoring -ησ when the stem is in
   * {@link #exc16}.
   */
  private int rule16(char s[], int len) {
    boolean removed = false;
    if (len > 4 && endsWith(s, len, "ησου")) {
      len -= 4;
      removed = true;
    } else if (len > 3 && (endsWith(s, len, "ησε") || endsWith(s, len, "ησα"))) {
      len -= 3;
      removed = true;
    }
    
    if (removed && exc16.contains(s, 0, len))
      len += 2; // add back -ησ
    
    return len;
  }
  
  /** Stems for which rule17 restores -ηστ. */
  private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("ασβ", "σβ", "αχρ", "χρ", "απλ", "αειμν", "δυσχρ", "ευχρ", "κοινοχρ", "παλιμψ"),
      false);
  
  /** Strips -ηστε, restoring -ηστ when the stem is in {@link #exc17}. */
  private int rule17(char s[], int len) {
    if (len > 4 && endsWith(s, len, "ηστε")) {
      len -= 4;
      if (exc17.contains(s, 0, len))
        len += 3; // add back the -ηστ
    }
    
    return len;
  }
  
  /** Stems to which rule18 appends -ουν. */
  private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("ν", "ρ", "σπι", "στραβομουτσ", "κακομουτσ", "εξων"),
      false);
  
  /**
   * Strips -ησουνε / -ηθουνε / -ουνε and, when the stem is in {@link #exc18},
   * writes -ουν after it.
   */
  private int rule18(char s[], int len) {
    boolean removed = false;
    
    if (len > 6 && (endsWith(s, len, "ησουνε") || endsWith(s, len, "ηθουνε"))) {
      len -= 6;
      removed = true;
    } else if (len > 4 && endsWith(s, len, "ουνε")) {
      len -= 4;
      removed = true;
    }
    
    if (removed && exc18.contains(s, 0, len)) {
      // at least 4 chars were removed, so writing 3 stays inside the word
      len += 3;
      s[len - 3] = 'ο';
      s[len - 2] = 'υ';
      s[len - 1] = 'ν';
    }
    return len;
  }
  
  /** Stems to which rule19 appends -ουμ. */
  private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_31,
      Arrays.asList("παρασουσ", "φ", "χ", "ωριοπλ", "αζ", "αλλοσουσ", "ασουσ"),
      false);
  
  /**
   * Strips -ησουμε / -ηθουμε / -ουμε and, when the stem is in {@link #exc19},
   * writes -ουμ after it.
   */
  private int rule19(char s[], int len) {
    boolean removed = false;
    
    if (len > 6 && (endsWith(s, len, "ησουμε") || endsWith(s, len, "ηθουμε"))) {
      len -= 6;
      removed = true;
    } else if (len > 4 && endsWith(s, len, "ουμε")) {
      len -= 4;
      removed = true;
    }
    
    if (removed && exc19.contains(s, 0, len)) {
      // at least 4 chars were removed, so writing 3 stays inside the word
      len += 3;
      s[len - 3] = 'ο';
      s[len - 2] = 'υ';
      s[len - 1] = 'μ';
    }
    return len;
  }
  
  /** Reduces -ματων / -ματοσ / -ματα to -μα by trimming the trailing chars. */
  private int rule20(char s[], int len) {
    if (len > 5 && (endsWith(s, len, "ματων") || endsWith(s, len, "ματοσ")))
      len -= 3;
    else if (len > 4 && endsWith(s, len, "ματα"))
      len -= 2;
    return len;
  }

  /**
   * The "long list": strips the first matching generic suffix, trying longer
   * suffixes before shorter ones, and finally a single trailing vowel.
   */
  private int rule21(char s[], int len) {
    if (len > 9 && endsWith(s, len, "ιοντουσαν"))
      return len - 9;
    
    if (len > 8 && (endsWith(s, len, "ιομασταν") ||
        endsWith(s, len, "ιοσασταν") ||
        endsWith(s, len, "ιουμαστε") ||
        endsWith(s, len, "οντουσαν")))
      return len - 8;
    
    if (len > 7 && (endsWith(s, len, "ιεμαστε") ||
        endsWith(s, len, "ιεσαστε") ||
        endsWith(s, len, "ιομουνα") ||
        endsWith(s, len, "ιοσαστε") ||
        endsWith(s, len, "ιοσουνα") ||
        endsWith(s, len, "ιουνται") ||
        endsWith(s, len, "ιουνταν") ||
        endsWith(s, len, "ηθηκατε") ||
        endsWith(s, len, "ομασταν") ||
        endsWith(s, len, "οσασταν") ||
        endsWith(s, len, "ουμαστε")))
      return len - 7;
    
    if (len > 6 && (endsWith(s, len, "ιομουν") ||
        endsWith(s, len, "ιονταν") ||
        endsWith(s, len, "ιοσουν") ||
        endsWith(s, len, "ηθειτε") ||
        endsWith(s, len, "ηθηκαν") ||
        endsWith(s, len, "ομουνα") ||
        endsWith(s, len, "οσαστε") ||
        endsWith(s, len, "οσουνα") ||
        endsWith(s, len, "ουνται") ||
        endsWith(s, len, "ουνταν") ||
        endsWith(s, len, "ουσατε")))
      return len - 6;
    
    if (len > 5 && (endsWith(s, len, "αγατε") ||
        endsWith(s, len, "ιεμαι") ||
        endsWith(s, len, "ιεται") ||
        endsWith(s, len, "ιεσαι") ||
        endsWith(s, len, "ιοταν") ||
        endsWith(s, len, "ιουμα") ||
        endsWith(s, len, "ηθεισ") ||
        endsWith(s, len, "ηθουν") ||
        endsWith(s, len, "ηκατε") ||
        endsWith(s, len, "ησατε") ||
        endsWith(s, len, "ησουν") ||
        endsWith(s, len, "ομουν") ||
        endsWith(s, len, "ονται") ||
        endsWith(s, len, "ονταν") ||
        endsWith(s, len, "οσουν") ||
        endsWith(s, len, "ουμαι") ||
        endsWith(s, len, "ουσαν")))
      return len - 5;
    
    if (len > 4 && (endsWith(s, len, "αγαν") ||
        endsWith(s, len, "αμαι") ||
        endsWith(s, len, "ασαι") ||
        endsWith(s, len, "αται") ||
        endsWith(s, len, "ειτε") ||
        endsWith(s, len, "εσαι") ||
        endsWith(s, len, "εται") ||
        endsWith(s, len, "ηδεσ") ||
        endsWith(s, len, "ηδων") ||
        endsWith(s, len, "ηθει") ||
        endsWith(s, len, "ηκαν") ||
        endsWith(s, len, "ησαν") ||
        endsWith(s, len, "ησει") ||
        endsWith(s, len, "ησεσ") ||
        endsWith(s, len, "ομαι") ||
        endsWith(s, len, "οταν")))
      return len - 4;
    
    if (len > 3 && (endsWith(s, len, "αει") ||
        endsWith(s, len, "εισ") ||
        endsWith(s, len, "ηθω") ||
        endsWith(s, len, "ησω") ||
        endsWith(s, len, "ουν") ||
        endsWith(s, len, "ουσ")))
      return len - 3;
    
    if (len > 2 && (endsWith(s, len, "αν") ||
        endsWith(s, len, "ασ") ||
        endsWith(s, len, "αω") ||
        endsWith(s, len, "ει") ||
        endsWith(s, len, "εσ") ||
        endsWith(s, len, "ησ") ||
        endsWith(s, len, "οι") ||
        endsWith(s, len, "οσ") ||
        endsWith(s, len, "ου") ||
        endsWith(s, len, "υσ") ||
        endsWith(s, len, "ων")))
      return len - 2;
    
    if (len > 1 && endsWithVowel(s, len))
      return len - 1;

    return len;
  }
  
  /**
   * Strips the comparative / superlative endings -εστερ, -εστατ, -οτερ, -οτατ,
   * -υτερ, -υτατ, -ωτερ, -ωτατ.
   * <p>
   * NOTE(review): there is no minimum-length guard here, so a stem that consists
   * solely of one of these endings is reduced to length 0.
   */
  private int rule22(char s[], int len) {
    if (endsWith(s, len, "εστερ") ||
        endsWith(s, len, "εστατ"))
      return len - 5;
    
    if (endsWith(s, len, "οτερ") ||
        endsWith(s, len, "οτατ") ||
        endsWith(s, len, "υτερ") ||
        endsWith(s, len, "υτατ") ||
        endsWith(s, len, "ωτερ") ||
        endsWith(s, len, "ωτατ"))
      return len - 4;

    return len;
  }

  /**
   * Returns true when the first {@code len} chars of {@code s} end with
   * {@code suffix}; false when the suffix is longer than {@code len}.
   */
  private boolean endsWith(char s[], int len, String suffix) {
    final int suffixLen = suffix.length();
    if (suffixLen > len)
      return false;
    // compare from the last char backwards
    for (int i = suffixLen - 1; i >= 0; i--)
      if (s[len -(suffixLen - i)] != suffix.charAt(i))
        return false;
    
    return true;
  }
  
  /**
   * Returns true when the last of the first {@code len} chars of {@code s} is
   * one of α ε η ι ο υ ω; false for an empty stem.
   */
  private boolean endsWithVowel(char s[], int len) {
    if (len == 0)
      return false;
    switch(s[len - 1]) {
      case 'α':
      case 'ε':
      case 'η':
      case 'ι':
      case 'ο':
      case 'υ':
      case 'ω':
        return true;
      default:
        return false;
    }
  }
  
  /**
   * Same as {@link #endsWithVowel(char[], int)} except that υ is not counted
   * as a vowel.
   */
  private boolean endsWithVowelNoY(char s[], int len) {
    if (len == 0)
      return false;
    switch(s[len - 1]) {
      case 'α':
      case 'ε':
      case 'η':
      case 'ι':
      case 'ο':
      case 'ω':
        return true;
      default:
        return false;
    }
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene GreekStemmer.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.