alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (TestFrenchAnalyzer.java)

This example Lucene source code file (TestFrenchAnalyzer.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

c3po, chararrayset, chat, cheval, cheval, deprecated, exception, exception, frenchanalyzer, frenchanalyzer, io, ioexception, jean-françois, string, string

The Lucene TestFrenchAnalyzer.java source code

package org.apache.lucene.analysis.fr;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;

/**
 * Test case for FrenchAnalyzer.
 *
 * <p>The assertions that yield the same tokens for every analyzer version
 * exercised here live in {@link #checkVersionIndependentAnalysis(FrenchAnalyzer)};
 * the individual test methods only add the expectations that differ between
 * versions.
 *
 * @version   $version$
 */
public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {

  /**
   * Checks tokenization, lowercasing, stopword removal and stemming with an
   * analyzer built for the current test version.
   */
  public void testAnalyzer() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);

    checkVersionIndependentAnalysis(fa);

    // some more everything else
    // with the current version both 1940-1945 and 1940:1945 are split
    // into two separate terms
    assertAnalyzesTo(
        fa,
        "33Bis 1940-1945 1940:1945 (---i+++)*",
        new String[] { "33bis", "1940", "1945", "1940", "1945", "i" });
  }

  /**
   * Same checks as {@link #testAnalyzer()}, but with an analyzer built for
   * {@code Version.LUCENE_30}.
   *
   * @deprecated remove this test for Lucene 4.0
   */
  @Deprecated
  public void testAnalyzer30() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);

    checkVersionIndependentAnalysis(fa);

    // some more everything else
    // here 1940-1945 stays as one term, 1940:1945 does not
    assertAnalyzesTo(
        fa,
        "33Bis 1940-1945 1940:1945 (---i+++)*",
        new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
  }

  /**
   * Runs every assertion whose expected tokens are identical for the
   * analyzer versions used by {@link #testAnalyzer()} and
   * {@link #testAnalyzer30()}.
   *
   * @param fa the analyzer under test
   * @throws Exception if analysis fails
   */
  private void checkVersionIndependentAnalysis(FrenchAnalyzer fa) throws Exception {
    // empty input produces no tokens
    assertAnalyzesTo(fa, "", new String[] {});

    assertAnalyzesTo(
        fa,
        "chien chat cheval",
        new String[] { "chien", "chat", "cheval" });

    // upper-case input comes out lower-cased
    assertAnalyzesTo(
        fa,
        "chien CHAT CHEVAL",
        new String[] { "chien", "chat", "cheval" });

    // punctuation and extra whitespace are dropped
    assertAnalyzesTo(
        fa,
        "  chien  ,? + = -  CHAT /: > CHEVAL",
        new String[] { "chien", "chat", "cheval" });

    assertAnalyzesTo(fa, "chien++", new String[] { "chien" });

    assertAnalyzesTo(
        fa,
        "mot \"entreguillemet\"",
        new String[] { "mot", "entreguillemet" });

    // let's do some french specific tests now

    /* 1. couldn't resist
     I would expect this to stay one term, as in French the minus
     sign is often used for composing words; the analyzer splits it. */
    assertAnalyzesTo(
        fa,
        "Jean-François",
        new String[] { "jean", "françois" });

    // 2. stopwords
    assertAnalyzesTo(
        fa,
        "le la chien les aux chat du des à cheval",
        new String[] { "chien", "chat", "cheval" });

    // some nouns and adjectives
    assertAnalyzesTo(
        fa,
        "lances chismes habitable chiste éléments captifs",
        new String[] {
            "lanc",
            "chism",
            "habit",
            "chist",
            "élément",
            "captif" });

    // some verbs
    assertAnalyzesTo(
        fa,
        "finissions souffrirent rugissante",
        new String[] { "fin", "souffr", "rug" });

    // some everything else
    // aujourd'hui stays one term which is OK
    assertAnalyzesTo(
        fa,
        "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
        new String[] {
            "c3po",
            "aujourd'hui",
            "oeuf",
            "ïâöûàä",
            "anticonstitutionnel",
            "jav" });
  }

  /**
   * Checks stopword removal and stemming through the reusable token stream
   * assertion helper.
   */
  public void testReusableTokenStream() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);

    // stopwords
    assertAnalyzesToReuse(
        fa,
        "le la chien les aux chat du des à cheval",
        new String[] { "chien", "chat", "cheval" });

    // some nouns and adjectives
    assertAnalyzesToReuse(
        fa,
        "lances chismes habitable chiste éléments captifs",
        new String[] {
            "lanc",
            "chism",
            "habit",
            "chist",
            "élément",
            "captif" });
  }

  /**
   * Test that changes to the exclusion table are applied immediately
   * when using reusable token streams.
   */
  public void testExclusionTableReuse() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
    // without an exclusion entry the word is stemmed
    assertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
    fa.setStemExclusionTable(new String[] { "habitable" });
    // once excluded, the same analyzer must leave the word untouched
    assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
  }

  /**
   * Test that a stem exclusion set handed to the constructor is honoured,
   * both with and without token stream reuse.
   */
  public void testExclusionTableViaCtor() throws Exception {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("habitable");

    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT,
        CharArraySet.EMPTY_SET, set);
    assertAnalyzesToReuse(
        fa,
        "habitable chiste",
        new String[] { "habitable", "chist" });

    fa = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
    assertAnalyzesTo(
        fa,
        "habitable chiste",
        new String[] { "habitable", "chist" });
  }

  /**
   * Test that the elided article in "l'embrouille" is removed and the
   * remaining word is stemmed.
   */
  public void testElision() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
  }

  /**
   * Prior to 3.1, this analyzer had no lowercase filter.
   * stopwords were case sensitive. Preserve this for back compat.
   * @deprecated Remove this test in Lucene 4.0
   */
  @Deprecated
  public void testBuggyStopwordsCasing() throws IOException {
    FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
    assertAnalyzesTo(a, "Votre", new String[] { "votr" });
  }

  /**
   * Test that stopwords are not case sensitive
   */
  public void testStopwordsCasing() throws IOException {
    FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
    assertAnalyzesTo(a, "Votre", new String[] {});
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random, new FrenchAnalyzer(TEST_VERSION_CURRENT), 10000 * RANDOM_MULTIPLIER);
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene TestFrenchAnalyzer.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.