alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (TestClassicAnalyzer.java)

This example Lucene source code file (TestClassicAnalyzer.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

alphanum, c, classicanalyzer, classicanalyzer, document, exception, exception, host, host, indexwriter, io, num, string, string, term, util

The Lucene TestClassicAnalyzer.java source code

package org.apache.lucene.analysis;

import org.apache.lucene.analysis.standard.ClassicAnalyzer;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.util.Arrays;


/**
 * Copyright 2004 The Apache Software Foundation
 * <p/>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class TestClassicAnalyzer extends BaseTokenStreamTestCase {

  private Analyzer a = new ClassicAnalyzer(TEST_VERSION_CURRENT);

  public void testMaxTermLength() throws Exception {
    ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
    sa.setMaxTokenLength(5);
    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
  }

  public void testMaxTermLength2() throws Exception {
    ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
    sa.setMaxTokenLength(5);
    
    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, new int[]{1, 1, 2, 1});
  }

  public void testMaxTermLength3() throws Exception {
    char[] chars = new char[255];
    for(int i=0;i<255;i++)
      chars[i] = 'a';
    String longTerm = new String(chars, 0, 255);
    
    assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
    assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
  }

  public void testAlphanumeric() throws Exception {
    // alphanumeric tokens
    assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
    assertAnalyzesTo(a, "2B", new String[]{"2b"});
  }

  public void testUnderscores() throws Exception {
    // underscores are delimiters, but not in email addresses (below)
    assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
    assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
  }

  public void testDelimiters() throws Exception {
    // other delimiters: "-", "/", ","
    assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
    assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
    assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
  }

  public void testApostrophes() throws Exception {
    // internal apostrophes: O'Reilly, you're, O'Reilly's
    // possessives are actually removed by StardardFilter, not the tokenizer
    assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
    assertAnalyzesTo(a, "you're", new String[]{"you're"});
    assertAnalyzesTo(a, "she's", new String[]{"she"});
    assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
    assertAnalyzesTo(a, "don't", new String[]{"don't"});
    assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
  }

  public void testTSADash() throws Exception {
    // t and s had been stopwords in Lucene <= 2.0, which made it impossible
    // to correctly search for these terms:
    assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
    assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
    // 'a' is still a stopword:
    assertAnalyzesTo(a, "a-class", new String[]{"class"});
  }

  public void testCompanyNames() throws Exception {
    // company names
    assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
    assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
  }

  public void testLucene1140() throws Exception {
    try {
      ClassicAnalyzer analyzer = new ClassicAnalyzer(TEST_VERSION_CURRENT);
      assertAnalyzesTo(analyzer, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
    } catch (NullPointerException e) {
      fail("Should not throw an NPE and it did");
    }

  }

  public void testDomainNames() throws Exception {
    // Current lucene should not show the bug
    ClassicAnalyzer a2 = new ClassicAnalyzer(TEST_VERSION_CURRENT);

    // domain names
    assertAnalyzesTo(a2, "www.nutch.org", new String[]{"www.nutch.org"});
    //Notice the trailing .  See https://issues.apache.org/jira/browse/LUCENE-1068.
    // the following should be recognized as HOST:
    assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });

    // 2.3 should show the bug
    a2 = new ClassicAnalyzer(org.apache.lucene.util.Version.LUCENE_23);
    assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });

    // 2.4 should not show the bug
    a2 = new ClassicAnalyzer(Version.LUCENE_24);
    assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
  }

  public void testEMailAddresses() throws Exception {
    // email addresses, possibly with underscores, periods, etc
    assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
    assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
    assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
  }

  public void testNumeric() throws Exception {
    // floating point, serial, model numbers, ip addresses, etc.
    // every other segment must have at least one digit
    assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
    assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
    assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
    assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
    assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
  }

  public void testTextWithNumbers() throws Exception {
    // numbers
    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
  }

  public void testVariousText() throws Exception {
    // various
    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
  }

  public void testAcronyms() throws Exception {
    // acronyms have their dots stripped
    assertAnalyzesTo(a, "U.S.A.", new String[]{"usa"});
  }

  public void testCPlusPlusHash() throws Exception {
    // It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
    assertAnalyzesTo(a, "C++", new String[]{"c"});
    assertAnalyzesTo(a, "C#", new String[]{"c"});
  }

  public void testKorean() throws Exception {
    // Korean words
    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
  }

  // Compliance with the "old" JavaCC-based analyzer, see:
  // https://issues.apache.org/jira/browse/LUCENE-966#action_12516752

  public void testComplianceFileName() throws Exception {
    assertAnalyzesTo(a, "2004.jpg",
            new String[]{"2004.jpg"},
            new String[]{"<HOST>"});
  }

  public void testComplianceNumericIncorrect() throws Exception {
    assertAnalyzesTo(a, "62.46",
            new String[]{"62.46"},
            new String[]{"<HOST>"});
  }

  public void testComplianceNumericLong() throws Exception {
    assertAnalyzesTo(a, "978-0-94045043-1",
            new String[]{"978-0-94045043-1"},
            new String[]{"<NUM>"});
  }

  public void testComplianceNumericFile() throws Exception {
    assertAnalyzesTo(
            a,
            "78academyawards/rules/rule02.html",
            new String[]{"78academyawards/rules/rule02.html"},
            new String[]{"<NUM>"});
  }

  public void testComplianceNumericWithUnderscores() throws Exception {
    assertAnalyzesTo(
            a,
            "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs",
            new String[]{"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs"},
            new String[]{"<NUM>"});
  }

  public void testComplianceNumericWithDash() throws Exception {
    assertAnalyzesTo(a, "mid-20th", new String[]{"mid-20th"},
            new String[]{"<NUM>"});
  }

  public void testComplianceManyTokens() throws Exception {
    assertAnalyzesTo(
            a,
            "/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm "
                    + "safari-0-sheikh-zayed-grand-mosque.jpg",
            new String[]{"money.cnn.com", "magazines", "fortune",
                    "fortune", "archive/2007/03/19/8402357", "index.htm",
                    "safari-0-sheikh", "zayed", "grand", "mosque.jpg"},
            new String[]{"<HOST>", "", "",
                    "<ALPHANUM>", "", "", "", "",
                    "<ALPHANUM>", ""});
  }

  public void testJava14BWCompatibility() throws Exception {
    ClassicAnalyzer sa = new ClassicAnalyzer(Version.LUCENE_30);
    assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test", "test" });
  }

  /**
   * Make sure we skip wicked long terms.
   */
  public void testWickedLongTerm() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
      TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT)));

    char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
    Arrays.fill(chars, 'x');
    Document doc = new Document();
    final String bigTerm = new String(chars);

    // This produces a too-long term:
    String contents = "abc xyz x" + bigTerm + " another term";
    doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED));
    writer.addDocument(doc);

    // Make sure we can add another normal document
    doc = new Document();
    doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.ANALYZED));
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = IndexReader.open(dir, true);

    // Make sure all terms < max size were indexed
    assertEquals(2, reader.docFreq(new Term("content", "abc")));
    assertEquals(1, reader.docFreq(new Term("content", "bbb")));
    assertEquals(1, reader.docFreq(new Term("content", "term")));
    assertEquals(1, reader.docFreq(new Term("content", "another")));

    // Make sure position is still incremented when
    // massive term is skipped:
    TermPositions tps = reader.termPositions(new Term("content", "another"));
    assertTrue(tps.next());
    assertEquals(1, tps.freq());
    assertEquals(3, tps.nextPosition());

    // Make sure the doc that has the massive term is in
    // the index:
    assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());

    reader.close();

    // Make sure we can add a document with exactly the
    // maximum length term, and search on that term:
    doc = new Document();
    doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.ANALYZED));
    ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
    sa.setMaxTokenLength(100000);
    writer  = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
    writer.addDocument(doc);
    writer.close();
    reader = IndexReader.open(dir, true);
    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
    reader.close();

    dir.close();
  }
  
  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
  }
}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene TestClassicAnalyzer.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2024 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.