alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (WikipediaTokenizerImpl.jflex)

This example Lucene source code file (WikipediaTokenizerImpl.jflex) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

alpha, alphanum, alphanum, category, digit, double_bracket, external_link, external_link, has_digit, p, p, whitespace, whitespace, yyinitial

The Lucene WikipediaTokenizerImpl.jflex source code

package org.apache.lucene.analysis.wikipedia;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

%%

%class WikipediaTokenizerImpl
%unicode 3.0
%integer
%function getNextToken
%pack
%char

%{

public static final int ALPHANUM          = WikipediaTokenizer.ALPHANUM_ID;
public static final int APOSTROPHE        = WikipediaTokenizer.APOSTROPHE_ID;
public static final int ACRONYM           = WikipediaTokenizer.ACRONYM_ID;
public static final int COMPANY           = WikipediaTokenizer.COMPANY_ID;
public static final int EMAIL             = WikipediaTokenizer.EMAIL_ID;
public static final int HOST              = WikipediaTokenizer.HOST_ID;
public static final int NUM               = WikipediaTokenizer.NUM_ID;
public static final int CJ                = WikipediaTokenizer.CJ_ID;
public static final int INTERNAL_LINK     = WikipediaTokenizer.INTERNAL_LINK_ID;
public static final int EXTERNAL_LINK     = WikipediaTokenizer.EXTERNAL_LINK_ID;
public static final int CITATION          = WikipediaTokenizer.CITATION_ID;
public static final int CATEGORY          = WikipediaTokenizer.CATEGORY_ID;
public static final int BOLD              = WikipediaTokenizer.BOLD_ID;
public static final int ITALICS           = WikipediaTokenizer.ITALICS_ID;
public static final int BOLD_ITALICS      = WikipediaTokenizer.BOLD_ITALICS_ID;
public static final int HEADING           = WikipediaTokenizer.HEADING_ID;
public static final int SUB_HEADING       = WikipediaTokenizer.SUB_HEADING_ID;
public static final int EXTERNAL_LINK_URL = WikipediaTokenizer.EXTERNAL_LINK_URL_ID;


private int currentTokType;
private int numBalanced = 0;
private int positionInc = 1;
private int numLinkToks = 0;
//Anytime we start a new on a Wiki reserved token (category, link, etc.) this value will be 0, otherwise it will be the number of tokens seen
//this can be useful for detecting when a new reserved token is encountered
//see https://issues.apache.org/jira/browse/LUCENE-1133
private int numWikiTokensSeen = 0;

public static final String [] TOKEN_TYPES = WikipediaTokenizer.TOKEN_TYPES;

/**
Returns the number of tokens seen inside a category or link, etc.
@return the number of tokens seen inside the context of wiki syntax.
**/
public final int getNumWikiTokensSeen(){
  return numWikiTokensSeen;
}

public final int yychar()
{
    return yychar;
}

public final int getPositionIncrement(){
  return positionInc;
}

/**
 * Fills Lucene token with the current token text.
 */
final void getText(CharTermAttribute t) {
  t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}

final int setText(StringBuilder buffer){
  int length = zzMarkedPos - zzStartRead;
  buffer.append(zzBuffer, zzStartRead, length);
  return length;
}


%}

// basic word: a sequence of digits & letters
ALPHANUM   = ({LETTER}|{DIGIT}|{KOREAN})+

// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possesives
APOSTROPHE =  {ALPHA} ("'" {ALPHA})+

// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
ACRONYM    =  {ALPHA} "." ({ALPHA} ".")+

// company names like AT&T and Excite@Home.
COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}

// email addresses
EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+

// hostname
HOST       =  {ALPHANUM} ((".") {ALPHANUM})+

// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
           | {DIGIT}+ {P} {DIGIT}+
           | {HAS_DIGIT} {P} {ALPHANUM}
           | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
           | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
           | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
           | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)

TAGS = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">"

// punctuation
P	         = ("_"|"-"|"/"|"."|",")

// at least one digit
HAS_DIGIT  =
    ({LETTER}|{DIGIT})*
    {DIGIT}
    ({LETTER}|{DIGIT})*

ALPHA      = ({LETTER})+


LETTER     = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]

DIGIT      = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]

KOREAN     = [\uac00-\ud7af\u1100-\u11ff]

// Chinese, Japanese
CJ         = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]

WHITESPACE = \r\n | [ \r\n\t\f]

//Wikipedia
DOUBLE_BRACKET = "["{2}
DOUBLE_BRACKET_CLOSE = "]"{2}
DOUBLE_BRACKET_CAT = "["{2}":"?"Category:"
EXTERNAL_LINK = "["
TWO_SINGLE_QUOTES = "'"{2}
CITATION = "<ref>"
CITATION_CLOSE = "</ref>"
INFOBOX = {DOUBLE_BRACE}("I"|"i")nfobox_

DOUBLE_BRACE = "{"{2}
DOUBLE_BRACE_CLOSE = "}"{2}
PIPE = "|"
DOUBLE_EQUALS = "="{2}


%state CATEGORY_STATE
%state INTERNAL_LINK_STATE
%state EXTERNAL_LINK_STATE

%state TWO_SINGLE_QUOTES_STATE
%state THREE_SINGLE_QUOTES_STATE
%state FIVE_SINGLE_QUOTES_STATE
%state DOUBLE_EQUALS_STATE
%state DOUBLE_BRACE_STATE
%state STRING

%%

<YYINITIAL>{ALPHANUM}                                                     {positionInc = 1; return ALPHANUM; }
<YYINITIAL>{APOSTROPHE}                                                   {positionInc = 1; return APOSTROPHE; }
<YYINITIAL>{ACRONYM}                                                      {positionInc = 1; return ACRONYM; }
<YYINITIAL>{COMPANY}                                                      {positionInc = 1; return COMPANY; }
<YYINITIAL>{EMAIL}                                                        {positionInc = 1; return EMAIL; }
<YYINITIAL>{NUM}                                                          {positionInc = 1; return NUM; }
<YYINITIAL>{HOST}                                                         {positionInc = 1; return HOST; }
<YYINITIAL>{CJ}                                                           {positionInc = 1; return CJ; }

//wikipedia
<YYINITIAL>{
  //First {ALPHANUM} is always the link, set positioninc to 1 for double bracket, but then inside the internal link state
  //set it to 0 for the next token, such that the link and the first token are in the same position, but then subsequent
  //tokens within the link are incremented
  {DOUBLE_BRACKET} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
  {DOUBLE_BRACKET_CAT} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);}
  {EXTERNAL_LINK} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);}
  {TWO_SINGLE_QUOTES} {numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}}
  {DOUBLE_EQUALS} {numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);}
  {DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
  {CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
//ignore
  . | {WHITESPACE} |{INFOBOX}                                               {numWikiTokensSeen = 0;  positionInc = 1; }
}

<INTERNAL_LINK_STATE>{
//First {ALPHANUM} is always the link, set position to 0 for these
//This is slightly different from EXTERNAL_LINK_STATE because that one has an explicit grammar for capturing the URL
  {ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;}
  {DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL);}
  //ignore
  . | {WHITESPACE}                                               { positionInc = 1; }
}

<EXTERNAL_LINK_STATE>{
//increment the link token, but then don't increment the tokens after that which are still in the link
  ("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
  {ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;}
  "]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);}
  {WHITESPACE}                                               { positionInc = 1; }
}

<CATEGORY_STATE>{
  {ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;}
  {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
  //ignore
  . | {WHITESPACE}                                               { positionInc = 1; }
}
//italics
<TWO_SINGLE_QUOTES_STATE>{
  "'" {currentTokType = BOLD;  yybegin(THREE_SINGLE_QUOTES_STATE);}
   "'''" {currentTokType = BOLD_ITALICS;  yybegin(FIVE_SINGLE_QUOTES_STATE);}
   {ALPHANUM} {currentTokType = ITALICS; numWikiTokensSeen++;  yybegin(STRING); return currentTokType;/*italics*/}
   //we can have links inside, let those override
   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);}
   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}

   //ignore
  . | {WHITESPACE}                                               { /* ignore */ }
}
//bold
<THREE_SINGLE_QUOTES_STATE>{
  {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
  //we can have links inside, let those override
   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);}
   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}

   //ignore
  . | {WHITESPACE}                                               { /* ignore */ }

}
//bold italics
<FIVE_SINGLE_QUOTES_STATE>{
  {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
  //we can have links inside, let those override
   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0;  yybegin(INTERNAL_LINK_STATE);}
   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}

   //ignore
  . | {WHITESPACE}                                               { /* ignore */ }
}

<DOUBLE_EQUALS_STATE>{
 "=" {currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);}
 {ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;}
 {DOUBLE_EQUALS} {yybegin(YYINITIAL);}
  //ignore
  . | {WHITESPACE}                                               { /* ignore */ }
}

<DOUBLE_BRACE_STATE>{
  {ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;}
  {DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL);}
  {CITATION_CLOSE} {yybegin(YYINITIAL);}
   //ignore
  . | {WHITESPACE}                                               { /* ignore */ }
}

<STRING> {
  "'''''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/}
  "'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/}
  "''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/}
  "===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/}
  {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/}
  //we can have links inside, let those override
   {DOUBLE_BRACKET} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
   {DOUBLE_BRACKET_CAT} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
   {EXTERNAL_LINK} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}


  {PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}

  .|{WHITESPACE}                                              { /* ignore STRING */ }
}




/*
{INTERNAL_LINK}                                                { return curentTokType; }

{CITATION}                                                { return currentTokType; }
{CATEGORY}                                                { return currentTokType; }

{BOLD}                                                { return currentTokType; }
{ITALICS}                                                { return currentTokType; }
{BOLD_ITALICS}                                                { return currentTokType; }
{HEADING}                                                { return currentTokType; }
{SUB_HEADING}                                                { return currentTokType; }

*/
//end wikipedia

/** Ignore the rest */
. | {WHITESPACE}|{TAGS}                                                { /* ignore */ }


//INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}
//EXTERNAL_LINK = "["http://"{HOST}.*?"]"
//CITATION = "{"{2}({ALPHANUM}+{WHITESPACE}*)+"}"{2}
//CATEGORY = "["{2}"Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
//CATEGORY_COLON = "["{2}":Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
//BOLD = '''({ALPHANUM}+{WHITESPACE}*)+'''
//ITALICS = ''({ALPHANUM}+{WHITESPACE}*)+''
//BOLD_ITALICS = '''''({ALPHANUM}+{WHITESPACE}*)+'''''
//HEADING = "="{2}({ALPHANUM}+{WHITESPACE}*)+"="{2}
//SUB_HEADING ="="{3}({ALPHANUM}+{WHITESPACE}*)+"="{3}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene WikipediaTokenizerImpl.jflex source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.