alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (build.xml)

This example Lucene source code file (build.xml) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Lucene tags/keywords

benchmarking, contributions, directory, jira, lucene, run

The Lucene build.xml source code

<?xml version="1.0"?>
<project name="benchmark" default="default">

    <description>
        Lucene Benchmarking Contributions
    </description>

    <import file="../contrib-build.xml"/>
    <!-- Directory that the expand-* targets unpack into and that benchmark output files are written to. -->
    <property location="work" name="working.dir"/>

    <!-- The tests have some problems when run in parallel, so tests.threadspercpu is set to 0. -->
    <property value="0" name="tests.threadspercpu"/>

    <!--
      Up-to-date checks for the contrib modules this build uses. Each call names a flag
      property and a jar-path property (contrib-uptodate is not defined in this file;
      presumably it comes from the imported contrib-build.xml, confirm its semantics there).
    -->
    <contrib-uptodate classpath.property="highlighter.jar"
                      property="highlighter.uptodate"
                      name="highlighter"/>
    <contrib-uptodate classpath.property="icu.jar"
                      property="icu.uptodate"
                      name="icu"/>
    <!-- analyzers/common needs its jar file spelled out explicitly (hack). -->
    <contrib-uptodate classpath.property="analyzers-common.jar"
                      property="analyzers-common.uptodate"
                      jarfile="${common.dir}/build/contrib/analyzers/common/lucene-analyzers-${version}.jar"
                      name="analyzers/common"/>
    <contrib-uptodate classpath.property="memory.jar"
                      property="memory.uptodate"
                      name="memory"/>

    <!--
      Probes the file system and records which downloaded archives and unpacked data
      sets are already present. Each <available> sets its property only when the named
      file or directory exists; the get-*, expand-* and extract-* targets in this file
      use those properties as "unless" guards so existing data is not fetched or
      unpacked again.
    -->
    <target name="check-files">
        <!-- 20 Newsgroups (news20) -->
        <available file="temp/news20.tar.gz" property="news20.exists"/>
        <available file="${working.dir}/20_newsgroup" property="news20.expanded"/>

        <!-- Reuters-21578 -->
        <available file="temp/reuters21578.tar.gz" property="reuters.exists"/>
        <available file="${working.dir}/reuters" property="reuters.expanded"/>
        <available file="${working.dir}/reuters-out" property="reuters.extracted"/>

        <!-- 20news-18828 -->
        <available file="temp/20news-18828.tar.gz" property="20news-18828.exists"/>
        <available file="${working.dir}/20news-18828" property="20news-18828.expanded"/>

        <!-- Mini newsgroups: get-mini-news is guarded by mini.exists, so the archive must be probed too. -->
        <available file="temp/mini_newsgroups.tar.gz" property="mini.exists"/>
        <available file="${working.dir}/mini_newsgroups" property="mini.expanded"/>

        <!-- English Wikipedia dump -->
        <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
        <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
        <available file="${working.dir}/enwiki.txt" property="enwiki.extracted"/>

        <!-- Top-100k word lists; the archive file name is a project-level property declared further down. -->
        <available file="temp/${top.100k.words.archive.filename}"
                   property="top.100k.words.archive.present"/>
        <available file="${working.dir}/top100k-out"
                   property="top.100k.word.files.expanded"/>
    </target>

    <!--
      Makes sure the Wikipedia dump is available locally: creates temp/, then
      delegates to get-enwiki (download) and expand-enwiki (decompress), in that order.
    -->
    <target depends="check-files" name="enwiki-files">
        <mkdir dir="temp"/>

        <antcall target="get-enwiki"/>

        <antcall target="expand-enwiki"/>
    </target>

    <!-- Downloads the compressed Wikipedia dump into temp/; skipped when enwiki.exists is set. -->
    <target unless="enwiki.exists" name="get-enwiki">
        <get dest="temp/enwiki-20070527-pages-articles.xml.bz2"
             src="http://people.apache.org/~gsingers/wikipedia/enwiki-20070527-pages-articles.xml.bz2"/>
    </target>

    <!-- Decompresses the bzip2 Wikipedia dump into temp/; skipped when enwiki.expanded is set. -->
    <target unless="enwiki.expanded" name="expand-enwiki">
        <bunzip2 dest="temp"
                 src="temp/enwiki-20070527-pages-articles.xml.bz2"/>
    </target>

    <!--
      Downloads the news20 archive to temp/news20.tar.gz.
      Guarded by news20.exists, the property check-files sets for that exact file.
      (The guard previously named 20news-18828.exists, which tracks a different archive,
      so this download was skipped or repeated based on the wrong file.)
    -->
    <target name="get-news-20" unless="news20.exists">
        <get src="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz"
             dest="temp/news20.tar.gz"/>
    </target>
    <!-- Downloads the Reuters-21578 archive into temp/; skipped when reuters.exists is set. -->
    <target unless="reuters.exists" name="get-reuters">
        <get dest="temp/reuters21578.tar.gz"
             src="http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz"/>
    </target>

    <!--
      Unpacks the news20 archive in two steps: gunzip to temp/news20.tar, then untar
      into the working directory. Skipped when news20.expanded is set.
    -->
    <target unless="news20.expanded" name="expand-news-20">
        <gunzip dest="temp" src="temp/news20.tar.gz"/>
        <untar dest="${working.dir}" src="temp/news20.tar"/>
    </target>
    <!--
      Unpacks the Reuters archive into ${working.dir}/reuters (gunzip, then untar) and
      afterwards deletes the *.txt files sitting directly in that directory.
      Skipped when reuters.expanded is set.
    -->
    <target unless="reuters.expanded" name="expand-reuters">
        <gunzip dest="temp" src="temp/reuters21578.tar.gz"/>

        <mkdir dir="${working.dir}/reuters"/>
        <untar dest="${working.dir}/reuters" src="temp/reuters21578.tar"/>

        <!-- The pattern has no directory part, so only top-level .txt files are removed. -->
        <delete>
            <fileset dir="${working.dir}/reuters" includes="*.txt"/>
        </delete>
    </target>
    <!--
      Runs the ExtractReuters utility in a forked JVM (max heap 1024M), passing the
      unpacked Reuters directory and the reuters-out directory as its two arguments.
      Skipped when reuters.extracted is set.
    -->
    <target depends="check-files" unless="reuters.extracted" name="extract-reuters">
        <java fork="true"
              maxmemory="1024M"
              classname="org.apache.lucene.benchmark.utils.ExtractReuters">
            <classpath refid="run.classpath"/>
            <arg file="${working.dir}/reuters"/>
            <arg file="${working.dir}/reuters-out"/>
        </java>
    </target>
    <!-- Downloads the 20news-18828 archive into temp/; skipped when 20news-18828.exists is set. -->
    <target unless="20news-18828.exists" name="get-20news-18828">
        <get dest="temp/20news-18828.tar.gz"
             src="http://people.csail.mit.edu/u/j/jrennie/public_html/20Newsgroups/20news-18828.tar.gz"/>
    </target>
    <!--
      Unpacks the 20news-18828 archive: gunzip to temp/20news-18828.tar, then untar
      into the working directory. Skipped when 20news-18828.expanded is set.
    -->
    <target unless="20news-18828.expanded" name="expand-20news-18828">
        <gunzip dest="temp" src="temp/20news-18828.tar.gz"/>
        <untar dest="${working.dir}" src="temp/20news-18828.tar"/>
    </target>
    <!-- Downloads the mini newsgroups archive into temp/; skipped when mini.exists is set. -->
    <target unless="mini.exists" name="get-mini-news">
        <get dest="temp/mini_newsgroups.tar.gz"
             src="http://kdd.ics.uci.edu/databases/20newsgroups/mini_newsgroups.tar.gz"/>
    </target>
    <!--
      Unpacks the mini newsgroups archive: gunzip to temp/mini_newsgroups.tar, then
      untar into the working directory. Skipped when mini.expanded is set.
    -->
    <target unless="mini.expanded" name="expand-mini-news">
        <gunzip dest="temp" src="temp/mini_newsgroups.tar.gz"/>
        <untar dest="${working.dir}" src="temp/mini_newsgroups.tar"/>
    </target>

	<!-- File name of the top-100k word-list archive, and the base URL it is fetched from. -->
	<property value="top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"
	          name="top.100k.words.archive.filename"/>
	<property value="http://people.apache.org/~rmuir/wikipedia"
	          name="top.100k.words.archive.base.url"/>
	<!--
	  Creates temp/ and downloads the top-100k word-list archive into it.
	  Skipped when top.100k.words.archive.present is set.
	-->
	<target unless="top.100k.words.archive.present" name="get-top-100k-words-archive">
		<mkdir dir="temp"/>
		<get dest="temp/${top.100k.words.archive.filename}"
		     src="${top.100k.words.archive.base.url}/${top.100k.words.archive.filename}"/>
	</target>
	<!--
	  Untars the bzip2-compressed word-list archive straight into
	  ${working.dir}/top100k-out, overwriting existing files.
	  Skipped when top.100k.word.files.expanded is set.
	-->
	<target unless="top.100k.word.files.expanded" name="expand-top-100k-word-files">
		<mkdir dir="${working.dir}/top100k-out"/>
		<untar dest="${working.dir}/top100k-out"
		       compression="bzip2"
		       overwrite="true"
		       src="temp/${top.100k.words.archive.filename}"/>
	</target>
	
	<!--
	  Makes the top-100k word files available: creates the working directory, then
	  delegates to the download target and the expand target, in that order.
	-->
	<target depends="check-files" name="top-100k-wiki-word-files">
		<mkdir dir="${working.dir}"/>

		<antcall target="get-top-100k-words-archive"/>

		<antcall target="expand-top-100k-word-files"/>
	</target>
	
    <!--
      Prepares the Reuters data set: creates temp/, then runs the download, unpack and
      extract targets one after another.
    -->
    <target depends="check-files" name="get-files">
        <mkdir dir="temp"/>

        <antcall target="get-reuters"/>

        <antcall target="expand-reuters"/>

        <antcall target="extract-reuters"/>
    </target>

    <!--
      Compile classpath: the memory, highlighter and analyzers-common jars, then
      base.classpath (not defined in this file), then every jar found under lib/.
    -->
    <path id="classpath">
        <pathelement path="${memory.jar}"/>
        <pathelement path="${highlighter.jar}"/>
        <pathelement path="${analyzers-common.jar}"/>
        <path refid="base.classpath"/>
        <fileset dir="lib" includes="**/*.jar"/>
    </path>
    <!--
      Runtime classpath: the compile classpath, this module's compiled classes, and
      whatever extra entries are supplied through benchmark.ext.classpath.
    -->
    <path id="run.classpath">
        <path refid="classpath"/>

        <pathelement location="${build.dir}/classes/java"/>

        <pathelement path="${benchmark.ext.classpath}"/>
    </path>

    <!-- Defaults for run-task; override on the command line with -Dtask.alg=... and -Dtask.mem=... -->
    <property name="task.alg" location="conf/micro-standard.alg"/>
    <property name="task.mem" value="140M"/>

    <!--
      Runs the byTask Benchmark driver in a forked JVM against the algorithm file named
      by task.alg, with the maximum heap taken from task.mem. The working directory is
      echoed first.
      Fix: the <echo> element was never closed, which made the build file malformed.
    -->
    <target name="run-task" depends="compile,check-files,get-files" 
     description="Run compound penalty perf test (optional: -Dtask.alg=your-algorithm-file -Dtask.mem=java-max-mem)">
        <echo>Working Directory: ${working.dir}</echo>
        <java classname="org.apache.lucene.benchmark.byTask.Benchmark" maxmemory="${task.mem}" fork="true">
            <classpath refid="run.classpath"/>
            <arg file="${task.alg}"/>
        </java>
    </target>

    <!--
      Runs the byTask Benchmark driver on conf/extractWikipedia.alg in a forked JVM
      (max heap 1024M) with Java assertions enabled, after enwiki-files has made the
      dump available. The working directory is echoed first.
      Fix: the <echo> element was never closed, which made the build file malformed.
    -->
    <target name="enwiki" depends="compile,check-files,enwiki-files">
        <echo>Working Directory: ${working.dir}</echo>
        <java classname="org.apache.lucene.benchmark.byTask.Benchmark" maxmemory="1024M" fork="true">
            <assertions>
              <enable/>
            </assertions>
            <classpath refid="run.classpath"/>
            <arg file="conf/extractWikipedia.alg"/>
        </java>
    </target>

	<!-- Inputs and outputs of the collation benchmark: algorithm file, raw output, JIRA-formatted output. -->
	<property location="conf/collation.alg"
	          name="collation.alg.file"/>
	<property value="${working.dir}/collation.benchmark.output.txt"
	          name="collation.output.file"/>
	<property value="${working.dir}/collation.bm2jira.output.txt"
	          name="collation.jira.output.file"/>

	<!-- Runtime classpath for the collation benchmark: run.classpath plus the icu contrib jar and the icu4j jars. -->
	<path id="collation.runtime.classpath">
		<path refid="run.classpath"/>
		<pathelement path="${icu.jar}"/>
		<fileset dir="${common.dir}/contrib/icu/lib">
			<include name="icu4j*.jar"/>
		</fileset>
	</path>
	
	<!--
	  Runs the collation benchmark and converts its output for JIRA:
	    1. forks the byTask Benchmark driver on collation.alg.file, capturing its output
	       in collation.output.file;
	    2. runs scripts/collation.bm2jira.pl with perl on that output, capturing the
	       result in collation.jira.output.file (the build fails if perl fails).
	  Fix: none of the four <echo> elements were closed, which made the build file malformed.
	-->
	<target name="collation" depends="compile,compile-icu,top-100k-wiki-word-files">
		<echo>Running contrib/benchmark with alg file: ${collation.alg.file}</echo>
		<java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark"
		      maxmemory="${task.mem}" output="${collation.output.file}">
			<classpath refid="collation.runtime.classpath"/>
			<arg file="${collation.alg.file}"/>
		</java>
		<echo>Benchmark output is in file: ${collation.output.file}</echo>
		<echo>Converting to JIRA table format...</echo>
		<exec executable="perl" output="${collation.jira.output.file}" failonerror="true">
			<arg value="scripts/collation.bm2jira.pl"/>
			<arg value="${collation.output.file}"/>
		</exec>
		<echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
	</target>
	
    <!-- Inputs and outputs of the shingle benchmark: algorithm file, raw output, JIRA-formatted output. -->
    <property location="conf/shingle.alg"
              name="shingle.alg.file"/>
    <property value="${working.dir}/shingle.benchmark.output.txt"
              name="shingle.output.file"/>
    <property value="${working.dir}/shingle.bm2jira.output.txt"
              name="shingle.jira.output.file"/>

    <!-- Runtime classpath declared for the shingle benchmark; it simply wraps run.classpath. -->
    <path id="shingle.runtime.classpath">
        <path refid="run.classpath"/>
    </path>
	
    <!--
      Runs the shingle benchmark and converts its output for JIRA:
        1. forks the byTask Benchmark driver on shingle.alg.file, capturing its output
           in shingle.output.file;
        2. runs scripts/shingle.bm2jira.pl with perl on that output, capturing the
           result in shingle.jira.output.file (the build fails if perl fails).
      Fix: none of the four <echo> elements were closed, which made the build file malformed.
    -->
    <target name="shingle" depends="compile,get-files">
      <echo>Running contrib/benchmark with alg file: ${shingle.alg.file}</echo>
      <java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark" 
            maxmemory="${task.mem}" output="${shingle.output.file}">
        <classpath refid="run.classpath"/>
        <arg file="${shingle.alg.file}"/>
      </java>
      <echo>Benchmark output is in file: ${shingle.output.file}</echo>
      <echo>Converting to JIRA table format...</echo>
      <exec executable="perl" output="${shingle.jira.output.file}" failonerror="true">
        <arg value="scripts/shingle.bm2jira.pl"/>
        <arg value="${shingle.output.file}"/>
      </exec>
      <echo>Benchmark output in JIRA table format is in file: ${shingle.jira.output.file}</echo>
    </target>

    <!-- Builds the highlighter contrib by running its build.xml default target; skipped when highlighter.uptodate is set. -->
    <target unless="highlighter.uptodate" name="compile-highlighter">
      <subant target="default">
        <fileset dir="${common.dir}/contrib/highlighter">
          <include name="build.xml"/>
        </fileset>
      </subant>
    </target>
    <!-- Builds the icu contrib by running its build.xml default target; skipped when icu.uptodate is set. -->
    <target unless="icu.uptodate" name="compile-icu">
      <subant target="default">
        <fileset dir="${common.dir}/contrib/icu">
          <include name="build.xml"/>
        </fileset>
      </subant>
    </target>
    <!-- Builds the analyzers/common contrib by running its build.xml default target; skipped when analyzers-common.uptodate is set. -->
    <target unless="analyzers-common.uptodate" name="compile-analyzers-common">
      <subant target="default">
        <fileset dir="${common.dir}/contrib/analyzers/common">
          <include name="build.xml"/>
        </fileset>
      </subant>
    </target>
    <!-- Builds the memory contrib by running its build.xml default target; skipped when memory.uptodate is set. -->
    <target unless="memory.uptodate" name="compile-memory">
      <subant target="default">
        <fileset dir="${common.dir}/contrib/memory">
          <include name="build.xml"/>
        </fileset>
      </subant>
    </target>

    <!-- Extends the imported init target so the memory, highlighter and analyzers-common contribs are built first. -->
    <target depends="contrib-build.init,compile-memory,compile-highlighter,compile-analyzers-common"
            name="init"/>
  
    <!--
      Deletes the JavaCC-generated Java sources of the demo HTML parser: every *.java
      file in the demohtml package directory whose content matches
      "Generated.*By.*JavaCC".
      Fix: the fileset was a direct child of the target, where it only declares a data
      type and removes nothing; it is now wrapped in <delete> so the target really cleans.
    -->
    <target name="clean-javacc">
      <delete>
        <fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">
          <containsregexp expression="Generated.*By.*JavaCC"/>
        </fileset>
      </delete>
    </target>
    
    <!--
      Invokes invoke-javacc (not defined in this file) on the demo HTML parser grammar,
      writing output next to the grammar. Runs only when javacc.present is set.
    -->
    <target if="javacc.present" depends="init,javacc-check" name="javacc">
      <invoke-javacc outputDir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml"
                     target="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj"/>
    </target>

  <!--
    Extends the imported dist-maven target: additionally passes the patched xercesImpl
    jar and its POM template to m2-deploy-with-pom-template (not defined in this file).
  -->
  <target depends="contrib-build.dist-maven" name="dist-maven">
    <m2-deploy-with-pom-template jar.file="lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar"
                                 pom.xml="lib/lucene-xercesImpl-pom.xml.template"/>
  </target>
</project>

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene build.xml source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2024 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.