alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Java example source code file (Trie.java)

This example Java source code file (Trie.java) is included in the alvinalexander.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Learn more about this Java project at its project page.

Java - Java tags/keywords

datainputstream, datamanipulate, defaultgetfoldingoffset, header_options_data_is_32_bit_, header_options_index_shift_, header_options_latin1_is_linear_mask_, header_options_shift_mask_, header_signature_, index_stage_1_shift_, index_stage_2_shift_, index_stage_3_mask_, ioexception, surrogate_mask_, trie

The Trie.java Java example source code

/*
 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
 *                                                                             *
 * The original version of this source code and documentation is copyrighted   *
 * and owned by IBM, These materials are provided under terms of a License     *
 * Agreement between IBM and Sun. This technology is protected by multiple     *
 * US and International patents. This notice and attribution to IBM may not    *
 * to removed.                                                                 *
 *******************************************************************************
 */

package sun.text.normalizer;

import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;

/**
 * <p>A trie is a kind of compressed, serializable table of values
 * associated with Unicode code points (0..0x10ffff).</p>
 * <p>This class defines the basic structure of a trie and provides methods
 * to <b>retrieve the offsets to the actual data.

* <p>Data will be the form of an array of basic types, char or int.

* <p>The actual data format will have to be specified by the user in the * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p> * <p>This trie implementation is optimized for getting offset while walking * forward through a UTF-16 string. * Therefore, the simplest and fastest access macros are the * fromLead() and fromOffsetTrail() methods. * The fromBMP() method are a little more complicated; they get offsets even * for lead surrogate codepoints, while the fromLead() method get special * "folded" offsets for lead surrogate code units if there is relevant data * associated with them. * From such a folded offsets, an offset needs to be extracted to supply * to the fromOffsetTrail() methods. * To handle such supplementary codepoints, some offset information are kept * in the data.</p> * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve * that offset from the folded value for the lead surrogate unit.</p> * <p>For examples of use, see com.ibm.icu.impl.CharTrie or * com.ibm.icu.impl.IntTrie.</p> * @author synwee * @see com.ibm.icu.impl.CharTrie * @see com.ibm.icu.impl.IntTrie * @since release 2.1, Jan 01 2002 */ public abstract class Trie { // public class declaration ---------------------------------------- /** * Character data in com.ibm.impl.Trie have different user-specified format * for different purposes. * This interface specifies methods to be implemented in order for * com.ibm.impl.Trie, to surrogate offset information encapsulated within * the data. */ public static interface DataManipulate { /** * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's * data * the index array offset of the indexes for that lead surrogate. * @param value data value for a surrogate from the trie, including the * folding offset * @return data offset or 0 if there is no data for the lead surrogate */ public int getFoldingOffset(int value); } // default implementation private static class DefaultGetFoldingOffset implements DataManipulate { public int getFoldingOffset(int value) { return value; } } // protected constructor ------------------------------------------- /** * Trie constructor for CharTrie use. * @param inputStream ICU data file input stream which contains the * trie * @param dataManipulate object containing the information to parse the * trie data * @throws IOException thrown when input stream does not have the * right header. */ protected Trie(InputStream inputStream, DataManipulate dataManipulate) throws IOException { DataInputStream input = new DataInputStream(inputStream); // Magic number to authenticate the data. int signature = input.readInt(); m_options_ = input.readInt(); if (!checkHeader(signature)) { throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file"); } if(dataManipulate != null) { m_dataManipulate_ = dataManipulate; } else { m_dataManipulate_ = new DefaultGetFoldingOffset(); } m_isLatin1Linear_ = (m_options_ & HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; m_dataOffset_ = input.readInt(); m_dataLength_ = input.readInt(); unserialize(inputStream); } /** * Trie constructor * @param index array to be used for index * @param options used by the trie * @param dataManipulate object containing the information to parse the * trie data */ protected Trie(char index[], int options, DataManipulate dataManipulate) { m_options_ = options; if(dataManipulate != null) { m_dataManipulate_ = dataManipulate; } else { m_dataManipulate_ = new DefaultGetFoldingOffset(); } m_isLatin1Linear_ = (m_options_ & HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; m_index_ = index; m_dataOffset_ = m_index_.length; } // protected data members ------------------------------------------ /** * Lead surrogate code points' index displacement in the index array. * 0x10000-0xd800=0x2800 * 0x2800 >> INDEX_STAGE_1_SHIFT_ */ protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5; /** * Shift size for shifting right the input index. 1..9 */ protected static final int INDEX_STAGE_1_SHIFT_ = 5; /** * Shift size for shifting left the index array values. * Increases possible data size with 16-bit index values at the cost * of compactability. * This requires blocks of stage 2 data to be aligned by * DATA_GRANULARITY. * 0..INDEX_STAGE_1_SHIFT */ protected static final int INDEX_STAGE_2_SHIFT_ = 2; /** * Number of data values in a stage 2 (data array) block. */ protected static final int DATA_BLOCK_LENGTH=1<INDEX_STAGE_1_SHIFT_ */ protected static final int SURROGATE_BLOCK_COUNT=(1<INDEX_STAGE_1_SHIFT_; /** * Surrogate mask to use when shifting offset to retrieve supplementary * values */ protected static final int SURROGATE_MASK_ = 0x3FF; /** * Index or UTF16 characters */ protected char m_index_[]; /** * Internal TrieValue which handles the parsing of the data value. * This class is to be implemented by the user */ protected DataManipulate m_dataManipulate_; /** * Start index of the data portion of the trie. CharTrie combines * index and data into a char array, so this is used to indicate the * initial offset to the data portion. * Note this index always points to the initial value. */ protected int m_dataOffset_; /** * Length of the data array */ protected int m_dataLength_; // protected methods ----------------------------------------------- /** * Gets the offset to the data which the surrogate pair points to. * @param lead lead surrogate * @param trail trailing surrogate * @return offset to data */ protected abstract int getSurrogateOffset(char lead, char trail); /** * Gets the value at the argument index * @param index value at index will be retrieved * @return 32 bit value */ protected abstract int getValue(int index); /** * Gets the default initial value * @return 32 bit value */ protected abstract int getInitialValue(); /** * Gets the offset to the data which the index ch after variable offset * points to. * Note for locating a non-supplementary character data offset, calling * <p> * getRawOffset(0, ch); * </p> * will do. Otherwise if it is a supplementary character formed by * surrogates lead and trail. Then we would have to call getRawOffset() * with getFoldingIndexOffset(). See getSurrogateOffset(). * @param offset index offset which ch is to start from * @param ch index to be used after offset * @return offset to the data */ protected final int getRawOffset(int offset, char ch) { return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] << INDEX_STAGE_2_SHIFT_) + (ch & INDEX_STAGE_3_MASK_); } /** * Gets the offset to data which the BMP character points to * Treats a lead surrogate as a normal code point. * @param ch BMP character * @return offset to data */ protected final int getBMPOffset(char ch) { return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) ? getRawOffset(LEAD_INDEX_OFFSET_, ch) : getRawOffset(0, ch); // using a getRawOffset(ch) makes no diff } /** * Gets the offset to the data which this lead surrogate character points * to. * Data at the returned offset may contain folding offset information for * the next trailing surrogate character. * @param ch lead surrogate character * @return offset to data */ protected final int getLeadOffset(char ch) { return getRawOffset(0, ch); } /** * Internal trie getter from a code point. * Could be faster(?) but longer with * if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); } * Gets the offset to data which the codepoint points to * @param ch codepoint * @return offset to data */ protected final int getCodePointOffset(int ch) { // if ((ch >> 16) == 0) slower if (ch < 0) { return -1; } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works return getRawOffset(0, (char)ch); } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) { // BMP codepoint return getBMPOffset((char)ch); } else if (ch <= UCharacter.MAX_VALUE) { // look at the construction of supplementary characters // trail forms the ends of it. return getSurrogateOffset(UTF16.getLeadSurrogate(ch), (char)(ch & SURROGATE_MASK_)); } else { // return -1 // if there is an error, in this case we return return -1; } } /** * <p>Parses the inputstream and creates the trie index with it.

* <p>This is overwritten by the child classes. * @param inputStream input stream containing the trie information * @exception IOException thrown when data reading fails. */ protected void unserialize(InputStream inputStream) throws IOException { //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_ m_index_ = new char[m_dataOffset_]; DataInputStream input = new DataInputStream(inputStream); for (int i = 0; i < m_dataOffset_; i ++) { m_index_[i] = input.readChar(); } } /** * Determines if this is a 32 bit trie * @return true if options specifies this is a 32 bit trie */ protected final boolean isIntTrie() { return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0; } /** * Determines if this is a 16 bit trie * @return true if this is a 16 bit trie */ protected final boolean isCharTrie() { return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0; } // private data members -------------------------------------------- /** * Latin 1 option mask */ protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200; /** * Constant number to authenticate the byte block */ protected static final int HEADER_SIGNATURE_ = 0x54726965; /** * Header option formatting */ private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF; protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4; protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100; /** * Flag indicator for Latin quick access data block */ private boolean m_isLatin1Linear_; /** * <p>Trie options field.

* <p>options bit field:
* 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br> * 8 0 = 16-bit data, 1=32-bit data<br> * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br> * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br> */ private int m_options_; // private methods --------------------------------------------------- /** * Authenticates raw data header. * Checking the header information, signature and options. * @param signature This contains the options and type of a Trie * @return true if the header is authenticated valid */ private final boolean checkHeader(int signature) { // check the signature // Trie in big-endian US-ASCII (0x54726965). // Magic number to authenticate the data. if (signature != HEADER_SIGNATURE_) { return false; } if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != INDEX_STAGE_1_SHIFT_ || ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & HEADER_OPTIONS_SHIFT_MASK_) != INDEX_STAGE_2_SHIFT_) { return false; } return true; } }

Other Java examples (source code examples)

Here is a short list of links related to this Java Trie.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.