Java example source code file (CMap.java)

This example Java source code file (CMap.java) is included in the alvinalexander.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" ^TM.

Learn more about this Java project at its project page.
Java - Java tags/keywords

charbuffer, cmap, cmapformat0, cmapformat10, cmapformat12, cmapformat2, cmapformat4, cmapformat6, cmapformat8, intmask, nio, nullcmapclass, runtimeexception, shiftjisencoding, util
The CMap.java Java example source code

/*
 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package sun.font;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.util.Locale;
import java.nio.charset.*;

/*
 * A tt font has a CMAP table which is in turn made up of sub-tables which
 * describe the char to glyph mapping in (possibly) multiple ways.
 * CMAP subtables are described by 3 values.
 * 1. Platform ID (eg 3=Microsoft, which is the id we look for in JDK)
 * 2. Encoding (eg 0=symbol, 1=unicode)
 * 3. TrueType subtable format (how the char->glyph mapping for the encoding
 * is stored in the subtable). See the TrueType spec. Format 4 is required
 * by MS in fonts for windows. It uses segmented mapping to delta values.
 * The combination we most typically see is (3,1,4) :
 * CMAP Platform ID=3 is what we use.
 * Encodings that are used in practice by JDK on Solaris are
 *  symbol (3,0)
 *  unicode (3,1)
 *  GBK (3,5) (note that solaris zh fonts report 3,4 but are really 3,5)
 * The format for almost all subtables is 4. However the solaris (3,5)
 * encodings are typically in format 2.
 */
abstract class CMap {

//     static char WingDings_b2c[] = {
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0x2702, 0x2701, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0x2706, 0x2709, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x2707, 0x270d,
//         0xfffd, 0x270c, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0x2708, 0xfffd, 0xfffd, 0x2744, 0xfffd, 0x271e, 0xfffd,
//         0x2720, 0x2721, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0x2751, 0x2752, 0xfffd, 0xfffd, 0x2756, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0x2740, 0x273f, 0x275d, 0x275e, 0xfffd,
//         0xfffd, 0x2780, 0x2781, 0x2782, 0x2783, 0x2784, 0x2785, 0x2786,
//         0x2787, 0x2788, 0x2789, 0xfffd, 0x278a, 0x278b, 0x278c, 0x278d,
//         0x278e, 0x278f, 0x2790, 0x2791, 0x2792, 0x2793, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x274d, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x2736, 0x2734, 0xfffd, 0x2735,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x272a, 0x2730, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x27a5, 0xfffd, 0x27a6, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0x27a2, 0xfffd, 0xfffd, 0xfffd, 0x27b3, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0x27a1, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0x27a9, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0x2717, 0x2713, 0xfffd, 0xfffd, 0xfffd,
//    };

//     static char Symbols_b2c[] = {
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0x2200, 0xfffd, 0x2203, 0xfffd, 0xfffd, 0x220d,
//         0xfffd, 0xfffd, 0x2217, 0xfffd, 0xfffd, 0x2212, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0x2245, 0x0391, 0x0392, 0x03a7, 0x0394, 0x0395, 0x03a6, 0x0393,
//         0x0397, 0x0399, 0x03d1, 0x039a, 0x039b, 0x039c, 0x039d, 0x039f,
//         0x03a0, 0x0398, 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03c2, 0x03a9,
//         0x039e, 0x03a8, 0x0396, 0xfffd, 0x2234, 0xfffd, 0x22a5, 0xfffd,
//         0xfffd, 0x03b1, 0x03b2, 0x03c7, 0x03b4, 0x03b5, 0x03c6, 0x03b3,
//         0x03b7, 0x03b9, 0x03d5, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03bf,
//         0x03c0, 0x03b8, 0x03c1, 0x03c3, 0x03c4, 0x03c5, 0x03d6, 0x03c9,
//         0x03be, 0x03c8, 0x03b6, 0xfffd, 0xfffd, 0xfffd, 0x223c, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0x03d2, 0xfffd, 0x2264, 0x2215, 0x221e, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0x2218, 0xfffd, 0xfffd, 0x2265, 0xfffd, 0x221d, 0xfffd, 0x2219,
//         0xfffd, 0x2260, 0x2261, 0x2248, 0x22ef, 0x2223, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x2297, 0x2295, 0x2205, 0x2229,
//         0x222a, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
//         0xfffd, 0x2207, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x221a, 0x22c5,
//         0xfffd, 0x2227, 0x2228, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0x22c4, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x2211, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0x222b, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//         0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd,
//     };

    /* Encoding IDs of the Microsoft platform (platform ID 3) cmap subtables
     * that need a Unicode->legacy encoding translation table. The values
     * match the encoding IDs switched on in initialize() and are also used
     * as indices into converterMaps.
     */
    static final short ShiftJISEncoding = 2;
    static final short GBKEncoding      = 3;
    static final short Big5Encoding     = 4;
    static final short WansungEncoding  = 5;
    static final short JohabEncoding    = 6;
    /* Encoding ID 10: MS Unicode with surrogates; no translation needed. */
    static final short MSUnicodeSurrogateEncoding = 10;

    /* Marker stored in conversion tables for "no mapping" entries. */
    static final char noSuchChar = (char)0xfffd;
    /* Mask that keeps the low 16 bits of an int. */
    static final int SHORTMASK = 0x0000ffff;
    /* NOTE(review): as an int this constant is -1, so "x & INTMASK" on an
     * int leaves the value (including its sign) unchanged. It does not
     * perform an unsigned 32 bit widening; use a long mask for that.
     */
    static final int INTMASK   = 0xffffffff;

    /* Cache of translation tables indexed by encoding ID (0..6); entries
     * are filled in on first use by getConverterMap().
     */
    static final char[][] converterMaps = new char[7][];

    /*
     * Unicode->other encoding translation array. A pre-computed look up
     * which can be shared across all fonts using that encoding.
     * Using this saves running character converters repeatedly.
     */
    char[] xlat;

    /*
     * Selects one cmap subtable of the given font and wraps it in the
     * matching CMap implementation.
     *
     * Every Microsoft platform (platform ID 3) encoding record is noted,
     * then the first available encoding in the preference order
     * 10, 0, 1, 2, 3, 4, 5, 6 is used. If the font has no (3,*) record at
     * all, the first subtable listed is used. Returns null when (3,*)
     * records exist but none of them has a usable encoding ID / offset.
     */
    static CMap initialize(TrueTypeFont font) {

        ByteBuffer table = font.getTableBuffer(TrueTypeFont.cmapTag);
        /* NOTE(review): this is the table size and is not used below; the
         * call is retained so that behaviour is exactly as before.
         */
        int cmapTableSize = font.getTableSize(TrueTypeFont.cmapTag);
        short subtableCount = table.getShort(2);

        /* Subtable offsets of the (3,*) records, indexed by encoding ID.
         * A zero entry means "not present".
         */
        int[] msOffsets = new int[MSUnicodeSurrogateEncoding + 1];
        boolean sawMicrosoftPlatform = false;

        /* Encoding records are 8 bytes each and start after the 4 byte
         * table header: platformID, encodingID, offset.
         */
        for (int record = 0; record < subtableCount; record++) {
            table.position(record * 8 + 4);
            if (table.getShort() != 3) {
                continue;
            }
            sawMicrosoftPlatform = true;
            int encoding  = table.getShort();
            int subOffset = table.getInt();
            boolean recognised =
                (encoding >= 0 && encoding <= JohabEncoding) ||
                encoding == MSUnicodeSurrogateEncoding;
            if (recognised) {
                msOffsets[encoding] = subOffset;
            }
        }

        if (!sawMicrosoftPlatform) {
            /* No 3,* subtable was found. Fall back to whatever subtable is
             * listed first; probably not very useful but perhaps better
             * than rejecting the font outright.
             */
            return createCMap(table, table.getInt(8), null);
        }

        /* Preference order for the (3,*) subtables:
         * Unicode surrogates, Symbol, Unicode, then the legacy encodings.
         *
         * Symbol (3,0) fonts are deliberately given no special treatment.
         * Earlier code remapped a scattering of "Symbol" and "Wingdings"
         * glyphs to their real Unicode code points to help composite
         * fonts, which made e.g. "wingdings" and "wingdings2" behave
         * differently (one showing a scattered subset, the other its
         * private use area 0xF000->0xF0FF). Those code points are covered
         * by Lucida Sans Regular, so the remapping was dropped; clients
         * that explicitly used these physical fonts and expected the
         * remapped characters will now see them as missing.
         */
        final short[] preference = {
            MSUnicodeSurrogateEncoding, 0, 1,
            ShiftJISEncoding, GBKEncoding, Big5Encoding,
            WansungEncoding, JohabEncoding
        };

        for (int p = 0; p < preference.length; p++) {
            short encoding = preference[p];
            int subOffset = msOffsets[encoding];
            if (subOffset == 0) {
                continue;
            }

            char[] translation = null;
            if (encoding == Big5Encoding) {
                /* GB2312 TrueType fonts shipped with Solaris carry encoding
                 * ID 4 (Big5 per the TrueType spec) although they are
                 * really gb2312 encoded. Treat fonts from the known zh_CN
                 * font directories as GBK so that locale works.
                 */
                String path = font.platName;
                boolean solarisGb2312 =
                    FontUtilities.isSolaris && path != null &&
                    (path.startsWith(
                     "/usr/openwin/lib/locale/zh_CN.EUC/X11/fonts/TrueType") ||
                     path.startsWith(
                     "/usr/openwin/lib/locale/zh_CN/X11/fonts/TrueType") ||
                     path.startsWith(
                     "/usr/openwin/lib/locale/zh/X11/fonts/TrueType"));
                translation =
                    getConverterMap(solarisGb2312 ? GBKEncoding : Big5Encoding);
            } else if (encoding >= ShiftJISEncoding &&
                       encoding <= JohabEncoding) {
                translation = getConverterMap(encoding);
            }
            return createCMap(table, subOffset, translation);
        }

        /* (3,*) records exist but none is one we know how to use. */
        return null;
    }

    /* speed up the converting by setting the range for double
     * byte characters;
     */
    static char[] getConverter(short encodingID) {
        /* Pick the charset and the span of two byte codes worth decoding
         * for this encoding. Encodings with no converter yield null.
         */
        final int firstCode;
        final int lastCode;
        final String charsetName;

        if (encodingID == ShiftJISEncoding) {
            firstCode   = 0x8140;
            lastCode    = 0xfcfc;
            charsetName = "SJIS";
        } else if (encodingID == GBKEncoding) {
            firstCode   = 0x8140;
            lastCode    = 0xfea0;
            charsetName = "GBK";
        } else if (encodingID == Big5Encoding) {
            firstCode   = 0xa140;
            lastCode    = 0xfefe;
            charsetName = "Big5";
        } else if (encodingID == WansungEncoding) {
            firstCode   = 0xa1a1;
            lastCode    = 0xfede;
            charsetName = "EUC_KR";
        } else if (encodingID == JohabEncoding) {
            firstCode   = 0x8141;
            lastCode    = 0xfdfe;
            charsetName = "Johab";
        } else {
            return null;
        }

        try {
            final int codeCount = lastCode - firstCode + 1;
            final boolean isShiftJIS = (encodingID == ShiftJISEncoding);

            /* Table indexed by legacy code giving the unicode char;
             * everything starts out unmapped.
             */
            char[] toUnicode = new char[65536];
            for (int code = 0; code < 65536; code++) {
                toUnicode[code] = noSuchChar;
            }

            /* Lay out every code in the range as a big-endian byte pair. */
            byte[] encoded = new byte[codeCount * 2];
            char[] decoded = new char[codeCount];

            int next = 0;
            for (int code = firstCode; code <= lastCode; code++) {
                int lead  = (code >> 8) & 0xff;
                int trail = code & 0xff;
                if (isShiftJIS && lead >= 0xa1 && lead <= 0xdf) {
                    /* sjis halfwidth katakana lead byte: feed the decoder
                     * 0xffff instead; these are filled in directly below.
                     */
                    lead  = 0xff;
                    trail = 0xff;
                }
                encoded[next++] = (byte)lead;
                encoded[next++] = (byte)trail;
            }

            /* Decode the whole range in one go; anything malformed or
             * unmappable is replaced by \u0000.
             */
            CharsetDecoder decoder = Charset.forName(charsetName).newDecoder();
            decoder.onMalformedInput(CodingErrorAction.REPLACE);
            decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
            decoder.replaceWith("\u0000");
            decoder.decode(ByteBuffer.wrap(encoded),
                           CharBuffer.wrap(decoded),
                           true);

            /* Single byte ASCII maps to itself. */
            for (int ascii = 0x20; ascii <= 0x7e; ascii++) {
                toUnicode[ascii] = (char)ascii;
            }

            /* sjis single byte halfwidth katakana -> U+FF61 onwards. */
            if (isShiftJIS) {
                for (int kana = 0xa1; kana <= 0xdf; kana++) {
                    toUnicode[kana] = (char)(kana - 0xa1 + 0xff61);
                }
            }

            /* Storing only the valid range would save roughly 60Kbytes of
             * heap per converter, but the ASCII range has to be included
             * too, so the full 64K table is used.
             */
            System.arraycopy(decoded, 0, toUnicode, firstCode, decoded.length);

            /* Callers want unicode -> legacy code, so invert the table.
             * Codes are visited in ascending order, so where several codes
             * share a unicode value the highest code is the one kept.
             */
            char[] fromUnicode = new char[65536];
            for (int code = 0; code < 65536; code++) {
                char unicode = toUnicode[code];
                if (unicode != noSuchChar) {
                    fromUnicode[unicode] = (char)code;
                }
            }
            return fromUnicode;

        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /*
     * The returned array maps from unicode to some other 2 byte encoding
     * eg for a unicode char used as the index, the indexed value is the
     * corresponding 2 byte SJIS code. The array for each encoding is
     * created on first request and cached for later calls.
     */
    static char[] getConverterMap(short encodingID) {
        char[] cached = converterMaps[encodingID];
        if (cached == null) {
            /* First request for this encoding: build the table and
             * remember it. A null result (no converter) is not cached,
             * so it is retried on the next call.
             */
            cached = getConverter(encodingID);
            converterMaps[encodingID] = cached;
        }
        return cached;
    }


    /*
     * Creates the CMap implementation matching the format of the subtable
     * found at "offset" within the cmap table held in "buffer".
     *
     * buffer - the whole cmap table.
     * offset - byte offset of the subtable within the buffer.
     * xlat   - unicode->other encoding translation array, or null if the
     *          subtable is indexed directly by unicode. Not used for
     *          format 0.
     *
     * Throws RuntimeException if the subtable format is not one of
     * 0, 2, 4, 6, 8, 10 or 12.
     */
    static CMap createCMap(ByteBuffer buffer, int offset, char[] xlat) {
        /* First do a sanity check that this cmap subtable is contained
         * within the cmap table. Formats below 8 store a 16 bit length at
         * offset+2; formats 8 and up store a 32 bit length at offset+4.
         */
        int subtableFormat = buffer.getChar(offset);
        long subtableLength;
        if (subtableFormat < 8) {
            subtableLength = buffer.getChar(offset+2);
        } else {
            /* The length is unsigned. The mask must be a long: masking
             * with an int 0xffffffff is a no-op and a length with the top
             * bit set would turn negative and slip past the check below.
             */
            subtableLength = buffer.getInt(offset+4) & 0xffffffffL;
        }
        if (offset+subtableLength > buffer.capacity()) {
            /* Only warn: the format specific constructors either cope
             * with this or throw when they run off the end of the buffer.
             */
            if (FontUtilities.isLogging()) {
                FontUtilities.getLogger().warning("Cmap subtable overflows buffer.");
            }
        }
        switch (subtableFormat) {
        case 0:  return new CMapFormat0(buffer, offset);
        case 2:  return new CMapFormat2(buffer, offset, xlat);
        case 4:  return new CMapFormat4(buffer, offset, xlat);
        case 6:  return new CMapFormat6(buffer, offset, xlat);
        case 8:  return new CMapFormat8(buffer, offset, xlat);
        case 10: return new CMapFormat10(buffer, offset, xlat);
        case 12: return new CMapFormat12(buffer, offset, xlat);
        default: throw new RuntimeException("Cmap format unimplemented: " +
                                            subtableFormat);
        }
    }

/*
    final char charVal(byte[] cmap, int index) {
        return (char)(((0xff & cmap[index]) << 8)+(0xff & cmap[index+1]));
    }

    final short shortVal(byte[] cmap, int index) {
        return (short)(((0xff & cmap[index]) << 8)+(0xff & cmap[index+1]));
    }
*/
    /*
     * Maps a character code to a glyph code. Implemented by each of the
     * cmap subtable format classes; the format 0 and format 4
     * implementations in this file return 0 for a character that has no
     * mapping in the subtable.
     */
    abstract char getGlyph(int charCode);

    /* Format 4 Header is
     * ushort format (off=0)
     * ushort length (off=2)
     * ushort language (off=4)
     * ushort segCountX2 (off=6)
     * ushort searchRange (off=8)
     * ushort entrySelector (off=10)
     * ushort rangeShift (off=12)
     * ushort endCount[segCount] (off=14)
     * ushort reservedPad
     * ushort startCount[segCount]
     * short idDelta[segCount]
     * idRangeOFfset[segCount]
     * ushort glyphIdArray[]
     */
    static class CMapFormat4 extends CMap {
        /* Number of segments (segCountX2 / 2). */
        int segCount;
        /* Search hints read from the header; not used for lookups. */
        int entrySelector;
        int rangeShift;
        /* Parallel per-segment arrays, in the order stored in the font. */
        char[] endCount;
        char[] startCount;
        short[] idDelta;
        /* idRangeOffset values converted from bytes to 16 bit units. */
        char[] idRangeOffset;
        /* Whatever follows the segment arrays up to the subtable length. */
        char[] glyphIds;

        /*
         * Reads the format 4 subtable starting at "offset" in "bbuffer".
         * "xlat" is the optional unicode->other encoding translation
         * array (null for a unicode cmap). The position of bbuffer is
         * changed. Malformed tables may cause a runtime exception
         * (eg buffer underflow or negative array size) to be thrown.
         */
        CMapFormat4(ByteBuffer bbuffer, int offset, char[] xlat) {

            this.xlat = xlat;

            bbuffer.position(offset);
            CharBuffer buffer = bbuffer.asCharBuffer();
            buffer.get(); // skip, we already know format=4
            int subtableLength = buffer.get();
            /* Try to recover from some bad fonts which specify a subtable
             * length that would overflow the byte buffer holding the whole
             * cmap table. If this isn't a recoverable situation an exception
             * may be thrown which is caught higher up the call stack.
             * Whilst this may seem lenient, in practice, unless the "bad"
             * subtable we are using is the last one in the cmap table we
             * would have no way of knowing about this problem anyway.
             */
            if (offset+subtableLength > bbuffer.capacity()) {
                subtableLength = bbuffer.capacity() - offset;
            }
            buffer.get(); // skip language
            segCount = buffer.get()/2;
            buffer.get(); // skip searchRange, it is not used
            entrySelector = buffer.get();
            rangeShift    = buffer.get()/2;
            startCount = new char[segCount];
            endCount = new char[segCount];
            idDelta = new short[segCount];
            idRangeOffset = new char[segCount];

            buffer.get(endCount);
            buffer.get(); // 2 bytes for reserved pad
            buffer.get(startCount);

            for (int i=0; i<segCount; i++) {
                idDelta[i] = (short)buffer.get();
            }

            /* Offsets are stored in bytes; halve them so they count
             * 16 bit entries like the arrays used here.
             */
            for (int i=0; i<segCount; i++) {
                idRangeOffset[i] = (char)(buffer.get() >> 1);
            }

            /* Can calculate the number of glyph IDs by subtracting
             * "pos" (the size in chars of the header plus the four
             * segment arrays) from the length of the cmap
             */
            int pos = (segCount*8+16)/2;
            buffer.position(pos);
            int numGlyphIds = (subtableLength/2 - pos);
            glyphIds = new char[numGlyphIds];
            buffer.get(glyphIds);
        }

        /*
         * Returns the glyph code for "charCode", or 0 (the missing glyph)
         * if no segment of this subtable covers it.
         */
        char getGlyph(int charCode) {

            /* getControlCodeGlyph is defined elsewhere in CMap; a
             * non-negative result is used directly as the glyph.
             */
            int controlGlyph = getControlCodeGlyph(charCode, true);
            if (controlGlyph >= 0) {
                return (char)controlGlyph;
            }

            /* presence of translation array indicates that this
             * cmap is in some other (non-unicode encoding).
             * In order to look-up a char->glyph mapping we need to
             * translate the unicode code point to the encoding of
             * the cmap. A code outside the translation array cannot be
             * translated, so it has no glyph.
             */
            if (xlat != null) {
                if (charCode < 0 || charCode >= xlat.length) {
                    return 0;
                }
                charCode = xlat[charCode];
            }

            /*
             * Citation from the TrueType (and OpenType) spec:
             *   The segments are sorted in order of increasing endCode
             *   values, and the segment values are specified in four parallel
             *   arrays. You search for the first endCode that is greater than
             *   or equal to the character code you want to map. If the
             *   corresponding startCode is less than or equal to the
             *   character code, then you use the corresponding idDelta and
             *   idRangeOffset to map the character code to a glyph index
             *   (otherwise, the missingGlyph is returned).
             */

            /*
             * CMAP format4 defines several fields for optimized search of
             * the segment list (entrySelector, searchRange, rangeShift).
             * However, benefits are negligible and some fonts have incorrect
             * data - so we use straightforward binary search (see bug 6247425)
             */
            int left = 0, right = startCount.length;
            while (left < right) {
                int mid = (left + right) >>> 1;
                if (endCount[mid] < charCode) {
                    left = mid + 1;
                } else {
                    right = mid;
                }
            }
            int index = left;

            /* The code is beyond the last segment (or there are no
             * segments at all): there is no mapping.
             */
            if (index >= startCount.length) {
                return 0;
            }
            if (charCode < startCount[index] || charCode > endCount[index]) {
                return 0;
            }

            int rangeOffset = idRangeOffset[index];
            if (rangeOffset == 0) {
                return (char)(charCode + idDelta[index]);
            }

            /* Calculate an index into the glyphIds array. rangeOffset is
             * relative to this segment's own idRangeOffset entry, which
             * lies (segCount - index) entries before the glyph id array.
             */
            int glyphIDIndex = rangeOffset - segCount + index
                                 + (charCode - startCount[index]);
            char glyphCode = glyphIds[glyphIDIndex];
            if (glyphCode != 0) {
                glyphCode = (char)(glyphCode + idDelta[index]);
            }
            return glyphCode;
        }
    }

    // Format 0: Byte Encoding table
    static class CMapFormat0 extends CMap {
        /* Glyph id array, one byte per character code. */
        byte [] cmap;

        /*
         * Reads the format 0 subtable starting at "offset" in "buffer".
         * The position of the buffer is changed.
         */
        CMapFormat0(ByteBuffer buffer, int offset) {

            /* skip 6 bytes of format, length, and version */
            int len = buffer.getChar(offset+2);
            cmap = new byte[len-6];
            buffer.position(offset+6);
            buffer.get(cmap);
        }

        /*
         * Returns the glyph code for "charCode". Tab, line feed and
         * carriage return map to the invisible glyph; any code that is
         * negative, not a single byte value, or beyond the end of the
         * (possibly truncated) glyph id array maps to 0.
         */
        char getGlyph(int charCode) {
            switch (charCode) {
            case 0x0009:
            case 0x000a:
            case 0x000d: return CharToGlyphMapper.INVISIBLE_GLYPH_ID;
            }
            if (charCode < 0 || charCode >= 256 || charCode >= cmap.length) {
                return 0;
            }
            /* glyph ids are unsigned bytes */
            return (char)(0xff & cmap[charCode]);
        }
    }

//     static CMap createSymbolCMap(ByteBuffer buffer, int offset, char[] syms) {

//      CMap cmap = createCMap(buffer, offset, null);
//      if (cmap == null) {
//          return null;
//      } else {
//          return new CMapFormatSymbol(cmap, syms);
//      }
//     }

//     static class CMapFormatSymbol extends CMap {

//      CMap cmap;
//      static final int NUM_BUCKETS = 128;
//      Bucket[] buckets = new Bucket[NUM_BUCKETS];

//      class Bucket {
//          char unicode;
//          char glyph;
//          Bucket next;

//          Bucket(char u, char g) {
//              unicode = u;
//              glyph = g;
//          }
//      }

//      CMapFormatSymbol(CMap cmap, char[] syms) {

//          this.cmap = cmap;

//          for (int i=0;i<syms.length;i++) {
//              char unicode = syms[i];
//              if (unicode != noSuchChar) {
//                  char glyph = cmap.getGlyph(i + 0xf000);
//                  int hash = unicode % NUM_BUCKETS;
//                  Bucket bucket = new Bucket(unicode, glyph);
//                  if (buckets[hash] == null) {
//                      buckets[hash] = bucket;
//                  } else {
//                      Bucket b = buckets[hash];
//                      while (b.next != null) {
//                          b = b.next;
//                      }
//                      b.next = bucket;
//                  }
//              }
//          }
//      }

//      char getGlyph(int unicode) {
//          if (unicode >= 0x1000) {
//              return 0;
//          }
//          else if (unicode >=0xf000 && unicode < 0xf100) {
//              return cmap.getGlyph(unicode);
//          } else {
//              Bucket b = buckets[unicode % NUM_BUCKETS];
//              while (b != null) {
//                  if (b.unicode == unicode) {
//                      return b.glyph;
//                  } else {
//                      b = b.next;
//                  }
//              }
//              return 0;
//          }
//      }
//     }

    /*
     * Format 2: High-byte mapping through table.
     *
     * The subtable holds 256 subHeaderKeys (indexed by the high byte of the
     * character code), followed by the subHeaders and then the glyph index
     * array. A SubHeader entry theoretically looks like {
     *   char firstCode;
     *   char entryCount;
     *   short idDelta;
     *   char idRangeOffset;
     * }
     * but here the four members are kept in four parallel arrays, each
     * indexed by subHeader number.
     */
    static class CMapFormat2 extends CMap {

        char[] subHeaderKey = new char[256];

        char[] firstCodeArray;
        char[] entryCountArray;
        short[] idDeltaArray;
        char[] idRangeOffSetArray;

        char[] glyphIndexArray;

        /*
         * Reads the format 2 subtable that begins at "offset" in "buffer".
         * The table length is taken from offset+2 and the subHeaderKeys
         * start at offset+6. As a side effect the position of "buffer" is
         * set to offset+6. "xlat" is stored for use by getGlyph and may be
         * null.
         */
        CMapFormat2(ByteBuffer buffer, int offset, char[] xlat) {

            this.xlat = xlat;

            int tableLen = buffer.getChar(offset + 2);
            buffer.position(offset + 6);
            CharBuffer chars = buffer.asCharBuffer();

            int largestKey = 0;
            for (int hi = 0; hi < subHeaderKey.length; hi++) {
                char k = chars.get();
                subHeaderKey[hi] = k;
                if (k > largestKey) {
                    largestKey = k;
                }
            }

            /* A subHeaderKey is 8 * the subHeader index, so dividing the
             * largest key by 8 and adding 1 gives the number of subHeaders.
             */
            int numSubHeaders = (largestKey >> 3) + 1;
            firstCodeArray = new char[numSubHeaders];
            entryCountArray = new char[numSubHeaders];
            idDeltaArray = new short[numSubHeaders];
            idRangeOffSetArray = new char[numSubHeaders];
            for (int sh = 0; sh < numSubHeaders; sh++) {
                // The four members are stored consecutively in the table.
                firstCodeArray[sh] = chars.get();
                entryCountArray[sh] = chars.get();
                idDeltaArray[sh] = (short) chars.get();
                idRangeOffSetArray[sh] = chars.get();
            }

            /* Whatever is left of the table is the glyph index array:
             * 518 = 6 bytes preceding the keys + 256 two-byte keys, and each
             * subHeader is 8 bytes. Divide by 2 to count chars, not bytes.
             */
            int glyphCount = (tableLen - 518 - numSubHeaders * 8) / 2;
            glyphIndexArray = new char[glyphCount];
            for (int g = 0; g < glyphCount; g++) {
                glyphIndexArray[g] = chars.get();
            }
        }

        /*
         * Maps "charCode" to a glyph code, or 0 when this table has no
         * entry for it. Control codes are answered by getControlCodeGlyph
         * first; if an xlat table is present the code is translated through
         * it before the lookup.
         */
        char getGlyph(int charCode) {
            final int control = getControlCodeGlyph(charCode, true);
            if (control >= 0) {
                return (char) control;
            }

            final int code = (xlat != null) ? xlat[charCode] : charCode;

            final char hi = (char) (code >> 8);
            final char lo = (char) (code & 0xff);
            final int key = subHeaderKey[hi] >> 3; // index into subHeaders

            /* The low byte is looked up unless subHeader 0 was selected by
             * a non-zero high byte, in which case the high byte is used.
             */
            char mapMe = (key != 0 || hi == 0) ? lo : hi;

            final char first = firstCodeArray[key];
            if (mapMe < first) {
                return 0;
            }
            mapMe -= first;
            if (mapMe >= entryCountArray[key]) {
                return 0;
            }

            /* idRangeOffSetArray[key] is a byte count measured from the
             * location of that field in the table to the start of the glyph
             * sub-array for "firstCode". SubHeaders are 8 bytes each with
             * the idRangeOffset field at byte 6, and the glyphIndexArray
             * follows the last subHeader, so with N subHeaders the distance
             * from the field to glyphIndexArray is (N-key)*8-6 bytes.
             * Removing that distance leaves a byte offset into
             * glyphIndexArray; halve it to get a char index.
             */
            final int bytesToGlyphArray =
                    ((idRangeOffSetArray.length - key) * 8) - 6;
            final int subArrayStart =
                    (idRangeOffSetArray[key] - bytesToGlyphArray) / 2;
            char glyphCode = glyphIndexArray[subArrayStart + mapMe];
            if (glyphCode == 0) {
                return 0;
            }
            glyphCode += idDeltaArray[key]; // idDelta, wraps within 16 bits
            return glyphCode;
        }
    }

    /*
     * Format 6: Trimmed table mapping.
     *
     * Covers one contiguous run of character codes: "entryCount" glyph ids
     * for the codes firstCode .. firstCode+entryCount-1.
     */
    static class CMapFormat6 extends CMap {

        char firstCode;
        char entryCount;
        char[] glyphIdArray;

        /*
         * Reads the format 6 subtable that begins at "offset" in "bbuffer";
         * firstCode, entryCount and the glyph ids are read consecutively
         * starting at offset+6. As a side effect the position of "bbuffer"
         * is set to offset+6. "xlat" may be null.
         */
        CMapFormat6(ByteBuffer bbuffer, int offset, char[] xlat) {

            /* getGlyph() translates through this.xlat when it is non-null,
             * so the argument has to be stored; without this assignment the
             * translation branch could never be taken.
             */
            this.xlat = xlat;

            bbuffer.position(offset+6);
            CharBuffer buffer = bbuffer.asCharBuffer();
            firstCode = buffer.get();
            entryCount = buffer.get();
            glyphIdArray = new char[entryCount];
            for (int i=0; i< entryCount; i++) {
                glyphIdArray[i] = buffer.get();
            }
        }

        /*
         * Maps "charCode" to a glyph code, or 0 when the (possibly
         * translated) code lies outside the range covered by this table.
         * Control codes are answered by getControlCodeGlyph first.
         */
        char getGlyph(int charCode) {
            int controlGlyph = getControlCodeGlyph(charCode, true);
            if (controlGlyph >= 0) {
                return (char)controlGlyph;
            }

            if (xlat != null) {
                charCode = xlat[charCode];
            }

            // Rebase onto the first covered code to get the array index.
            charCode -= firstCode;
            if (charCode < 0 || charCode >= entryCount) {
                return 0;
            } else {
                return glyphIdArray[charCode];
            }
        }
    }

    /*
     * Format 8: mixed 16-bit and 32-bit coverage.
     *
     * Seems unlikely this code will ever get tested as we look for
     * MS platform Cmaps and MS states (in the Opentype spec on their website)
     * that MS doesn't support this format.
     *
     * Only the is32 bitmap and the group count are read; the group arrays
     * are allocated but never filled, and getGlyph always answers 0.
     */
    static class CMapFormat8 extends CMap {
        byte[] is32 = new byte[8192];
        int nGroups;
        int[] startCharCode;
        int[] endCharCode;
        int[] startGlyphID;

        /*
         * Reads the format 8 subtable that begins at "offset" in "bbuffer".
         * As a side effect the position of "bbuffer" is left just after the
         * group count. Throws RuntimeException if the group count is
         * negative or describes more group records than the buffer holds.
         */
        CMapFormat8(ByteBuffer bbuffer, int offset, char[] xlat) {

            /* The is32 bitmap sits 12 bytes into this subtable, so the
             * position must be relative to "offset"; an absolute 12 is only
             * right for a subtable that starts at position 0.
             */
            bbuffer.position(offset + 12);
            bbuffer.get(is32);
            nGroups = bbuffer.getInt();
            /* A group record is three 32-bit values, 12 bytes in all. Check
             * the count before it is used to size three arrays.
             */
            if (nGroups < 0 || bbuffer.remaining() < 12L * nGroups) {
                throw new RuntimeException("Format 8 table exceeded");
            }
            startCharCode = new int[nGroups];
            endCharCode   = new int[nGroups];
            startGlyphID  = new int[nGroups];
        }

        /*
         * Always returns 0 (no mapping is implemented for this format).
         * NOTE(review): the constructor above never stores its xlat
         * argument, so the check below only fires if xlat is set elsewhere.
         */
        char getGlyph(int charCode) {
            if (xlat != null) {
                throw new RuntimeException("xlat array for cmap fmt=8");
            }
            return 0;
        }

    }


    /*
     * Format 10: Trimmed table mapping with 32-bit character codes.
     *
     * Seems unlikely this code will ever get tested as we look for
     * MS platform Cmaps and MS states (in the Opentype spec on their website)
     * that MS doesn't support this format.
     */
    static class CMapFormat10 extends CMap {

        long firstCode;
        int entryCount;
        char[] glyphIdArray;

        /*
         * Reads the format 10 subtable that begins at "offset" in "bbuffer".
         * As a side effect the position of "bbuffer" is set to offset+20.
         * Throws RuntimeException if the entry count is negative or larger
         * than the number of glyph ids remaining in the buffer.
         */
        CMapFormat10(ByteBuffer bbuffer, int offset, char[] xlat) {

            /* The glyph ids are read from offset+20, directly after two
             * 32-bit fields, so those fields are at offset+12 and offset+16.
             * Read them with absolute gets: a relative getInt() depends on
             * wherever the caller happened to leave the buffer position.
             */
            firstCode = bbuffer.getInt(offset + 12) & 0xffffffffL;
            entryCount = bbuffer.getInt(offset + 16);

            bbuffer.position(offset + 20);
            // Two bytes per glyph id; validate before sizing the array.
            if (entryCount < 0 || entryCount > bbuffer.remaining() / 2) {
                throw new RuntimeException("Format 10 table exceeded");
            }
            CharBuffer buffer = bbuffer.asCharBuffer();
            glyphIdArray = new char[entryCount];
            buffer.get(glyphIdArray);
        }

        /*
         * Maps "charCode" to a glyph code, or 0 when the code lies outside
         * firstCode .. firstCode+entryCount-1.
         * NOTE(review): the constructor above never stores its xlat
         * argument, so the check below only fires if xlat is set elsewhere.
         */
        char getGlyph(int charCode) {

            if (xlat != null) {
                throw new RuntimeException("xlat array for cmap fmt=10");
            }

            /* Compare as long: narrowing the difference to int first could
             * wrap a large distance back into the valid index range.
             */
            long code = charCode - firstCode;
            if (code < 0 || code >= entryCount) {
                return 0;
            } else {
                return glyphIdArray[(int)code];
            }
        }
    }

    /*
     * Format 12: Segmented coverage for UCS-4 (fonts supporting
     * surrogate pairs).
     *
     * The table is a list of groups, each mapping the codes
     * startCharCode .. endCharCode to consecutive glyph ids beginning at
     * startGlyphID. getGlyph binary-searches the startCharCode array, so
     * that array is expected to be in ascending order.
     */
    static class CMapFormat12 extends CMap {

        int numGroups;
        int highBit = 0;   // index of the highest set bit of numGroups
        int power;         // 1 << highBit: largest power of 2 <= numGroups
        int extra;         // numGroups - power
        long[] startCharCode;
        long[] endCharCode;
        int[] startGlyphID;

        /*
         * Reads the format 12 subtable that begins at "offset" in "buffer":
         * the group count at offset+12 and the groups from offset+16. As a
         * side effect the position of "buffer" is set to offset+16.
         * Throws RuntimeException if "xlat" is non-null, if the group count
         * is not positive, or if it describes more groups than the buffer
         * holds.
         */
        CMapFormat12(ByteBuffer buffer, int offset, char[] xlat) {
            if (xlat != null) {
                throw new RuntimeException("xlat array for cmap fmt=12");
            }

            numGroups = buffer.getInt(offset+12);
            /* With no groups, "extra" below would be -1 and every lookup
             * would index startCharCode[-1]; a negative count cannot size an
             * array at all. Reject both here.
             */
            if (numGroups <= 0) {
                throw new RuntimeException("Invalid cmap subtable");
            }

            buffer.position(offset+16);
            buffer = buffer.slice();
            IntBuffer ibuffer = buffer.asIntBuffer();
            // Each group is three ints; check before sizing three arrays.
            if (numGroups > ibuffer.remaining() / 3) {
                throw new RuntimeException("Invalid cmap subtable");
            }

            startCharCode = new long[numGroups];
            endCharCode = new long[numGroups];
            startGlyphID = new int[numGroups];
            for (int i=0; i<numGroups; i++) {
                /* Mask with a long constant so the 32-bit values are taken
                 * as unsigned. An int mask is applied before the widening
                 * to long and so cannot clear the sign extension.
                 */
                startCharCode[i] = ibuffer.get() & 0xffffffffL;
                endCharCode[i] = ibuffer.get() & 0xffffffffL;
                startGlyphID[i] = ibuffer.get() & INTMASK;
            }

            /* numGroups is positive here, so it has a highest set bit. */
            highBit = 31 - Integer.numberOfLeadingZeros(numGroups);
            power = 1 << highBit;
            extra = numGroups - power;
        }

        /*
         * Maps "charCode" to a glyph code, or 0 when no group contains it.
         * Control codes are answered by getControlCodeGlyph first.
         */
        char getGlyph(int charCode) {
            int controlGlyph = getControlCodeGlyph(charCode, false);
            if (controlGlyph >= 0) {
                return (char)controlGlyph;
            }
            int probe = power;
            int range = 0;

            /* The search window is always a power of 2 wide. Choose whether
             * it starts at index 0 or at index "extra" so that the halving
             * steps below can never step past the last group.
             */
            if (startCharCode[extra] <= charCode) {
                range = extra;
            }

            while (probe > 1) {
                probe >>= 1;

                if (startCharCode[range+probe] <= charCode) {
                    range += probe;
                }
            }

            // "range" is now the candidate group; confirm it holds the code.
            if (startCharCode[range] <= charCode &&
                  endCharCode[range] >= charCode) {
                return (char)
                    (startGlyphID[range] + (charCode - startCharCode[range]));
            }

            return 0;
        }

    }

    /*
     * Used to substitute for bad Cmaps: a cmap in which every character
     * code resolves to glyph 0.
     */
    static class NullCMapClass extends CMap {

        /* Returns 0 whatever the value of "charCode". */
        char getGlyph(int charCode) {
            final char glyphZero = 0;
            return glyphZero;
        }
    }

    /* Single shared NullCMapClass instance, created when CMap is initialized. */
    public static final NullCMapClass theNullCmap = new NullCMapClass();

    /*
     * Answers character codes that are handled without consulting the
     * font's mapping data.
     *
     * Returns CharToGlyphMapper.INVISIBLE_GLYPH_ID for 0x0009, 0x000a,
     * 0x000d and for codes in 0x200c-0x200f, 0x2028-0x202e and
     * 0x206a-0x206f. Returns 0 for any other code >= 0xFFFF when
     * "noSurrogates" is true. Returns -1 in every other case, which tells
     * the caller to go on and do its normal lookup.
     */
    final int getControlCodeGlyph(int charCode, boolean noSurrogates) {
        if (charCode == 0x0009 || charCode == 0x000a || charCode == 0x000d) {
            return CharToGlyphMapper.INVISIBLE_GLYPH_ID;
        }

        // Nothing else below 0x200c is special.
        if (charCode < 0x200c) {
            return -1;
        }

        final boolean invisible =
            charCode <= 0x200f
            || (charCode >= 0x2028 && charCode <= 0x202e)
            || (charCode >= 0x206a && charCode <= 0x206f);
        if (invisible) {
            return CharToGlyphMapper.INVISIBLE_GLYPH_ID;
        }

        return (noSurrogates && charCode >= 0xFFFF) ? 0 : -1;
    }
}
Other Java examples (source code examples)

Here is a short list of links related to this Java CMap.java source code file:
Copyright 1998-2024 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.
... this post is sponsored by my books ...
#1 New Release!	FP Best Seller
Java example source code file (CMap.java)

Java - Java tags/keywords

The CMap.java Java example source code

Other Java examples (source code examples)

new blog posts