alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Java example source code file (EUC_TW.java)

This example Java source code file (EUC_TW.java) is included in the alvinalexander.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Learn more about this Java project at its project page.

Java - Java tags/keywords

ascii, bytebuffer, charbuffer, charsetdecoder, coderresult, codeset, decoder, encoder, euc_tw, nio, ss2, string, unmappable_decoding, unmappable_encoding, util

The EUC_TW.java Java example source code

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package sun.nio.cs.ext;

import java.io.*;
import java.nio.CharBuffer;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.util.Arrays;
import sun.nio.cs.HistoricallyNamedCharset;
import static sun.nio.cs.CharsetMapping.*;

public class EUC_TW extends Charset implements HistoricallyNamedCharset
{
    private static final int SS2 = 0x8E;

    /*
       (1) EUC_TW
       Second byte of EUC_TW for cs2 is in range of
       0xA1-0xB0 for plane 1-16. According to CJKV /163,
       plane1 is coded in both cs1 and cs2. This impl
       however does not decode the codepoints of plane1
       in cs2, so only p2-p7 and p15 are supported in cs2.

       Plane2  0xA2;
       Plane3  0xA3;
       Plane4  0xA4;
       Plane5  0xA5;
       Plane6  0xA6;
       Plane7  0xA7;
       Plane15 0xAF;

       (2) Mapping
       The fact that all supplementary characters encoded in EUC_TW are
       in 0x2xxxx range gives us the room to optimize the data tables.

       Decoding:
       (1) save the lower 16-bit value of all codepoints of b->c mapping
           in a String array table  String[plane] b2c.
       (2) save "codepoint is supplementary" info (one bit) in a
           byte[] b2cIsSupp, so 8 codepoints (same codepoint value, different
           plane No) share one byte.

       Encoding:
       (1)c->b mappings are stored in
          char[]c2b/char[]c2bIndex
          char[]c2bSupp/char[]c2bIndexsupp  (indexed by lower 16-bit
       (2)byte[] c2bPlane stores the "plane info" of each euc-tw codepoints,
          BMP and Supp share the low/high 4 bits of one byte.

       Mapping tables are stored separated in EUC_TWMapping, which
       is generated by tool.
     */

    public EUC_TW() {
        super("x-EUC-TW", ExtendedCharsets.aliasesFor("x-EUC-TW"));
    }

    public String historicalName() {
        return "EUC_TW";
    }

    public boolean contains(Charset cs) {
        return ((cs.name().equals("US-ASCII"))
                || (cs instanceof EUC_TW));
    }

    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    public CharsetEncoder newEncoder() {
        return new Encoder(this);
    }

    public static class Decoder extends CharsetDecoder {
        public Decoder(Charset cs) {
            super(cs, 2.0f, 2.0f);
        }

        char[] c1 = new char[1];
        char[] c2 = new char[2];
        public char[] toUnicode(int b1, int b2, int p) {
            return decode(b1, b2, p, c1, c2);
        }

        static final String[] b2c =  EUC_TWMapping.b2c;
        static final int b1Min    =  EUC_TWMapping.b1Min;
        static final int b1Max    =  EUC_TWMapping.b1Max;
        static final int b2Min    =  EUC_TWMapping.b2Min;
        static final int b2Max    =  EUC_TWMapping.b2Max;
        static final int dbSegSize = b2Max - b2Min + 1;
        static final byte[] b2cIsSupp;

        // adjust from cns planeNo to the plane index of b2c
        static final byte[] cnspToIndex = new byte[0x100];
        static {
            Arrays.fill(cnspToIndex, (byte)-1);
            cnspToIndex[0xa2] = 1; cnspToIndex[0xa3] = 2; cnspToIndex[0xa4] = 3;
            cnspToIndex[0xa5] = 4; cnspToIndex[0xa6] = 5; cnspToIndex[0xa7] = 6;
            cnspToIndex[0xaf] = 7;
        }

        //static final BitSet b2cIsSupp;
        static {
            String b2cIsSuppStr = EUC_TWMapping.b2cIsSuppStr;
            // work on a local copy is much faster than operate
            // directly on b2cIsSupp
            byte[] flag = new byte[b2cIsSuppStr.length() << 1];
            int off = 0;
            for (int i = 0; i < b2cIsSuppStr.length(); i++) {
                char c = b2cIsSuppStr.charAt(i);
                flag[off++] = (byte)(c >> 8);
                flag[off++] = (byte)(c & 0xff);
            }
            b2cIsSupp = flag;
        }

        static boolean isLegalDB(int b) {
           return b >= b1Min && b <= b1Max;
        }

        static char[] decode(int b1, int b2, int p, char[] c1, char[] c2)
        {
            if (b1 < b1Min || b1 > b1Max || b2 < b2Min || b2 > b2Max)
                return null;
            int index = (b1 - b1Min) * dbSegSize + b2 - b2Min;
            char c = b2c[p].charAt(index);
            if (c == UNMAPPABLE_DECODING)
                return null;
            if ((b2cIsSupp[index] & (1 << p)) == 0) {
                c1[0] = c;
                return c1;
            } else {
                c2[0] = Character.highSurrogate(0x20000 + c);
                c2[1] = Character.lowSurrogate(0x20000 + c);
                return c2;
            }
        }

        private CoderResult decodeArrayLoop(ByteBuffer src,
                                            CharBuffer dst)
        {
            byte[] sa = src.array();
            int sp = src.arrayOffset() + src.position();
            int sl = src.arrayOffset() + src.limit();

            char[] da = dst.array();
            int dp = dst.arrayOffset() + dst.position();
            int dl = dst.arrayOffset() + dst.limit();
            try {
                while (sp < sl) {
                    int byte1 = sa[sp] & 0xff;
                    if (byte1 == SS2) { // Codeset 2  G2
                        if ( sl - sp < 4)
                            return CoderResult.UNDERFLOW;
                        int cnsPlane = cnspToIndex[sa[sp + 1] & 0xff];
                        if (cnsPlane < 0)
                            return CoderResult.malformedForLength(2);
                        byte1 = sa[sp + 2] & 0xff;
                        int byte2 = sa[sp + 3] & 0xff;
                        char[] cc = toUnicode(byte1, byte2, cnsPlane);
                        if (cc == null) {
                            if (!isLegalDB(byte1) || !isLegalDB(byte2))
                                return CoderResult.malformedForLength(4);
                            return CoderResult.unmappableForLength(4);
                        }
                        if (dl - dp < cc.length)
                            return CoderResult.OVERFLOW;
                        if (cc.length == 1) {
                            da[dp++] = cc[0];
                        } else {
                            da[dp++] = cc[0];
                            da[dp++] = cc[1];
                        }
                        sp += 4;
                    } else if (byte1 < 0x80) {  // ASCII      G0
                        if (dl - dp < 1)
                           return CoderResult.OVERFLOW;
                        da[dp++] = (char) byte1;
                        sp++;
                    } else {                    // Codeset 1  G1
                        if ( sl - sp < 2)
                            return CoderResult.UNDERFLOW;
                        int byte2 = sa[sp + 1] & 0xff;
                        char[] cc = toUnicode(byte1, byte2, 0);
                        if (cc == null) {
                            if (!isLegalDB(byte1) || !isLegalDB(byte2))
                                return CoderResult.malformedForLength(1);
                            return CoderResult.unmappableForLength(2);
                        }
                        if (dl - dp < 1)
                            return CoderResult.OVERFLOW;
                        da[dp++] = cc[0];
                        sp += 2;
                    }
                }
                return CoderResult.UNDERFLOW;
            } finally {
                src.position(sp - src.arrayOffset());
                dst.position(dp - dst.arrayOffset());
            }
        }

        private CoderResult decodeBufferLoop(ByteBuffer src,
                                             CharBuffer dst)
        {
            int mark = src.position();
            try {
                while (src.hasRemaining()) {
                    int byte1 = src.get() & 0xff;
                    if (byte1 == SS2) {            // Codeset 2  G2
                        if ( src.remaining() < 3)
                            return CoderResult.UNDERFLOW;
                        int cnsPlane = cnspToIndex[src.get() & 0xff];
                        if (cnsPlane < 0)
                            return CoderResult.malformedForLength(2);
                        byte1 = src.get() & 0xff;
                        int byte2 = src.get() & 0xff;
                        char[] cc = toUnicode(byte1, byte2, cnsPlane);
                        if (cc == null) {
                            if (!isLegalDB(byte1) || !isLegalDB(byte2))
                                return CoderResult.malformedForLength(4);
                            return CoderResult.unmappableForLength(4);
                        }
                        if (dst.remaining() < cc.length)
                            return CoderResult.OVERFLOW;
                        if (cc.length == 1) {
                            dst.put(cc[0]);
                        } else {
                            dst.put(cc[0]);
                            dst.put(cc[1]);
                        }
                        mark += 4;
                    } else if (byte1 < 0x80) {        // ASCII      G0
                        if (!dst.hasRemaining())
                           return CoderResult.OVERFLOW;
                        dst.put((char) byte1);
                        mark++;
                    } else {                          // Codeset 1  G1
                        if (!src.hasRemaining())
                            return CoderResult.UNDERFLOW;
                        int byte2 = src.get() & 0xff;
                        char[] cc = toUnicode(byte1, byte2, 0);
                        if (cc == null) {
                            if (!isLegalDB(byte1) || !isLegalDB(byte2))
                                return CoderResult.malformedForLength(1);
                            return CoderResult.unmappableForLength(2);
                        }
                        if (!dst.hasRemaining())
                            return CoderResult.OVERFLOW;
                        dst.put(cc[0]);
                        mark +=2;
                    }
               }
               return CoderResult.UNDERFLOW;
            } finally {
                src.position(mark);
            }
        }

        protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst)
        {
            if (src.hasArray() && dst.hasArray())
                return decodeArrayLoop(src, dst);
            else
                return decodeBufferLoop(src, dst);
        }
    }

    public static class Encoder extends CharsetEncoder {
        private byte[] bb = new byte[4];

        public Encoder(Charset cs) {
            super(cs, 4.0f, 4.0f);
        }

        public boolean canEncode(char c) {
            return (c <= '\u007f' || toEUC(c, bb) != -1);
        }

        public boolean canEncode(CharSequence cs) {
            int i = 0;
            while (i < cs.length()) {
                char c = cs.charAt(i++);
                if (Character.isHighSurrogate(c)) {
                    if (i == cs.length())
                        return false;
                    char low = cs.charAt(i++);
                    if (!Character.isLowSurrogate(low) || toEUC(c, low, bb) == -1)
                        return false;
                } else if (!canEncode(c)) {
                    return false;
                }
            }
            return true;
        }

        public int toEUC(char hi, char low, byte[] bb) {
            return encode(hi, low, bb);
        }

        public int toEUC(char c, byte[] bb) {
            return encode(c, bb);
        }

        private CoderResult encodeArrayLoop(CharBuffer src,
                                            ByteBuffer dst)
        {
            char[] sa = src.array();
            int sp = src.arrayOffset() + src.position();
            int sl = src.arrayOffset() + src.limit();

            byte[] da = dst.array();
            int dp = dst.arrayOffset() + dst.position();
            int dl = dst.arrayOffset() + dst.limit();

            int inSize;
            int outSize;

            try {
                while (sp < sl) {
                    char c = sa[sp];
                    inSize = 1;
                    if (c < 0x80) {  // ASCII
                        bb[0] = (byte)c;
                        outSize = 1;
                    } else {
                        outSize = toEUC(c, bb);
                        if (outSize == -1) {
                            // to check surrogates only after BMP failed
                            // has the benefit of improving the BMP encoding
                            // 10% faster, with the price of the slowdown of
                            // supplementary character encoding. given the use
                            // of supplementary characters is really rare, this
                            // is something worth doing.
                            if (Character.isHighSurrogate(c)) {
                                if ((sp + 1) == sl)
                                    return CoderResult.UNDERFLOW;
                                if (!Character.isLowSurrogate(sa[sp + 1]))
                                    return CoderResult.malformedForLength(1);
                                outSize = toEUC(c, sa[sp+1], bb);
                                    inSize = 2;
                            } else if (Character.isLowSurrogate(c)) {
                                return CoderResult.malformedForLength(1);
                            }
                        }
                    }
                    if (outSize == -1)
                        return CoderResult.unmappableForLength(inSize);
                    if ( dl - dp < outSize)
                        return CoderResult.OVERFLOW;
                    for (int i = 0; i < outSize; i++)
                        da[dp++] = bb[i];
                    sp  += inSize;
                }
                return CoderResult.UNDERFLOW;
            } finally {
                src.position(sp - src.arrayOffset());
                dst.position(dp - dst.arrayOffset());
            }
        }

        private CoderResult encodeBufferLoop(CharBuffer src,
                                             ByteBuffer dst)
        {
            int outSize;
            int inSize;
            int mark = src.position();

            try {
                while (src.hasRemaining()) {
                    inSize = 1;
                    char c = src.get();
                    if (c < 0x80) {   // ASCII
                        outSize = 1;
                        bb[0] = (byte)c;
                    } else {
                        outSize = toEUC(c, bb);
                        if (outSize == -1) {
                            if (Character.isHighSurrogate(c)) {
                                if (!src.hasRemaining())
                                    return CoderResult.UNDERFLOW;
                                char c2 = src.get();
                                if (!Character.isLowSurrogate(c2))
                                    return CoderResult.malformedForLength(1);
                                outSize = toEUC(c, c2, bb);
                                inSize = 2;
                            } else if (Character.isLowSurrogate(c)) {
                                return CoderResult.malformedForLength(1);
                            }
                        }
                    }
                    if (outSize == -1)
                        return CoderResult.unmappableForLength(inSize);
                    if (dst.remaining() < outSize)
                        return CoderResult.OVERFLOW;
                    for (int i = 0; i < outSize; i++)
                        dst.put(bb[i]);
                    mark += inSize;
                }
                return CoderResult.UNDERFLOW;
            } finally {
                src.position(mark);
            }
        }

        protected CoderResult encodeLoop(CharBuffer src, ByteBuffer dst)
        {
            if (src.hasArray() && dst.hasArray())
                return encodeArrayLoop(src, dst);
            else
                return encodeBufferLoop(src, dst);
        }

        static int encode(char hi, char low, byte[] bb) {
            int c = Character.toCodePoint(hi, low);
            if ((c & 0xf0000) != 0x20000)
                return -1;
            c -= 0x20000;
            int index = c2bSuppIndex[c >> 8];
            if (index  == UNMAPPABLE_ENCODING)
                return -1;
            index = index + (c & 0xff);
            int db = c2bSupp[index];
            if (db == UNMAPPABLE_ENCODING)
                return -1;
            int p = (c2bPlane[index] >> 4) & 0xf;
            bb[0] = (byte)SS2;
            bb[1] = (byte)(0xa0 | p);
            bb[2] = (byte)(db >> 8);
            bb[3] = (byte)db;
            return 4;
        }

        static int encode(char c, byte[] bb) {
            int index = c2bIndex[c >> 8];
            if (index  == UNMAPPABLE_ENCODING)
                return -1;
            index = index + (c & 0xff);
            int db = c2b[index];
            if (db == UNMAPPABLE_ENCODING)
                return -1;
            int p = c2bPlane[index] & 0xf;
            if (p == 0) {
                bb[0] = (byte)(db >> 8);
                bb[1] = (byte)db;
                return 2;
            } else {
                bb[0] = (byte)SS2;
                bb[1] = (byte)(0xa0 | p);
                bb[2] = (byte)(db >> 8);
                bb[3] = (byte)db;
                return 4;
            }
        }

        static final char[] c2b;
        static final char[] c2bIndex;
        static final char[] c2bSupp;
        static final char[] c2bSuppIndex;
        static final byte[] c2bPlane;
        static {
            int b1Min    =  Decoder.b1Min;
            int b1Max    =  Decoder.b1Max;
            int b2Min    =  Decoder.b2Min;
            int b2Max    =  Decoder.b2Max;
            int dbSegSize = Decoder.dbSegSize;
            String[] b2c = Decoder.b2c;
            byte[] b2cIsSupp = Decoder.b2cIsSupp;

            c2bIndex = EUC_TWMapping.c2bIndex;
            c2bSuppIndex = EUC_TWMapping.c2bSuppIndex;
            char[] c2b0 = new char[EUC_TWMapping.C2BSIZE];
            char[] c2bSupp0 = new char[EUC_TWMapping.C2BSUPPSIZE];
            byte[] c2bPlane0 = new byte[Math.max(EUC_TWMapping.C2BSIZE,
                                                 EUC_TWMapping.C2BSUPPSIZE)];

            Arrays.fill(c2b0, (char)UNMAPPABLE_ENCODING);
            Arrays.fill(c2bSupp0, (char)UNMAPPABLE_ENCODING);

            for (int p = 0; p < b2c.length; p++) {
                String db = b2c[p];
                /*
                   adjust the "plane" from 0..7 to 0, 2, 3, 4, 5, 6, 7, 0xf,
                   which helps balance between footprint (to save the plane
                   info in 4 bits) and runtime performance (to require only
                   one operation "0xa0 | plane" to encode the plane byte)
                */
                int plane = p;
                if (plane == 7)
                    plane = 0xf;
                else if (plane != 0)
                    plane = p + 1;

                int off = 0;
                for (int b1 = b1Min; b1 <= b1Max; b1++) {
                    for (int b2 = b2Min; b2 <= b2Max; b2++) {
                        char c = db.charAt(off);
                        if (c != UNMAPPABLE_DECODING) {
                            if ((b2cIsSupp[off] & (1 << p)) != 0) {
                                int index = c2bSuppIndex[c >> 8] + (c&0xff);
                                c2bSupp0[index] = (char)((b1 << 8) + b2);
                                c2bPlane0[index] |= (byte)(plane << 4);
                            } else {
                                int index = c2bIndex[c >> 8] + (c&0xff);
                                c2b0[index] = (char)((b1 << 8) + b2);
                                c2bPlane0[index] |= (byte)plane;
                            }
                        }
                        off++;
                    }
                }
            }
            c2b = c2b0;
            c2bSupp = c2bSupp0;
            c2bPlane = c2bPlane0;
        }
    }
}

Other Java examples (source code examples)

Here is a short list of links related to this Java EUC_TW.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.