alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Java example source code file (Punycode.java)

This example Java source code file (Punycode.java) is included in the alvinalexander.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Learn more about this Java project at its project page.

Java - Java tags/keywords

capital_a, capital_z, damp, delimiter, hyphen, illegal, indexoutofboundsexception, initial_n, max_cp_count, parseexception, stringbuffer, text, tmax, tmin, zero

The Punycode.java Java example source code

/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * Copyright (C) 2003-2004, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
//
// CHANGELOG
//      2005-05-19 Edward Wang
//          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
//          - move from package com.ibm.icu.text to package sun.net.idn
//          - use ParseException instead of StringPrepParseException
//      2007-08-14 Martin Buchholz
//          - remove redundant casts
//
package sun.net.idn;

import java.text.ParseException;
import sun.text.normalizer.UCharacter;
import sun.text.normalizer.UTF16;

/**
 * Ported code from ICU punycode.c
 * @author ram
 */

/* Package Private class */
public final class Punycode {

    /* Punycode parameters for Bootstring */
    private static final int BASE           = 36;
    private static final int TMIN           = 1;
    private static final int TMAX           = 26;
    private static final int SKEW           = 38;
    private static final int DAMP           = 700;
    private static final int INITIAL_BIAS   = 72;
    private static final int INITIAL_N      = 0x80;

    /* "Basic" Unicode/ASCII code points */
    private static final int HYPHEN         = 0x2d;
    private static final int DELIMITER      = HYPHEN;

    private static final int ZERO           = 0x30;
    private static final int NINE           = 0x39;

    private static final int SMALL_A        = 0x61;
    private static final int SMALL_Z        = 0x7a;

    private static final int CAPITAL_A      = 0x41;
    private static final int CAPITAL_Z      = 0x5a;

    //  TODO: eliminate the 256 limitation
    private static final int MAX_CP_COUNT   = 256;

    private static final int UINT_MAGIC     = 0x80000000;
    private static final long ULONG_MAGIC   = 0x8000000000000000L;

    private static int adaptBias(int delta, int length, boolean firstTime){
        if(firstTime){
            delta /=DAMP;
        }else{
            delta /=  2;
        }
        delta += delta/length;

        int count=0;
        for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
            delta/=(BASE-TMIN);
        }

        return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
    }

    /**
     * basicToDigit[] contains the numeric value of a basic code
     * point (for use in representing integers) in the range 0 to
     * BASE-1, or -1 if b is does not represent a value.
     */
    static final int[]    basicToDigit= new int[]{
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };

    private static char asciiCaseMap(char b, boolean uppercase) {
        if(uppercase) {
            if(SMALL_A<=b && b<=SMALL_Z) {
                b-=(SMALL_A-CAPITAL_A);
            }
        } else {
            if(CAPITAL_A<=b && b<=CAPITAL_Z) {
                b+=(SMALL_A-CAPITAL_A);
            }
        }
        return b;
    }

    /**
     * digitToBasic() returns the basic code point whose value
     * (when used for representing integers) is d, which must be in the
     * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
     * nonzero, in which case the uppercase form is used.
     */
    private static char digitToBasic(int digit, boolean uppercase) {
        /*  0..25 map to ASCII a..z or A..Z */
        /* 26..35 map to ASCII 0..9         */
        if(digit<26) {
            if(uppercase) {
                return (char)(CAPITAL_A+digit);
            } else {
                return (char)(SMALL_A+digit);
            }
        } else {
            return (char)((ZERO-26)+digit);
        }
    }
    /**
     * Converts Unicode to Punycode.
     * The input string must not contain single, unpaired surrogates.
     * The output will be represented as an array of ASCII code points.
     *
     * @param src
     * @param caseFlags
     * @return
     * @throws ParseException
     */
    public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{

        int[] cpBuffer = new int[MAX_CP_COUNT];
        int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
        char c, c2;
        int srcLength = src.length();
        int destCapacity = MAX_CP_COUNT;
        char[] dest = new char[destCapacity];
        StringBuffer result = new StringBuffer();
        /*
         * Handle the basic code points and
         * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
         */
        srcCPCount=destLength=0;

        for(j=0; j<srcLength; ++j) {
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                throw new IndexOutOfBoundsException();
            }
            c=src.charAt(j);
            if(isBasic(c)) {
                if(destLength<destCapacity) {
                    cpBuffer[srcCPCount++]=0;
                    dest[destLength]=
                        caseFlags!=null ?
                            asciiCaseMap(c, caseFlags[j]) :
                            c;
                }
                ++destLength;
            } else {
                n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L;
                if(!UTF16.isSurrogate(c)) {
                    n|=c;
                } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
                    ++j;

                    n|=UCharacter.getCodePoint(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    throw new ParseException("Illegal char found", -1);
                }
                cpBuffer[srcCPCount++]=n;
            }
        }

        /* Finish the basic string - if it is not empty - with a delimiter. */
        basicLength=destLength;
        if(basicLength>0) {
            if(destLength<destCapacity) {
                dest[destLength]=DELIMITER;
            }
            ++destLength;
        }

        /*
         * handledCPCount is the number of code points that have been handled
         * basicLength is the number of basic code points
         * destLength is the number of chars that have been output
         */

        /* Initialize the state: */
        n=INITIAL_N;
        delta=0;
        bias=INITIAL_BIAS;

        /* Main encoding loop: */
        for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
            /*
             * All non-basic code points < n have been handled already.
             * Find the next larger one:
             */
            for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
                q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
                if(n<=q && q, but guard against overflow:
             */
            if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
                throw new RuntimeException("Internal program error");
            }
            delta+=(m-n)*(handledCPCount+1);
            n=m;

            /* Encode a sequence of same code points n */
            for(j=0; j<srcCPCount; ++j) {
                q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
                if(q<n) {
                    ++delta;
                } else if(q==n) {
                    /* Represent delta as a generalized variable-length integer: */
                    for(q=delta, k=BASE; /* no condition */; k+=BASE) {

                        /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt

                        t=k-bias;
                        if(t<TMIN) {
                            t=TMIN;
                        } else if(t>TMAX) {
                            t=TMAX;
                        }
                        */

                        t=k-bias;
                        if(t<TMIN) {
                            t=TMIN;
                        } else if(k>=(bias+TMAX)) {
                            t=TMAX;
                        }

                        if(q<t) {
                            break;
                        }

                        if(destLength<destCapacity) {
                            dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), false);
                        }
                        q=(q-t)/(BASE-t);
                    }

                    if(destLength<destCapacity) {
                        dest[destLength++]=digitToBasic(q, (cpBuffer[j]<0));
                    }
                    bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
                    delta=0;
                    ++handledCPCount;
                }
            }

            ++delta;
            ++n;
        }

        return result.append(dest, 0, destLength);
    }

    private static boolean isBasic(int ch){
        return (ch < INITIAL_N);
    }

    private static boolean isBasicUpperCase(int ch){
        return( CAPITAL_A <= ch && ch <= CAPITAL_Z);
    }

    private static boolean isSurrogate(int ch){
        return (((ch)&0xfffff800)==0xd800);
    }
    /**
     * Converts Punycode to Unicode.
     * The Unicode string will be at most as long as the Punycode string.
     *
     * @param src
     * @param caseFlags
     * @return
     * @throws ParseException
     */
    public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
                               throws ParseException{
        int srcLength = src.length();
        StringBuffer result = new StringBuffer();
        int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
                destCPCount, firstSupplementaryIndex, cpLength;
        char b;
        int destCapacity = MAX_CP_COUNT;
        char[] dest = new char[destCapacity];

        /*
         * Handle the basic code points:
         * Let basicLength be the number of input code points
         * before the last delimiter, or 0 if there is none,
         * then copy the first basicLength code points to the output.
         *
         * The two following loops iterate backward.
         */
        for(j=srcLength; j>0;) {
            if(src.charAt(--j)==DELIMITER) {
                break;
            }
        }
        destLength=basicLength=destCPCount=j;

        while(j>0) {
            b=src.charAt(--j);
            if(!isBasic(b)) {
                throw new ParseException("Illegal char found", -1);
            }

            if(j<destCapacity) {
                dest[j]= b;

                if(caseFlags!=null) {
                    caseFlags[j]=isBasicUpperCase(b);
                }
            }
        }

        /* Initialize the state: */
        n=INITIAL_N;
        i=0;
        bias=INITIAL_BIAS;
        firstSupplementaryIndex=1000000000;

        /*
         * Main decoding loop:
         * Start just after the last delimiter if any
         * basic code points were copied; start at the beginning otherwise.
         */
        for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
            /*
             * in is the index of the next character to be consumed, and
             * destCPCount is the number of code points in the output array.
             *
             * Decode a generalized variable-length integer into delta,
             * which gets added to i.  The overflow checking is easier
             * if we increase i as we go, then subtract off its starting
             * value at the end to obtain delta.
             */
            for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
                if(in>=srcLength) {
                    throw new ParseException("Illegal char found", -1);
                }

                digit=basicToDigit[(byte)src.charAt(in++)];
                if(digit<0) {
                    throw new ParseException("Invalid char found", -1);
                }
                if(digit>(0x7fffffff-i)/w) {
                    /* integer overflow */
                    throw new ParseException("Illegal char found", -1);
                }

                i+=digit*w;
                t=k-bias;
                if(t<TMIN) {
                    t=TMIN;
                } else if(k>=(bias+TMAX)) {
                    t=TMAX;
                }
                if(digit<t) {
                    break;
                }

                if(w>0x7fffffff/(BASE-t)) {
                    /* integer overflow */
                    throw new ParseException("Illegal char found", -1);
                }
                w*=BASE-t;
            }

            /*
             * Modification from sample code:
             * Increments destCPCount here,
             * where needed instead of in for() loop tail.
             */
            ++destCPCount;
            bias=adaptBias(i-oldi, destCPCount, (oldi==0));

            /*
             * i was supposed to wrap around from (incremented) destCPCount to 0,
             * incrementing n each time, so we'll fix that now:
             */
            if(i/destCPCount>(0x7fffffff-n)) {
                /* integer overflow */
                throw new ParseException("Illegal char found", -1);
            }

            n+=i/destCPCount;
            i%=destCPCount;
            /* not needed for Punycode: */
            /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

            if(n>0x10ffff || isSurrogate(n)) {
                /* Unicode code point overflow */
                throw new ParseException("Illegal char found", -1);
            }

            /* Insert n at position i of the output: */
            cpLength=UTF16.getCharCount(n);
            if((destLength+cpLength)<destCapacity) {
                int codeUnitIndex;

                /*
                 * Handle indexes when supplementary code points are present.
                 *
                 * In almost all cases, there will be only BMP code points before i
                 * and even in the entire string.
                 * This is handled with the same efficiency as with UTF-32.
                 *
                 * Only the rare cases with supplementary code points are handled
                 * more slowly - but not too bad since this is an insertion anyway.
                 */
                if(i<=firstSupplementaryIndex) {
                    codeUnitIndex=i;
                    if(cpLength>1) {
                        firstSupplementaryIndex=codeUnitIndex;
                    } else {
                        ++firstSupplementaryIndex;
                    }
                } else {
                    codeUnitIndex=firstSupplementaryIndex;
                    codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex);
                }

                /* use the UChar index codeUnitIndex instead of the code point index i */
                if(codeUnitIndex<destLength) {
                    System.arraycopy(dest, codeUnitIndex,
                                     dest, codeUnitIndex+cpLength,
                                    (destLength-codeUnitIndex));
                    if(caseFlags!=null) {
                        System.arraycopy(caseFlags, codeUnitIndex,
                                         caseFlags, codeUnitIndex+cpLength,
                                         destLength-codeUnitIndex);
                    }
                }
                if(cpLength==1) {
                    /* BMP, insert one code unit */
                    dest[codeUnitIndex]=(char)n;
                } else {
                    /* supplementary character, insert two code units */
                    dest[codeUnitIndex]=UTF16.getLeadSurrogate(n);
                    dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n);
                }
                if(caseFlags!=null) {
                    /* Case of last character determines uppercase flag: */
                    caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
                    if(cpLength==2) {
                        caseFlags[codeUnitIndex+1]=false;
                    }
                }
            }
            destLength+=cpLength;
            ++i;
        }
        result.append(dest, 0, destLength);
        return result;
    }
}

Other Java examples (source code examples)

Here is a short list of links related to this Java Punycode.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.