Java example source code file (XmlReader.java)

This example Java source code file (XmlReader.java) is included in the alvinalexander.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" ^TM.

Learn more about this Java project at its project page.
Java - Java tags/keywords

basereader, charconversionexception, ioexception, iso8859_1reader, maxpushback, pushbackinputstream, reader, string, stringbuffer, unicode, utf-8, utf8reader, utf-16, util, xmlreader
The XmlReader.java Java example source code

/*
 * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.xml.internal.dtdparser;

import java.io.ByteArrayInputStream;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Hashtable;
import java.util.Locale;


// NOTE:  Add I18N support to this class when JDK gets the ability to
// defer selection of locale for exception messages ... use the same
// technique for both.


/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings.  The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec.  This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 * <p/>
 * <P> Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding.  For example,
 * it should never be used on MIME text/xml entities.
 * <p/>
 * <P> Note that XML processors are only required to support UTF-8 and
 * UTF-16 character encodings.  Autodetection permits the underlying Java
 * implementation to provide support for many other encodings, such as
 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @version 1.3 00/02/24
 */
// package private
final class XmlReader extends Reader {
    private static final int MAXPUSHBACK = 512;

    private Reader in;
    private String assignedEncoding;
    private boolean closed;

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text.  It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It's got to do this efficiently:  character I/O is solidly on the
    // critical path.  (So keep buffer length over 2 Kbytes to avoid
    // excess buffering. Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

    /**
     * Constructs the reader from an input stream, autodetecting
     * the encoding to use according to the heuristic specified
     * in the XML 1.0 recommendation.
     *
     * @param in the input stream from which the reader is constructed
     * @throws IOException on error, such as unrecognized encoding
     */
    public static Reader createReader(InputStream in) throws IOException {
        // No encoding name was supplied, so hand the raw stream to the
        // private constructor, which peeks at the leading bytes to pick
        // the encoding, and expose the result as a plain Reader.
        final Reader autodetecting = new XmlReader(in);
        return autodetecting;
    }

    /**
     * Creates a reader supporting the given encoding, mapping
     * from standard encoding names to ones that are understood by
     * Java where necessary.
     *
     * @param in       the input stream from which the reader is constructed
     * @param encoding the IETF standard name of the encoding to use;
     *                 if null, autodetection is used.
     * @throws IOException on error, including unrecognized encoding
     */
    public static Reader createReader(InputStream in, String encoding)
            throws IOException {
        // A null name is the caller's way of asking for autodetection.
        if (encoding == null) {
            return new XmlReader(in);
        }

        final Reader reader;
        if (encoding.equalsIgnoreCase("UTF-8")
                || encoding.equalsIgnoreCase("UTF8")) {
            // Dedicated UTF-8 reader instead of the generic JDK one.
            reader = new Utf8Reader(in);
        } else if (encoding.equalsIgnoreCase("US-ASCII")
                || encoding.equalsIgnoreCase("ASCII")) {
            reader = new AsciiReader(in);
        } else if (encoding.equalsIgnoreCase("ISO-8859-1")) {
            // ISO-8859-1 has numerous aliases; only the canonical
            // name is recognized here.
            reader = new Iso8859_1Reader(in);
        } else {
            //
            // Everything else goes to the JDK, after translating the
            // standard name into one the JDK knows.  Ideally this would
            // be driven by an administerable resource (for example a
            // property file such as "readers/mapping.props") mapping
            // encoding names/aliases to reader classnames, with this
            // call used only when no better choice is available.
            //
            reader = new InputStreamReader(in, std2java(encoding));
        }
        return reader;
    }

    //
    // JDK doesn't know all of the standard encoding names, and
    // in particular none of the EBCDIC ones IANA defines (and
    // which IBM encourages).
    //
    private static final Hashtable charsets = new Hashtable(31);

    static {
        // Each row is { standard (IANA) name in upper case, JDK name }.
        // The rows are loaded into the lookup table by the loop below.
        final String[][] standardToJdk = {
            { "UTF-16", "Unicode" },
            { "ISO-10646-UCS-2", "Unicode" },

            // NOTE: no support for ISO-10646-UCS-4 yet.

            { "EBCDIC-CP-US", "cp037" },
            { "EBCDIC-CP-CA", "cp037" },
            { "EBCDIC-CP-NL", "cp037" },
            { "EBCDIC-CP-WT", "cp037" },

            { "EBCDIC-CP-DK", "cp277" },
            { "EBCDIC-CP-NO", "cp277" },
            { "EBCDIC-CP-FI", "cp278" },
            { "EBCDIC-CP-SE", "cp278" },

            { "EBCDIC-CP-IT", "cp280" },
            { "EBCDIC-CP-ES", "cp284" },
            { "EBCDIC-CP-GB", "cp285" },
            { "EBCDIC-CP-FR", "cp297" },

            { "EBCDIC-CP-AR1", "cp420" },
            { "EBCDIC-CP-HE", "cp424" },
            { "EBCDIC-CP-BE", "cp500" },
            { "EBCDIC-CP-CH", "cp500" },

            { "EBCDIC-CP-ROECE", "cp870" },
            { "EBCDIC-CP-YU", "cp870" },
            { "EBCDIC-CP-IS", "cp871" },
            { "EBCDIC-CP-AR2", "cp918" },

            // IANA also defines two that JDK 1.2 doesn't handle:
            //    EBCDIC-CP-GR        --> CP423
            //    EBCDIC-CP-TR        --> CP905
        };

        for (int row = 0; row < standardToJdk.length; row++) {
            charsets.put(standardToJdk[row][0], standardToJdk[row][1]);
        }
    }

    // returns an encoding name supported by JDK >= 1.1.6
    // for some cases required by the XML spec
    private static String std2java(String encoding) {
        // The keys in the charsets table are upper-case ASCII, so the
        // lookup key must be upper-cased independently of the default
        // locale.  With the no-argument toUpperCase() a Turkish default
        // locale turns 'i' into a dotted capital I (U+0130), so a name
        // such as "iso-10646-ucs-2" would never match its table entry.
        final String lookupKey = encoding.toUpperCase(Locale.ROOT);
        final String jdkName = (String) charsets.get(lookupKey);

        // Names without a table entry are passed through unchanged so
        // the JDK can try to resolve them itself.
        return jdkName != null ? jdkName : encoding;
    }

    /**
     * Returns the standard name of the encoding in use
     */
    public String getEncoding() {
        // Plain accessor: report whatever name is currently held in
        // the assignedEncoding field.
        return this.assignedEncoding;
    }

    private XmlReader(InputStream stream) throws IOException {
        super(stream);

        PushbackInputStream pb;
        byte buf [];
        int len;

        if (stream instanceof PushbackInputStream)
            pb = (PushbackInputStream) stream;
        else
            pb = new PushbackInputStream(stream, MAXPUSHBACK);

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.
        //
        buf = new byte[4];
        len = pb.read(buf);
        if (len > 0)
            pb.unread(buf, 0, len);

        if (len == 4)
            switch (buf[0] & 0x0ff) {
            case 0:
                // 00 3c 00 3f == illegal UTF-16 big-endian
                if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                    setEncoding(pb, "UnicodeBig");
                    return;
                }
                // else it's probably UCS-4
                break;

            case '<':      // 0x3c: the most common cases!
                switch (buf[1] & 0x0ff) {
                // First character is '<'; could be XML without
                // an XML directive such as "<hello>", "