alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Java example source code file (Utf8Test.java)

This example Java source code file (Utf8Test.java) is included in the alvinalexander.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Learn more about this Java project at its project page.

Java - Java tags/keywords

expected_one_byte_roundtrippable_count, expected_three_byte_roundtrippable_count, expected_two_byte_roundtrippable_count, gwtincompatible, hashmap, ill_formed_strings, integer, min_high_surrogate, min_low_surrogate, one_byte_roundtrippable_characters, stringbuilder, three_byte_roundtrippable_characters, three_byte_surrogates, two_byte_roundtrippable_characters, util

The Utf8Test.java Java example source code

/*
 * Copyright (C) 2013 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.base;

import static com.google.common.truth.Truth.assertThat;
import static java.lang.Character.MAX_CODE_POINT;
import static java.lang.Character.MAX_HIGH_SURROGATE;
import static java.lang.Character.MAX_LOW_SURROGATE;
import static java.lang.Character.MIN_HIGH_SURROGATE;
import static java.lang.Character.MIN_LOW_SURROGATE;
import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;

import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;
import com.google.common.collect.ImmutableList;

import junit.framework.TestCase;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Random;

/**
 * Unit tests for {@link Utf8}.
 *
 * @author Jon Perlow
 * @author Martin Buchholz
 * @author Clément Roux
 */
@GwtCompatible(emulated = true)
public class Utf8Test extends TestCase {

  /**
   * Strings that each contain at least one unpaired surrogate: every boundary surrogate on its
   * own, next to an ordinary character on either side, and doubled, plus a low surrogate
   * followed by a high surrogate. Currently not referenced by any test in this class.
   */
  private static final ImmutableList<String> ILL_FORMED_STRINGS;
  static {
    ImmutableList.Builder<String> builder = ImmutableList.builder();
    char[] surrogates = {
      MAX_LOW_SURROGATE,
      MAX_HIGH_SURROGATE,
      MIN_LOW_SURROGATE,
      MIN_HIGH_SURROGATE,
    };
    for (char surrogate : surrogates) {
      builder.add(newString(surrogate));
      builder.add(newString(surrogate, 'n'));
      builder.add(newString('n', surrogate));
      builder.add(newString(surrogate, surrogate));
    }
    builder.add(newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE));
    ILL_FORMED_STRINGS = builder.build();
  }

  /** Checks {@code Utf8.encodedLength} against hand-counted lengths of well-formed strings. */
  public void testEncodedLength_validStrings() {
    assertEquals(0, Utf8.encodedLength(""));
    assertEquals(11, Utf8.encodedLength("Hello world"));
    assertEquals(8, Utf8.encodedLength("Résumé"));
    // The punctuation in this sample must be the full-width forms (（ ， ） ；), which take three
    // bytes each. With ASCII punctuation the string encodes to 447 bytes instead of 461.
    assertEquals(461, Utf8.encodedLength("威廉·莎士比亞（William Shakespeare，"
        + "1564年4月26號—1616年4月23號[1]）係隻英國嗰演員、劇作家同詩人，"
        + "有時間佢簡稱莎翁；中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
        + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
        + "狹斯丕爾。[2]莎士比亞編寫過好多作品，佢嗰劇作響西洋文學好有影響，"
        + "哈都拕人翻譯做好多話。"));
    // A surrogate pair
    assertEquals(4, Utf8.encodedLength(newString(MIN_HIGH_SURROGATE, MIN_LOW_SURROGATE)));
  }

  /**
   * Builds random six-code-point strings out of code points that sit on the boundaries between
   * encoded lengths and checks the running expected length after every append.
   */
  public void testEncodedLength_validStrings2() {
    // Maps each boundary code point to the number of bytes it is expected to encode to.
    HashMap<Integer, Integer> utf8Lengths = new HashMap<Integer, Integer>();
    utf8Lengths.put(0x00, 1);
    utf8Lengths.put(0x7f, 1);
    utf8Lengths.put(0x80, 2);
    utf8Lengths.put(0x7ff, 2);
    utf8Lengths.put(0x800, 3);
    utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
    utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT, 4);
    utf8Lengths.put(MAX_CODE_POINT, 4);

    Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[0]);
    StringBuilder sb = new StringBuilder();
    Random rnd = new Random();
    for (int trial = 0; trial < 100; trial++) {
      sb.setLength(0);
      int utf8Length = 0;
      for (int i = 0; i < 6; i++) {
        Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
        sb.appendCodePoint(randomCodePoint);
        utf8Length += utf8Lengths.get(randomCodePoint);
        int actualLength = Utf8.encodedLength(sb);
        if (utf8Length != actualLength) {
          // The input is random, so only on failure build a message listing the UTF-16 code
          // units, which lets the failing string be reproduced.
          StringBuilder repro = new StringBuilder();
          for (int j = 0; j < sb.length(); j++) {
            repro.append(" " + (int) sb.charAt(j));  // GWT compatible
          }
          assertEquals(repro.toString(), utf8Length, actualLength);
        }
      }
    }
  }

  /** Checks that unpaired surrogates are rejected and reported at the right index. */
  public void testEncodedLength_invalidStrings() {
    testEncodedLengthFails(newString(MIN_HIGH_SURROGATE), 0);
    testEncodedLengthFails("foobar" + newString(MIN_HIGH_SURROGATE), 6);
    testEncodedLengthFails(newString(MIN_LOW_SURROGATE), 0);
    testEncodedLengthFails("foobar" + newString(MIN_LOW_SURROGATE), 6);
    testEncodedLengthFails(newString(MIN_HIGH_SURROGATE, MIN_HIGH_SURROGATE), 0);
  }

  /**
   * Asserts that {@code Utf8.encodedLength} throws {@link IllegalArgumentException} for the given
   * string, with a message naming the given index.
   *
   * @param invalidString a string expected to be rejected
   * @param invalidCodePointIndex the index expected in the exception message
   */
  private static void testEncodedLengthFails(String invalidString, int invalidCodePointIndex) {
    try {
      Utf8.encodedLength(invalidString);
      fail("Expected IllegalArgumentException for unpaired surrogate at index "
          + invalidCodePointIndex);
    } catch (IllegalArgumentException expected) {
      assertThat(expected).hasMessage("Unpaired surrogate at index " + invalidCodePointIndex);
    }
  }

  // 128 - [chars 0x0000 to 0x007f]
  private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x007f - 0x0000 + 1;

  // 128
  private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 1920 [chars 0x0080 to 0x07FF]
  private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x07FF - 0x0080 + 1;

  // 18,304
  private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
      // Both bytes are one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
      // The possible number of two byte characters
      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 2048
  private static final long THREE_BYTE_SURROGATES = 2 * 1024;

  // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
  private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;

  // 2,650,112
  private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
      // All one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
      // One two byte character and a one byte character
      2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Three byte characters
      THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  // 1,048,576 [chars 0x10000L to 0x10FFFF]
  private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
      0x10FFFF - 0x10000L + 1;

  // 383,270,912
  private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
      // All one byte characters
      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
      // One and three byte characters
      2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Two two byte characters
      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Permutations of one and two byte characters
      3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
      // Four byte characters
      FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;

  /** Tests that round tripping of all one byte permutations work. */
  @GwtIncompatible // java.nio.charset.Charset
  public void testIsWellFormed_1Byte() {
    testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
  }

  /** Tests that round tripping of all two byte permutations work. */
  @GwtIncompatible // java.nio.charset.Charset
  public void testIsWellFormed_2Bytes() {
    testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
  }

  /** Tests that round tripping of all three byte permutations work. */
  @GwtIncompatible // java.nio.charset.Charset
  public void testIsWellFormed_3Bytes() {
    testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
  }

  /**
   * Tests that round tripping of a sample of four byte permutations work.
   * All permutations are prohibitively expensive to test for automated runs.
   * This method tests specific four-byte cases.
   */
  public void testIsWellFormed_4BytesSamples() {
    // Valid 4 byte.
    assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
    // Bad trailing bytes
    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
    // Special cases for byte2
    assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
    assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
  }

  /** Tests some hard-coded test cases. */
  public void testSomeSequences() {
    // Empty
    assertWellFormed();
    // One-byte characters, including control characters
    assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
    // Two-byte characters
    assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
    // A two-byte character followed by a one-byte character, twice
    assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
    // Three-byte characters: U+20AC twice
    assertWellFormed(0xE2, 0x82, 0xAC, 0xE2, 0x82, 0xAC);
    // A two-byte character followed by two one-byte characters, twice
    // "\u024B62\u024B62"
    assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
    // Four-byte characters: U+24B62 twice
    assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2, 0xF0, 0xA4, 0xAD, 0xA2);
    // Mixed string
    // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
    assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
        0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
        0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
    // Not a valid string
    assertNotWellFormed(-1, 0, -1, 0);
  }

  /** Checks that the per-shard table sums to the computed four byte expectation. */
  public void testShardsHaveExpectedRoundTrippables() {
    // A sanity check.
    long actual = 0;
    for (long expected : generateFourByteShardsExpectedRunnables()) {
      actual += expected;
    }
    assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
  }

  /** Returns a new string made of exactly the given UTF-16 code units. */
  private static String newString(char... chars) {
    return new String(chars);
  }

  /**
   * Narrows each int to a byte, so that byte values can be written as unsigned literals
   * such as {@code 0xF0}.
   */
  private static byte[] toByteArray(int... bytes) {
    byte[] realBytes = new byte[bytes.length];
    for (int i = 0; i < bytes.length; i++) {
      realBytes[i] = (byte) bytes[i];
    }
    return realBytes;
  }

  /** Asserts that {@code Utf8.isWellFormed} accepts the given byte sequence. */
  private static void assertWellFormed(int... bytes) {
    assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
  }

  /** Asserts that {@code Utf8.isWellFormed} rejects the given byte sequence. */
  private static void assertNotWellFormed(int... bytes) {
    assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
  }

  /**
   * Returns the hard-coded table of expected round-trippable counts, one entry for each of
   * 128 shards. The entries sum to {@link #EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT}.
   */
  private static long[] generateFourByteShardsExpectedRunnables() {
    long[] expected = new long[128];
    // 0-63 are all 5300224
    for (int i = 0; i <= 63; i++) {
      expected[i] = 5300224;
    }
    // 97-111 are all 2342912
    for (int i = 97; i <= 111; i++) {
      expected[i] = 2342912;
    }
    // 113-117 are all 1048576
    for (int i = 113; i <= 117; i++) {
      expected[i] = 1048576;
    }
    // One offs
    expected[112] = 786432;
    expected[118] = 786432;
    expected[119] = 1048576;
    expected[120] = 458752;
    expected[121] = 524288;
    expected[122] = 65536;
    // Anything not assigned was the default 0.
    return expected;
  }

  /**
   * Helper to run the loop to test all the permutations for the number of bytes
   * specified.
   *
   * @param numBytes the number of bytes in the byte array
   * @param expectedCount the expected number of roundtrippable permutations
   */
  @GwtIncompatible // java.nio.charset.Charset
  private static void testBytes(int numBytes, long expectedCount) {
    testBytes(numBytes, expectedCount, 0, -1);
  }

  /**
   * Helper to run the loop to test all the permutations for the number of bytes
   * specified. This overload is useful for debugging to get the loop to start
   * at a certain character.
   *
   * <p>For every candidate, the result of {@code Utf8.isWellFormed} must agree with whether
   * decoding the bytes to a {@code String} as UTF-8 and re-encoding them gives back the same
   * bytes.
   *
   * @param numBytes the number of bytes in the byte array
   * @param expectedCount the expected number of roundtrippable permutations
   * @param start the starting bytes encoded as a long as big-endian
   * @param lim the limit of bytes to process encoded as a long as big-endian,
   *     or -1 to mean the max limit for numBytes
   */
  @GwtIncompatible // java.nio.charset.Charset
  private static void testBytes(int numBytes, long expectedCount, long start, long lim) {
    byte[] bytes = new byte[numBytes];
    if (lim == -1) {
      lim = 1L << (numBytes * 8);
    }
    long countRoundTripped = 0;
    for (long byteChar = start; byteChar < lim; byteChar++) {
      // Unpack the counter into the array, least significant byte last (big-endian).
      long tmpByteChar = byteChar;
      for (int i = 0; i < numBytes; i++) {
        bytes[bytes.length - i - 1] = (byte) tmpByteChar;
        tmpByteChar = tmpByteChar >> 8;
      }
      boolean isRoundTrippable = Utf8.isWellFormed(bytes);
      // The ranged overload must agree with the whole-array overload.
      assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
      String s = new String(bytes, Charsets.UTF_8);
      byte[] bytesReencoded = s.getBytes(Charsets.UTF_8);
      boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);

      if (bytesEqual != isRoundTrippable) {
        fail("Utf8.isWellFormed returned " + isRoundTrippable
            + " but the decode/encode round trip "
            + (bytesEqual ? "preserved" : "altered")
            + " the bytes " + Arrays.toString(bytes));
      }
      if (isRoundTrippable) {
        countRoundTripped++;
      }
    }
    assertEquals(expectedCount, countRoundTripped);
  }
}

Other Java examples (source code examples)

Here is a short list of links related to this Java Utf8Test.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2024 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.