/*-------------------------------------------------------------------------+
|                                                                          |
| Copyright 2005-2011 the ConQAT Project                                   |
|                                                                          |
| Licensed under the Apache License, Version 2.0 (the "License");          |
| you may not use this file except in compliance with the License.         |
| You may obtain a copy of the License at                                  |
|                                                                          |
|    http://www.apache.org/licenses/LICENSE-2.0                            |
|                                                                          |
| Unless required by applicable law or agreed to in writing, software      |
| distributed under the License is distributed on an "AS IS" BASIS,        |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and      |
| limitations under the License.                                           |
+-------------------------------------------------------------------------*/
package org.conqat.lib.commons.serialization.objects;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UTFDataFormatException;

/**
 * This class contains code that was copied and adjusted from
 * {@link DataInputStream} and {@link DataOutputStream}. The reason is that
 * while <a href=
 * "http://docs.oracle.com/javase/6/docs/platform/serialization/spec/protocol.html"
 * >modified UTF8</a> handling is implemented in these classes, they can only
 * handle short strings (at most {@link #MAX_SHORT_STRING_LENGTH} encoded
 * bytes). As the code is badly modularized, we had to copy and modify it.
 * <p>
 * The format handled here is an 8 byte big-endian length prefix (the number of
 * encoded bytes) followed by the modified UTF-8 payload.
 */
public class LongStringUtils {

    /** Max size for char that can be encoded as single byte. */
    private static final int SINGLE_BYTE_LIMIT = 0x007F;

    /** Max size for char that can be encoded as two bytes. */
    private static final int DOUBLE_BYTE_LIMIT = 0x07FF;

    /** Maximal length of a short string. */
    public static final int MAX_SHORT_STRING_LENGTH = (1 << 16) - 1;

    /**
     * This is a copy of {@link DataInputStream#readUTF(DataInput)} with the main
     * difference that the length is not read as a short but as a long.
     *
     * @param in
     *            the stream positioned at the 8 byte length prefix.
     * @return the decoded string.
     * @throws UTFDataFormatException
     *             if the length prefix is negative or exceeds the int range, or
     *             if the payload is not valid modified UTF-8.
     * @throws IOException
     *             if reading from the stream fails (including a premature end of
     *             stream).
     */
    public static String readLongString(DataInputStream in) throws IOException {

        // The specification stores the length as a long, but a Java array can not
        // hold more than int range elements. Reject anything else explicitly
        // instead of silently truncating the value with a cast.
        long encodedLength = in.readLong();
        if (encodedLength < 0 || encodedLength > Integer.MAX_VALUE) {
            throw new UTFDataFormatException("invalid encoded string length: " + encodedLength);
        }

        int utfLength = (int) encodedLength;
        byte[] bytes = new byte[utfLength];
        in.readFully(bytes);

        int count = 0;
        int charCount = 0;
        // every char needs at least one byte, so utfLength is an upper bound
        char[] chars = new char[utfLength];
        while (count < utfLength) {
            int c = bytes[count] & 0xff;
            if (c <= SINGLE_BYTE_LIMIT) {
                count++;
                chars[charCount++] = (char) c;
                continue;
            }

            // the high nibble of the lead byte determines the sequence length
            switch (c >> 4) {
            case 12:
            case 13:
                /* 110x xxxx 10xx xxxx */
                count = incrementChecked(count, utfLength, 2);
                chars[charCount++] = extractTwoByteChar(bytes, count - 2);
                break;
            case 14:
                /* 1110 xxxx 10xx xxxx 10xx xxxx */
                count = incrementChecked(count, utfLength, 3);
                chars[charCount++] = extractThreeByteChar(bytes, count - 3);
                break;
            default:
                /* 10xx xxxx, 1111 xxxx */
                throw new UTFDataFormatException("malformed input around byte " + count);
            }
        }

        return new String(chars, 0, charCount);
    }

    /**
     * Extracts a three byte character at given index position.
     *
     * @throws UTFDataFormatException
     *             if one of the two continuation bytes is not of the form
     *             {@code 10xx xxxx}.
     */
    private static char extractThreeByteChar(byte[] bytes, int index) throws UTFDataFormatException {
        int char2 = bytes[index + 1];
        int char3 = bytes[index + 2];
        // reports the position of the last byte of the sequence, which is what
        // DataInputStream.readUTF reports for three byte sequences
        throwMalformedIf(((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80), index + 2);
        return (char) (((bytes[index] & 0x0F) << 12) | ((char2 & 0x3F) << 6) | (char3 & 0x3F));
    }

    /**
     * Extracts a two byte character at given index position.
     *
     * @throws UTFDataFormatException
     *             if the continuation byte is not of the form {@code 10xx xxxx}.
     */
    private static char extractTwoByteChar(byte[] bytes, int index) throws UTFDataFormatException {
        int char2 = bytes[index + 1];
        throwMalformedIf((char2 & 0xC0) != 0x80, index + 2);
        return (char) (((bytes[index] & 0x1F) << 6) | (char2 & 0x3F));
    }

    /**
     * Throws a {@link UTFDataFormatException} mentioning the given byte position
     * if the condition holds.
     */
    private static void throwMalformedIf(boolean condition, int count) throws UTFDataFormatException {
        if (condition) {
            throw new UTFDataFormatException("malformed input around byte " + count);
        }
    }

    /**
     * Increments the count and checks whether count is still within the given
     * length.
     *
     * @return the incremented count.
     * @throws UTFDataFormatException
     *             if the incremented count would exceed the given length.
     */
    private static int incrementChecked(int count, int utfLength, int increment) throws UTFDataFormatException {
        // compare against the remaining bytes, so the check itself can not overflow
        if (increment > utfLength - count) {
            throw new UTFDataFormatException("malformed input: partial character at end");
        }
        return count + increment;
    }

    /**
     * This is a copy of {@link DataOutputStream#writeUTF(String)} with the main
     * difference that the length is not written as a short but as a long.
     *
     * @param string
     *            the string to encode.
     * @param out
     *            the stream receiving the 8 byte length prefix followed by the
     *            modified UTF-8 bytes.
     * @throws UTFDataFormatException
     *             if the encoded form would need more bytes than fit into a Java
     *             array.
     * @throws IOException
     *             if writing to the stream fails.
     */
    public static void writeUTF(String string, DataOutputStream out) throws IOException {
        // sum up as long, as three bytes per char can exceed the int range
        long encodedLength = string.chars().mapToLong(LongStringUtils::utfSize).sum();
        if (encodedLength > Integer.MAX_VALUE) {
            throw new UTFDataFormatException("encoded string too long: " + encodedLength + " bytes");
        }

        int utfLength = (int) encodedLength;
        out.writeLong(utfLength);

        byte[] bytes = new byte[utfLength];
        int index = 0;
        for (int i = 0; i < string.length(); i++) {
            int c = string.charAt(i);
            if ((c >= 1) && (c <= SINGLE_BYTE_LIMIT)) {
                bytes[index++] = (byte) c;
            } else {
                // the lead byte(s) differ between the two and three byte form, the
                // trailing continuation byte is built the same way for both
                if (c > DOUBLE_BYTE_LIMIT) {
                    bytes[index++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
                    bytes[index++] = (byte) (0x80 | ((c >> 6) & 0x3F));
                } else {
                    bytes[index++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
                }
                bytes[index++] = (byte) (0x80 | (c & 0x3F));
            }
        }
        out.write(bytes);
    }

    /**
     * Returns UTF encoded size of a character. The NUL character is encoded with
     * two bytes.
     */
    private static int utfSize(int character) {
        if (character >= 1 && character <= SINGLE_BYTE_LIMIT) {
            return 1;
        }
        if (character > DOUBLE_BYTE_LIMIT) {
            return 3;
        }
        return 2;
    }
}