001/*-------------------------------------------------------------------------+
002|                                                                          |
003| Copyright 2005-2011 the ConQAT Project                                   |
004|                                                                          |
005| Licensed under the Apache License, Version 2.0 (the "License");          |
006| you may not use this file except in compliance with the License.         |
007| You may obtain a copy of the License at                                  |
008|                                                                          |
009|    http://www.apache.org/licenses/LICENSE-2.0                            |
010|                                                                          |
011| Unless required by applicable law or agreed to in writing, software      |
012| distributed under the License is distributed on an "AS IS" BASIS,        |
013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
014| See the License for the specific language governing permissions and      |
015| limitations under the License.                                           |
016+-------------------------------------------------------------------------*/
017package org.conqat.lib.commons.serialization.objects;
018
019import java.io.DataInput;
020import java.io.DataInputStream;
021import java.io.DataOutputStream;
022import java.io.IOException;
023import java.io.UTFDataFormatException;
024
/**
 * Utility methods for reading and writing "long" strings in
 * <a href=
 * "http://docs.oracle.com/javase/6/docs/platform/serialization/spec/protocol.html"
 * >modified UTF8</a> encoding. The code was copied and adjusted from
 * {@link DataInputStream} and {@link DataOutputStream}. The reason is that
 * while modified UTF8 handling is implemented in these classes, they prefix
 * the data with a 16 bit length and hence can only handle short strings (an
 * encoded form of less than 2^16 bytes). As the code is badly modularized, we
 * had to copy and modify it. The variants in this class use a 64 bit length
 * prefix instead.
 */
public class LongStringUtils {

        /** Max size for char that can be encoded as single byte. */
        private static final int SINGLE_BYTE_LIMIT = 0x007F;

        /** Max size for char that can be encoded as two bytes. */
        private static final int DOUBLE_BYTE_LIMIT = 0x07FF;

        /** Maximal length of a short string. */
        public static final int MAX_SHORT_STRING_LENGTH = (1 << 16) - 1;

        /**
         * This is a copy of {@link DataInputStream#readUTF(DataInput)} with the main
         * difference that the length is not read as a short but as a long.
         *
         * @param in
         *            the stream to read the length prefix and the encoded bytes
         *            from.
         * @return the decoded string.
         * @throws UTFDataFormatException
         *             if the length prefix is negative or exceeds the int range, or
         *             if the bytes are not valid modified UTF8.
         * @throws IOException
         *             if reading from the stream fails, e.g. because it ends before
         *             the announced number of bytes was read.
         */
        public static String readLongString(DataInputStream in) throws IOException {

                // The specification uses a long here, but a Java array can not hold more
                // than int range elements. Validate instead of blindly casting, as a
                // corrupt prefix would otherwise be truncated silently or cause a
                // NegativeArraySizeException.
                long declaredLength = in.readLong();
                if (declaredLength < 0 || declaredLength > Integer.MAX_VALUE) {
                        throw new UTFDataFormatException("invalid encoded string length: " + declaredLength);
                }
                int utfLength = (int) declaredLength;

                byte[] bytes = new byte[utfLength];
                in.readFully(bytes);

                int count = 0;
                int charCount = 0;
                // every char needs at least one byte, so this is always large enough
                char[] chars = new char[utfLength];
                while (count < utfLength) {
                        int c = bytes[count] & 0xff;
                        if (c <= SINGLE_BYTE_LIMIT) {
                                count++;
                                chars[charCount++] = (char) c;
                                continue;
                        }

                        // the upper four bits of the lead byte determine the sequence length
                        switch (c >> 4) {
                        case 12:
                        case 13:
                                /* 110x xxxx 10xx xxxx */
                                count = incrementChecked(count, utfLength, 2);
                                chars[charCount++] = extractTwoByteChar(bytes, count - 2);
                                break;
                        case 14:
                                /* 1110 xxxx 10xx xxxx 10xx xxxx */
                                count = incrementChecked(count, utfLength, 3);
                                chars[charCount++] = extractThreeByteChar(bytes, count - 3);
                                break;
                        default:
                                /* 10xx xxxx, 1111 xxxx */
                                throw new UTFDataFormatException("malformed input around byte " + count);
                        }
                }

                return new String(chars, 0, charCount);
        }

        /**
         * Extracts a three byte character at given index position. The caller must
         * ensure that all three bytes are within the array.
         */
        private static char extractThreeByteChar(byte[] bytes, int index) throws UTFDataFormatException {
                int char2 = bytes[index + 1];
                int char3 = bytes[index + 2];
                // both continuation bytes must have the form 10xx xxxx
                throwMalformedIf(((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80), index + 1);
                return (char) (((bytes[index] & 0x0F) << 12) | ((char2 & 0x3F) << 6) | ((char3 & 0x3F) << 0));
        }

        /**
         * Extracts a two byte character at given index position. The caller must
         * ensure that both bytes are within the array.
         */
        private static char extractTwoByteChar(byte[] bytes, int index) throws UTFDataFormatException {
                int char2 = bytes[index + 1];
                // the continuation byte must have the form 10xx xxxx
                throwMalformedIf((char2 & 0xC0) != 0x80, index + 2);
                return (char) (((bytes[index] & 0x1F) << 6) | (char2 & 0x3F));
        }

        /**
         * Throws a {@link UTFDataFormatException} mentioning the given byte position
         * if the condition holds.
         */
        private static void throwMalformedIf(boolean condition, int count) throws UTFDataFormatException {
                if (condition) {
                        throw new UTFDataFormatException("malformed input around byte " + count);
                }
        }

        /**
         * Increments the count and checks whether count is still within the given
         * length.
         *
         * @return the incremented count.
         * @throws UTFDataFormatException
         *             if the incremented count exceeds the given length, i.e. a
         *             multi-byte sequence is cut off at the end of the data.
         */
        private static int incrementChecked(int count, int utfLength, int increment) throws UTFDataFormatException {
                count += increment;
                if (count > utfLength) {
                        throw new UTFDataFormatException("malformed input: partial character at end");
                }
                return count;
        }

        /**
         * This is a copy of {@link DataOutputStream#writeUTF(String)} with the main
         * difference that the length is not written as a short but as a long.
         *
         * @param string
         *            the string to encode.
         * @param out
         *            the stream receiving the length prefix followed by the encoded
         *            bytes.
         * @throws UTFDataFormatException
         *             if the encoded form would not fit into a byte array.
         * @throws IOException
         *             if writing to the stream fails.
         */
        public static void writeUTF(String string, DataOutputStream out) throws IOException {
                // Sum up as long, as a char may need up to three bytes and hence the sum
                // can exceed the int range for very long strings.
                long encodedLength = string.chars().map(LongStringUtils::utfSize).asLongStream().sum();
                if (encodedLength > Integer.MAX_VALUE) {
                        throw new UTFDataFormatException("encoded string too long: " + encodedLength + " bytes");
                }
                int utfLength = (int) encodedLength;
                out.writeLong(utfLength);

                byte[] bytes = new byte[utfLength];
                int index = 0;
                for (int i = 0; i < string.length(); i++) {
                        int c = string.charAt(i);
                        if ((c >= 1) && (c <= SINGLE_BYTE_LIMIT)) {
                                bytes[index++] = (byte) c;
                        } else {
                                // Note that the zero char ends up in the two byte branch.
                                if (c > DOUBLE_BYTE_LIMIT) {
                                        bytes[index++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
                                        bytes[index++] = (byte) (0x80 | ((c >> 6) & 0x3F));
                                } else {
                                        bytes[index++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
                                }
                                // the trailing byte is the same for two and three byte sequences
                                bytes[index++] = (byte) (0x80 | (c & 0x3F));
                        }
                }
                out.write(bytes);
        }

        /** Returns UTF encoded size of a character (1, 2, or 3 bytes). */
        private static int utfSize(int character) {
                if (character >= 1 && character <= SINGLE_BYTE_LIMIT) {
                        return 1;
                }
                if (character > DOUBLE_BYTE_LIMIT) {
                        return 3;
                }
                return 2;
        }
}