001/*-------------------------------------------------------------------------+
002|                                                                          |
003| Copyright 2005-2011 The ConQAT Project                                   |
004|                                                                          |
005| Licensed under the Apache License, Version 2.0 (the "License");          |
006| you may not use this file except in compliance with the License.         |
007| You may obtain a copy of the License at                                  |
008|                                                                          |
009|    http://www.apache.org/licenses/LICENSE-2.0                            |
010|                                                                          |
011| Unless required by applicable law or agreed to in writing, software      |
012| distributed under the License is distributed on an "AS IS" BASIS,        |
013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
014| See the License for the specific language governing permissions and      |
015| limitations under the License.                                           |
016+-------------------------------------------------------------------------*/
017package org.conqat.lib.commons.filesystem;
018
019import java.io.IOException;
020import java.io.InputStream;
021import java.nio.charset.Charset;
022import java.nio.charset.StandardCharsets;
023import java.util.Arrays;
024import java.util.Optional;
025
026import org.conqat.lib.commons.assertion.CCSMAssert;
027import org.conqat.lib.commons.io.ByteArrayUtils;
028
029/**
030 * Enumeration of the UTF byte-order marks (BOM). The actual values are taken
031 * from http://unicode.org/faq/utf_bom.html
032 * <p>
033 * The order of the values in this enum is chosen such that BOMs that are a
034 * prefix of other BOMs are at the end, i.e. UTF-32 is before UTF-16. This way
035 * we can check the BOM prefix in the order of the enum values' appearance.
036 * 
037 * @author hummelb
038 */
039public enum EByteOrderMark {
040
041        /** UTF-32 with big endian encoding. */
042        UTF_32BE(Charset.forName("UTF-32BE"), new byte[] { 0x00, 0x00, (byte) 0xFE, (byte) 0xFF }),
043
044        /** UTF-32 with little endian encoding. */
045        UTF_32LE(Charset.forName("UTF-32LE"), new byte[] { (byte) 0xFF, (byte) 0xFE, 0x00, 0x00 }),
046
047        /** UTF-16 with big endian encoding. */
048        UTF_16BE(StandardCharsets.UTF_16BE, new byte[] { (byte) 0xFE, (byte) 0xFF }),
049
050        /** UTF-16 with little endian encoding. */
051        UTF_16LE(StandardCharsets.UTF_16LE, new byte[] { (byte) 0xFF, (byte) 0xFE }),
052
053        /**
054         * UTF-8. Note that for UTF-8 the endianess is not relevant and that the BOM is
055         * optional.
056         */
057        UTF_8_BOM(StandardCharsets.UTF_8, new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
058
059        /** The maximal length of a BOM. */
060        public static final int MAX_BOM_LENGTH = 4;
061
062        /** The encoding implied by this byte-order mark */
063        private final Charset encoding;
064
065        /** The byte order mark. */
066        private final byte[] bom;
067
068        /** Constructor. */
069        private EByteOrderMark(Charset encoding, byte[] bom) {
070                this.encoding = encoding;
071                CCSMAssert.isTrue(bom.length <= MAX_BOM_LENGTH, "Inconsistent max BOM length!");
072                this.bom = bom;
073        }
074
075        /** @see #encoding */
076        public Charset getEncoding() {
077                return encoding;
078        }
079
080        /**
081         * Returns the byte order mark. This returns a copy, so the array may be
082         * modified.
083         */
084        public byte[] getBOM() {
085                return Arrays.copyOf(bom, bom.length);
086        }
087
088        /** Returns the size of the BOM in bytes. */
089        public int getBOMLength() {
090                return bom.length;
091        }
092
093        /**
094         * This method checks the start of the provided data array to find a BOM. If a
095         * BOM is found, the corresponding enum value is returned. If possible, the
096         * provided data should at least be of size {@value #MAX_BOM_LENGTH}. Otherwise
097         * the encoding might not be detected correctly. However, the method also works
098         * with shorter arrays (e.g. if a file consists of only 3 bytes).
099         */
100        public static Optional<EByteOrderMark> determineBOM(byte[] data) {
101                for (EByteOrderMark bom : values()) {
102                        if (ByteArrayUtils.isPrefix(bom.bom, data)) {
103                                return Optional.of(bom);
104                        }
105                }
106                return Optional.empty();
107        }
108
109        /**
110         * This method determines which BOM, if any, the given stream starts with and
111         * skips it. The given {@link InputStream} must offer
112         * {@linkplain InputStream#markSupported() mark support}.
113         */
114        public static Optional<EByteOrderMark> skipBOM(InputStream data) throws IOException {
115                if (!data.markSupported()) {
116                        throw new IllegalArgumentException("InputStream must support mark");
117                }
118                data.mark(MAX_BOM_LENGTH);
119                byte[] buffer = new byte[MAX_BOM_LENGTH];
120                int readBytes = data.read(buffer);
121                for (EByteOrderMark bom : values()) {
122                        if (readBytes < bom.getBOMLength()) {
123                                continue;
124                        }
125                        if (ByteArrayUtils.isPrefix(bom.bom, buffer)) {
126                                data.reset();
127                                data.skip(bom.getBOMLength());
128                                return Optional.of(bom);
129                        }
130                }
131                data.reset();
132                return Optional.empty();
133        }
134}