001/*-------------------------------------------------------------------------+ 002| | 003| Copyright 2005-2011 The ConQAT Project | 004| | 005| Licensed under the Apache License, Version 2.0 (the "License"); | 006| you may not use this file except in compliance with the License. | 007| You may obtain a copy of the License at | 008| | 009| http://www.apache.org/licenses/LICENSE-2.0 | 010| | 011| Unless required by applicable law or agreed to in writing, software | 012| distributed under the License is distributed on an "AS IS" BASIS, | 013| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 014| See the License for the specific language governing permissions and | 015| limitations under the License. | 016+-------------------------------------------------------------------------*/ 017package org.conqat.lib.commons.filesystem; 018 019import java.io.IOException; 020import java.io.InputStream; 021import java.nio.charset.Charset; 022import java.nio.charset.StandardCharsets; 023import java.util.Arrays; 024import java.util.Optional; 025 026import org.conqat.lib.commons.assertion.CCSMAssert; 027import org.conqat.lib.commons.io.ByteArrayUtils; 028 029/** 030 * Enumeration of the UTF byte-order marks (BOM). The actual values are taken 031 * from http://unicode.org/faq/utf_bom.html 032 * <p> 033 * The order of the values in this enum is chosen such that BOMs that are a 034 * prefix of other BOMs are at the end, i.e. UTF-32 is before UTF-16. This way 035 * we can check the BOM prefix in the order of the enum values' appearance. 036 * 037 * @author hummelb 038 */ 039public enum EByteOrderMark { 040 041 /** UTF-32 with big endian encoding. */ 042 UTF_32BE(Charset.forName("UTF-32BE"), new byte[] { 0x00, 0x00, (byte) 0xFE, (byte) 0xFF }), 043 044 /** UTF-32 with little endian encoding. */ 045 UTF_32LE(Charset.forName("UTF-32LE"), new byte[] { (byte) 0xFF, (byte) 0xFE, 0x00, 0x00 }), 046 047 /** UTF-16 with big endian encoding. */ 048 UTF_16BE(StandardCharsets.UTF_16BE, new byte[] { (byte) 0xFE, (byte) 0xFF }), 049 050 /** UTF-16 with little endian encoding. */ 051 UTF_16LE(StandardCharsets.UTF_16LE, new byte[] { (byte) 0xFF, (byte) 0xFE }), 052 053 /** 054 * UTF-8. Note that for UTF-8 the endianess is not relevant and that the BOM is 055 * optional. 056 */ 057 UTF_8_BOM(StandardCharsets.UTF_8, new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF }); 058 059 /** The maximal length of a BOM. */ 060 public static final int MAX_BOM_LENGTH = 4; 061 062 /** The encoding implied by this byte-order mark */ 063 private final Charset encoding; 064 065 /** The byte order mark. */ 066 private final byte[] bom; 067 068 /** Constructor. */ 069 private EByteOrderMark(Charset encoding, byte[] bom) { 070 this.encoding = encoding; 071 CCSMAssert.isTrue(bom.length <= MAX_BOM_LENGTH, "Inconsistent max BOM length!"); 072 this.bom = bom; 073 } 074 075 /** @see #encoding */ 076 public Charset getEncoding() { 077 return encoding; 078 } 079 080 /** 081 * Returns the byte order mark. This returns a copy, so the array may be 082 * modified. 083 */ 084 public byte[] getBOM() { 085 return Arrays.copyOf(bom, bom.length); 086 } 087 088 /** Returns the size of the BOM in bytes. */ 089 public int getBOMLength() { 090 return bom.length; 091 } 092 093 /** 094 * This method checks the start of the provided data array to find a BOM. If a 095 * BOM is found, the corresponding enum value is returned. If possible, the 096 * provided data should at least be of size {@value #MAX_BOM_LENGTH}. Otherwise 097 * the encoding might not be detected correctly. However, the method also works 098 * with shorter arrays (e.g. if a file consists of only 3 bytes). 099 */ 100 public static Optional<EByteOrderMark> determineBOM(byte[] data) { 101 for (EByteOrderMark bom : values()) { 102 if (ByteArrayUtils.isPrefix(bom.bom, data)) { 103 return Optional.of(bom); 104 } 105 } 106 return Optional.empty(); 107 } 108 109 /** 110 * This method determines which BOM, if any, the given stream starts with and 111 * skips it. The given {@link InputStream} must offer 112 * {@linkplain InputStream#markSupported() mark support}. 113 */ 114 public static Optional<EByteOrderMark> skipBOM(InputStream data) throws IOException { 115 if (!data.markSupported()) { 116 throw new IllegalArgumentException("InputStream must support mark"); 117 } 118 data.mark(MAX_BOM_LENGTH); 119 byte[] buffer = new byte[MAX_BOM_LENGTH]; 120 int readBytes = data.read(buffer); 121 for (EByteOrderMark bom : values()) { 122 if (readBytes < bom.getBOMLength()) { 123 continue; 124 } 125 if (ByteArrayUtils.isPrefix(bom.bom, buffer)) { 126 data.reset(); 127 data.skip(bom.getBOMLength()); 128 return Optional.of(bom); 129 } 130 } 131 data.reset(); 132 return Optional.empty(); 133 } 134}