tinyendian source code

1 //          Copyright Ferdinand Majerech 2014.
2 // Distributed under the Boost Software License, Version 1.0.
3 //    (See accompanying file LICENSE_1_0.txt or copy at
4 //          http://www.boost.org/LICENSE_1_0.txt)
5 
6 /// A minimal library providing functionality for changing the endianness of data.
7 module tinyendian;
8 
9 import std.system : Endian, endian;
10 
/// The Unicode transformation formats distinguished by this module.
enum UTFEncoding : ubyte
{
    /// UTF-8 (1-byte code units).
    UTF_8  = 0,
    /// UTF-16 (2-byte code units).
    UTF_16 = 1,
    /// UTF-32 (4-byte code units).
    UTF_32 = 2
}
///
@safe unittest
{
    // Swapping the byte order twice must give back the original values,
    // both for 4-byte integers...
    const originalInts = [314, -101];
    int[2] roundTrippedInts = originalInts;
    foreach (pass; 0 .. 2)
    {
        swapByteOrder(roundTrippedInts[]);
    }
    assert(originalInts == roundTrippedInts, "Lost information when swapping byte order");

    // ...and for 4-byte floating point values.
    const originalFloats = [3.14f, 10.1f];
    float[2] roundTrippedFloats = originalFloats;
    foreach (pass; 0 .. 2)
    {
        swapByteOrder(roundTrippedFloats[]);
    }
    assert(originalFloats == roundTrippedFloats, "Lost information when swapping byte order");
}
33 
/** Reverse the byte order of every item of an array, in place.
 *
 * Params:
 *
 * T     = Item type. Must be exactly 2 or 4 bytes in size.
 * array = Buffer whose items are byte-swapped; it is modified directly.
 */
void swapByteOrder(T)(T[] array) @trusted @nogc pure nothrow
if (T.sizeof == 2 || T.sizeof == 4)
{
    static if (T.sizeof == 4)
    {
        import core.bitop : bswap;

        foreach (ref element; array)
        {
            // Reinterpret the item as a raw 32-bit word, reverse that word,
            // then store the reversed bits back as a T.
            const reversed = bswap(*cast(uint*) &element);
            element = *cast(const(T)*) &reversed;
        }
    }
    else static if (T.sizeof == 2)
    {
        import std.algorithm.mutation : swap;

        foreach (ref element; array)
        {
            // A 2-byte item is reversed by exchanging its two bytes.
            ubyte* bytes = cast(ubyte*) &element;
            swap(bytes[0], bytes[1]);
        }
    }
    else static assert(false, "Unsupported T: " ~ T.stringof);
}
61 
/// Result of fixUTFByteOrder; see that function for details.
struct FixUTFByteOrderResult
{
    /// Slice of the input holding the data, without the BOM and without any stripped bytes.
    ubyte[] array;
    /// Encoding determined for the data.
    UTFEncoding encoding;
    /// Endianness determined from the BOM (Endian.init when no BOM was found).
    Endian endian;
    /// Number of trailing bytes stripped from a UTF-16/UTF-32 input; zero otherwise.
    uint bytesStripped;
}
70 
/** Bring a UTF-8/16/32 encoded byte array to the system's byte order, in place.
 *
 * The encoding is determined from the byte-order-mark (BOM) at the start of the
 * array. Without a BOM the data is taken to be UTF-8 (which is compatible with
 * ASCII). A BOM, if present, is excluded from the returned slice.
 *
 * For UTF-16 and UTF-32, trailing bytes that do not form a whole code unit (1 byte
 * for UTF-16, 1-3 bytes for UTF-32) are stripped from the returned slice.
 *
 * Note that this function does $(B not) validate the array as a UTF string. It only
 * looks at the BOM and treats the rest as 1, 2 or 4-byte items.
 *
 * Params:
 *
 * array = The array with UTF-data.
 *
 * Returns:
 *
 * A struct with the following members:
 *
 * $(D ubyte[] array)            A slice of the input array with the data in system
 *                               byte order, without the BOM and, for UTF-16/UTF-32,
 *                               without stripped bytes, if any.
 * $(D UTFEncoding encoding)     Encoding of the result (UTF-8, UTF-16 or UTF-32)
 * $(D std.system.Endian endian) Endianness of the original array, as indicated by
 *                               its BOM.
 * $(D uint bytesStripped)       Number of bytes stripped from a UTF-16/UTF-32 array, if
 *                               any. Non-zero only if array.length was not divisible
 *                               by 2 or 4 for UTF-16 and UTF-32, respectively.
 *
 * Complexity: $(BIGOH array.length)
 */
auto fixUTFByteOrder(ubyte[] array) @safe @nogc pure nothrow
{
    import std.algorithm.searching : startsWith;

    // Identifies a BOM. The numeric values double as indices into
    // byteOrderMarks and bomEndian below.
    enum BOM : ubyte
    {
        UTF_8     = 0,
        UTF_16_LE = 1,
        UTF_16_BE = 2,
        UTF_32_LE = 3,
        UTF_32_BE = 4,
        None      = ubyte.max
    }

    // Byte sequence of each BOM, indexed by BOM value (taken from std.stream).
    static immutable ubyte[][5] byteOrderMarks = [ [0xEF, 0xBB, 0xBF],
                                                   [0xFF, 0xFE],
                                                   [0xFE, 0xFF],
                                                   [0xFF, 0xFE, 0x00, 0x00],
                                                   [0x00, 0x00, 0xFE, 0xFF] ];
    // Endianness reported for each BOM, indexed by BOM value. The UTF-8 entry is
    // the system endianness.
    static immutable Endian[5] bomEndian = [ endian,
                                             Endian.littleEndian,
                                             Endian.bigEndian,
                                             Endian.littleEndian,
                                             Endian.bigEndian ];

    // Find which BOM, if any, the array starts with. BOM.None means no BOM.
    // The last match has to win: the UTF-32LE BOM begins with the UTF-16LE BOM, so
    // stopping at the first match would report UTF-16LE for UTF-32LE data.
    BOM detected = BOM.None;
    foreach (index, mark; byteOrderMarks)
    {
        if (array.startsWith(mark))
        {
            detected = cast(BOM) index;
        }
    }

    FixUTFByteOrderResult result;
    if (detected == BOM.None)
    {
        result.endian = Endian.init;
    }
    else
    {
        result.endian = bomEndian[detected];
    }

    // Length of the BOM in bytes, i.e. the offset at which the UTF data starts.
    size_t bomLength = 0;
    final switch (detected)
    {
        case BOM.None:
            result.encoding = UTFEncoding.UTF_8;
            break;
        case BOM.UTF_8:
            result.encoding = UTFEncoding.UTF_8;
            bomLength = 3;
            break;
        case BOM.UTF_16_LE, BOM.UTF_16_BE:
            result.encoding = UTFEncoding.UTF_16;
            result.bytesStripped = array.length % 2;
            bomLength = 2;
            break;
        case BOM.UTF_32_LE, BOM.UTF_32_BE:
            result.encoding = UTFEncoding.UTF_32;
            result.bytesStripped = array.length % 4;
            bomLength = 4;
            break;
    }

    // With a BOM present, narrow the slice so that it skips the BOM and leaves out
    // any trailing bytes that don't form a complete code unit.
    if (bomLength > 0)
    {
        array = array[bomLength .. $ - result.bytesStripped];
    }

    // The slice length is now a multiple of the code unit size for UTF-16/32, so
    // it can be reinterpreted as an array of code units and swapped if needed.
    if (result.endian != endian)
    {
        final switch (result.encoding)
        {
            case UTFEncoding.UTF_8:
                // Single-byte code units; nothing to swap.
                break;
            case UTFEncoding.UTF_16:
                swapByteOrder(cast(wchar[]) array);
                break;
            case UTFEncoding.UTF_32:
                swapByteOrder(cast(dchar[]) array);
                break;
        }
    }

    result.array = array;
    return result;
}
///
@safe unittest
{
    {
        // UTF-8 BOM: the BOM is removed and the payload is left untouched.
        ubyte[] s = [0xEF, 0xBB, 0xBF, 'a'];
        FixUTFByteOrderResult r = fixUTFByteOrder(s);
        assert(r.encoding == UTFEncoding.UTF_8);
        assert(r.array.length == 1);
        assert(r.array == ['a']);
        // For a UTF-8 BOM the system endianness is reported, so compare against
        // `endian` rather than a fixed value; a hard-coded Endian.littleEndian
        // would fail on big-endian hosts.
        assert(r.endian == endian);
    }

    {
        // No BOM: UTF-8 is assumed and Endian.init (bigEndian) is reported.
        ubyte[] s = ['a'];
        FixUTFByteOrderResult r = fixUTFByteOrder(s);
        assert(r.encoding == UTFEncoding.UTF_8);
        assert(r.array.length == 1);
        assert(r.array == ['a']);
        assert(r.endian == Endian.bigEndian);
    }

    {
        // strip 'a' b/c not complete unit
        ubyte[] s = [0xFE, 0xFF, 'a'];
        FixUTFByteOrderResult r = fixUTFByteOrder(s);
        assert(r.encoding == UTFEncoding.UTF_16);
        assert(r.array.length == 0);
        assert(r.endian == Endian.bigEndian);
    }
}