C# 文本编码识别

可以区分带 BOM 的 UTF-8、不带 BOM 的 UTF-8 以及其他编码(ANSI、ASCII、UTF-16 LE/BE)。
建议输入缓冲区至少包含 4 个字节。
  /// <summary>
  /// Heuristically classifies the contents of a byte buffer as one of the
  /// encodings listed in <see cref="Encoding"/>. A byte order mark is honoured
  /// first; otherwise the buffer is tested for well-formed UTF-8, then for
  /// UTF-16 (via newline code units and via the distribution of null bytes),
  /// and finally reported as ANSI or as unknown/binary.
  /// </summary>
  public class TextEncodingDetect
  {
      #region Fields

      // Byte order marks recognised by CheckBOM.
      private static readonly byte[] Utf16LeBom = { 0xFF, 0xFE };
      private static readonly byte[] Utf16BeBom = { 0xFE, 0xFF };
      private static readonly byte[] Utf8Bom = { 0xEF, 0xBB, 0xBF };

      // When true, a null byte makes the UTF-8 check fail and makes the final
      // fallback report None (binary) instead of ANSI.
      private bool nullSuggestsBinary = true;

      // Percentage thresholds used by CheckUTF16ASCII: the byte positions of one
      // parity must contain more than the "expected" percentage of nulls while
      // the positions of the other parity contain fewer than the "unexpected"
      // percentage.
      private double utf16ExpectedNullPercent = 70;
      private double utf16UnexpectedNullPercent = 10;

      #endregion

      #region Enums

      /// <summary>
      /// Result of a detection attempt.
      /// </summary>
      public enum Encoding
      {
          None,               // Unknown or binary
          ANSI,               // 0-255
          ASCII,              // 0-127
          UTF8_BOM,           // UTF8 with BOM
          UTF8_NOBOM,         // UTF8 without BOM
          UTF16_LE_BOM,       // UTF16 LE with BOM
          UTF16_LE_NOBOM,     // UTF16 LE without BOM
          UTF16_BE_BOM,       // UTF16-BE with BOM
          UTF16_BE_NOBOM      // UTF16-BE without BOM
      }

      #endregion

      #region Properties

      /// <summary>
      /// Gets or sets whether a null byte is treated as evidence of binary data.
      /// </summary>
      public bool NullSuggestsBinary
      {
          get
          {
              return this.nullSuggestsBinary;
          }

          set
          {
              this.nullSuggestsBinary = value;
          }
      }

      /// <summary>
      /// Gets or sets the percentage of nulls (exclusive range 0-100) that must be
      /// exceeded at one byte parity for the buffer to look like UTF-16.
      /// Values outside that range are ignored by the setter.
      /// </summary>
      public double Utf16ExpectedNullPercent
      {
          get
          {
              return this.utf16ExpectedNullPercent;
          }

          set
          {
              if (value > 0 && value < 100)
              {
                  this.utf16ExpectedNullPercent = value;
              }
          }
      }

      /// <summary>
      /// Gets or sets the percentage of nulls (exclusive range 0-100) that must not
      /// be reached at the opposite byte parity for the buffer to look like UTF-16.
      /// Values outside that range are ignored by the setter.
      /// </summary>
      public double Utf16UnexpectedNullPercent
      {
          get
          {
              return this.utf16UnexpectedNullPercent;
          }

          set
          {
              if (value > 0 && value < 100)
              {
                  this.utf16UnexpectedNullPercent = value;
              }
          }
      }

      #endregion

      /// <summary>
      /// Returns the length in bytes of the byte order mark implied by
      /// <paramref name="encoding"/>.
      /// </summary>
      /// <param name="encoding">A detection result.</param>
      /// <returns>2 for the UTF-16 BOM variants, 3 for UTF8_BOM, otherwise 0.</returns>
      public static int GetBOMLengthFromEncodingMode(Encoding encoding)
      {
          switch (encoding)
          {
              case Encoding.UTF16_BE_BOM:
              case Encoding.UTF16_LE_BOM:
                  return 2;

              case Encoding.UTF8_BOM:
                  return 3;

              default:
                  return 0;
          }
      }

      /// <summary>
      /// Detects the encoding of the first <paramref name="size"/> bytes of
      /// <paramref name="buffer"/>.
      /// </summary>
      /// <param name="buffer">The bytes to examine. Must not be null.</param>
      /// <param name="size">
      /// Number of bytes to examine, counted from the start of the buffer. Values
      /// below zero are treated as zero and values above the buffer length are
      /// limited to the buffer length.
      /// </param>
      /// <returns>
      /// The detected encoding. When no check matches, ANSI is returned unless the
      /// data contains a null byte and <see cref="NullSuggestsBinary"/> is true,
      /// in which case None is returned.
      /// </returns>
      /// <exception cref="System.ArgumentNullException">
      /// <paramref name="buffer"/> is null.
      /// </exception>
      public Encoding DetectEncoding(byte[] buffer, int size)
      {
          size = NormalizeSize(buffer, size);

          // A BOM is authoritative, so it is checked first.
          Encoding encoding = this.CheckBOM(buffer, size);
          if (encoding != Encoding.None)
          {
              return encoding;
          }

          // Well-formed UTF-8 (or plain ASCII).
          encoding = this.CheckUTF8(buffer, size);
          if (encoding != Encoding.None)
          {
              return encoding;
          }

          // UTF-16 recognised through CR/LF code units.
          encoding = this.CheckUTF16NewlineChars(buffer, size);
          if (encoding != Encoding.None)
          {
              return encoding;
          }

          // UTF-16 recognised through the distribution of null bytes.
          encoding = this.CheckUTF16ASCII(buffer, size);
          if (encoding != Encoding.None)
          {
              return encoding;
          }

          // ANSI or None (binary): a null only counts as binary when requested.
          if (this.nullSuggestsBinary && this.DoesContainNulls(buffer, size))
          {
              return Encoding.None;
          }

          return Encoding.ANSI;
      }

      /// <summary>
      /// Checks whether the buffer starts with a UTF-16 LE, UTF-16 BE or UTF-8
      /// byte order mark.
      /// </summary>
      /// <param name="buffer">The bytes to examine. Must not be null.</param>
      /// <param name="size">
      /// Number of bytes to examine; limited to the range 0..buffer length.
      /// </param>
      /// <returns>The matching *_BOM value, or None when no BOM is present.</returns>
      /// <exception cref="System.ArgumentNullException">
      /// <paramref name="buffer"/> is null.
      /// </exception>
      public Encoding CheckBOM(byte[] buffer, int size)
      {
          size = NormalizeSize(buffer, size);

          if (StartsWith(buffer, size, Utf16LeBom))
          {
              return Encoding.UTF16_LE_BOM;
          }

          if (StartsWith(buffer, size, Utf16BeBom))
          {
              return Encoding.UTF16_BE_BOM;
          }

          if (StartsWith(buffer, size, Utf8Bom))
          {
              return Encoding.UTF8_BOM;
          }

          return Encoding.None;
      }

      // Rejects a null buffer and limits size to the range 0..buffer.Length so the
      // scanning helpers can never index outside the array.
      private static int NormalizeSize(byte[] buffer, int size)
      {
          if (buffer == null)
          {
              throw new System.ArgumentNullException("buffer");
          }

          if (size < 0)
          {
              return 0;
          }

          return size > buffer.Length ? buffer.Length : size;
      }

      // Returns true when the first prefix.Length bytes of buffer equal prefix.
      private static bool StartsWith(byte[] buffer, int size, byte[] prefix)
      {
          if (size < prefix.Length)
          {
              return false;
          }

          for (int i = 0; i < prefix.Length; i++)
          {
              if (buffer[i] != prefix[i])
              {
                  return false;
              }
          }

          return true;
      }

      ///////////////////////////////////////////////////////////////////////////////
      // Checks if a buffer contains valid utf8. Returns:
      // None - not valid utf8 (or a null was seen while nulls suggest binary)
      // UTF8_NOBOM - valid utf8 encodings and multibyte sequences
      // ASCII - Only data in the 0-127 range.
      ///////////////////////////////////////////////////////////////////////////////

      private Encoding CheckUTF8(byte[] buffer, int size)
      {
          // UTF8 Valid sequences
          // 0xxxxxxx  ASCII
          // 110xxxxx 10xxxxxx  2-byte
          // 1110xxxx 10xxxxxx 10xxxxxx  3-byte
          // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  4-byte
          //
          // Width in UTF8
          // Decimal      Width
          // 0-127        1 byte
          // 194-223      2 bytes
          // 224-239      3 bytes
          // 240-244      4 bytes
          //
          // Subsequent chars are in the range 128-191
          bool onlySawAsciiRange = true;
          int pos = 0;

          while (pos < size)
          {
              byte ch = buffer[pos++];
              int moreChars;

              if (ch == 0 && this.nullSuggestsBinary)
              {
                  return Encoding.None;
              }
              else if (ch <= 127)
              {
                  moreChars = 0;                      // 1 byte
              }
              else if (ch >= 194 && ch <= 223)
              {
                  moreChars = 1;                      // 2 byte
              }
              else if (ch >= 224 && ch <= 239)
              {
                  moreChars = 2;                      // 3 byte
              }
              else if (ch >= 240 && ch <= 244)
              {
                  moreChars = 3;                      // 4 byte
              }
              else
              {
                  return Encoding.None;               // Not utf8
              }

              // A lead byte is already outside the ASCII range. Recording that here
              // (rather than when a continuation byte is read) keeps a lead byte in
              // the very last position from being reported as ASCII.
              if (moreChars > 0)
              {
                  onlySawAsciiRange = false;
              }

              // Check secondary chars are in range if we are expecting any. A
              // sequence cut short by the end of the buffer is tolerated.
              while (moreChars > 0 && pos < size)
              {
                  ch = buffer[pos++];
                  if (ch < 128 || ch > 191)
                  {
                      return Encoding.None;           // Not utf8
                  }

                  --moreChars;
              }
          }

          // Only valid UTF-8 sequences were seen. Pure 0-127 data is reported as
          // ASCII because it carries no evidence of UTF-8.
          return onlySawAsciiRange ? Encoding.ASCII : Encoding.UTF8_NOBOM;
      }

      ///////////////////////////////////////////////////////////////////////////////
      // Checks if a buffer contains text that looks like utf16 by scanning for
      // newline chars that would be present even in non-english text.
      // Returns:
      // None - not valid utf16
      // UTF16_LE_NOBOM - looks like utf16 le
      // UTF16_BE_NOBOM - looks like utf16 be
      ///////////////////////////////////////////////////////////////////////////////

      private Encoding CheckUTF16NewlineChars(byte[] buffer, int size)
      {
          if (size < 2)
          {
              return Encoding.None;
          }

          int leControlChars = 0;
          int beControlChars = 0;

          // Walk complete byte pairs only; a trailing odd byte is ignored.
          for (int pos = 0; pos + 1 < size; pos += 2)
          {
              byte ch1 = buffer[pos];
              byte ch2 = buffer[pos + 1];

              if (ch1 == 0)
              {
                  if (ch2 == 0x0a || ch2 == 0x0d)
                  {
                      ++beControlChars;
                  }
              }
              else if (ch2 == 0)
              {
                  if (ch1 == 0x0a || ch1 == 0x0d)
                  {
                      ++leControlChars;
                  }
              }

              // If we are getting both LE and BE control chars then this is not utf16
              if (leControlChars > 0 && beControlChars > 0)
              {
                  return Encoding.None;
              }
          }

          if (leControlChars > 0)
          {
              return Encoding.UTF16_LE_NOBOM;
          }

          if (beControlChars > 0)
          {
              return Encoding.UTF16_BE_NOBOM;
          }

          return Encoding.None;
      }

      ///////////////////////////////////////////////////////////////////////////////
      // Checks if a buffer contains text that looks like utf16. This is done based
      // on the use of nulls which in ASCII/script like text can be useful to identify.
      // Returns:
      // None - not valid utf16
      // UTF16_LE_NOBOM - looks like utf16 le
      // UTF16_BE_NOBOM - looks like utf16 be
      ///////////////////////////////////////////////////////////////////////////////

      private Encoding CheckUTF16ASCII(byte[] buffer, int size)
      {
          // Fewer than two bytes cannot hold a UTF-16 code unit; this also avoids a
          // division by zero below and stops a lone null byte from being reported
          // as UTF-16.
          if (size < 2)
          {
              return Encoding.None;
          }

          int numEvenNulls = 0;
          int numOddNulls = 0;

          for (int pos = 0; pos < size; pos++)
          {
              if (buffer[pos] != 0)
              {
                  continue;
              }

              if ((pos & 1) == 0)
              {
                  numEvenNulls++;
              }
              else
              {
                  numOddNulls++;
              }
          }

          // Each parity covers roughly half of the buffer, hence the factor of two.
          double evenNullRatio = (numEvenNulls * 2.0) / size;
          double oddNullRatio = (numOddNulls * 2.0) / size;
          double expectedNullRatio = this.utf16ExpectedNullPercent / 100.0;
          double unexpectedNullRatio = this.utf16UnexpectedNullPercent / 100.0;

          // Lots of odd nulls, low number of even nulls
          if (evenNullRatio < unexpectedNullRatio && oddNullRatio > expectedNullRatio)
          {
              return Encoding.UTF16_LE_NOBOM;
          }

          // Lots of even nulls, low number of odd nulls
          if (oddNullRatio < unexpectedNullRatio && evenNullRatio > expectedNullRatio)
          {
              return Encoding.UTF16_BE_NOBOM;
          }

          // Don't know
          return Encoding.None;
      }

      ///////////////////////////////////////////////////////////////////////////////
      // Checks if a buffer contains any nulls. Used to check for binary vs text data.
      ///////////////////////////////////////////////////////////////////////////////

      private bool DoesContainNulls(byte[] buffer, int size)
      {
          for (int pos = 0; pos < size; pos++)
          {
              if (buffer[pos] == 0)
              {
                  return true;
              }
          }

          return false;
      }
  }
猜你喜欢