C#如何自动识别文件的编码

前言
在C#中识别文件的编码是一个令人头疼的问题。最近在做导入微信商户后台退款数据的功能时，无论怎么设置编码，导出来的内容都是乱码；后来在网上找到了下面这段识别文件编码的代码，感觉不错。最后识别出来的编码是gb2312（当文件没有BOM、内容也不是合法的UTF-8时，代码会返回Encoding.Default，在简体中文Windows上通常就是gb2312），看来我还是太渣了，只能吃土了，竟然忘记了这个编码。
下面话不多说，上代码：
 
         /// <summary>
         /// Detects the character encoding of a text file or stream.
         /// Detection is based on the byte order mark (BOM) at the start of the data;
         /// <see cref="GetEncoding(Stream)"/> additionally recognises UTF-8 content
         /// that has no BOM by validating the bytes.
         /// </summary>
         public class TxtFileEncoder
         {
             /// <summary>
             /// Creates an instance. All functionality is exposed through static methods;
             /// the constructor is kept only so existing callers that instantiate the
             /// class continue to compile.
             /// </summary>
             public TxtFileEncoder()
             {
             }

             /// <summary>
             /// Detects the encoding of a text file from its BOM.
             /// </summary>
             /// <param name="fileName">Path of the file to inspect.</param>
             /// <returns>
             /// The encoding indicated by the BOM, or <see cref="Encoding.Default"/>
             /// when no recognised BOM is present.
             /// </returns>
             public static Encoding GetEncoding(string fileName)
             {
                 return GetEncoding(fileName, Encoding.Default);
             }

             /// <summary>
             /// Detects the encoding of a file stream from its BOM.
             /// </summary>
             /// <param name="stream">The file stream to inspect.</param>
             /// <returns>
             /// The encoding indicated by the BOM, or <see cref="Encoding.Default"/>
             /// when no recognised BOM is present.
             /// </returns>
             public static Encoding GetEncoding(FileStream stream)
             {
                 return GetEncoding(stream, Encoding.Default);
             }

             /// <summary>
             /// Detects the encoding of a text file from its BOM.
             /// </summary>
             /// <param name="fileName">Path of the file to inspect.</param>
             /// <param name="defaultEncoding">
             /// Encoding returned when no recognised BOM is found at the start of the file.
             /// </param>
             /// <returns>The detected encoding, or <paramref name="defaultEncoding"/>.</returns>
             public static Encoding GetEncoding(string fileName, Encoding defaultEncoding)
             {
                 // Read-only access is all that is needed, and the using block guarantees
                 // the handle is released even if detection throws.
                 using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read))
                 {
                     return GetEncoding(fs, defaultEncoding);
                 }
             }

             /// <summary>
             /// Detects the encoding of a file stream from its BOM. The stream's
             /// position is restored to its value on entry before the method returns.
             /// </summary>
             /// <param name="stream">
             /// The file stream to inspect. May be null, in which case
             /// <paramref name="defaultEncoding"/> is returned.
             /// </param>
             /// <param name="defaultEncoding">
             /// Encoding returned when the stream is null, shorter than two bytes,
             /// or does not start with a recognised BOM.
             /// </param>
             /// <returns>The detected encoding, or <paramref name="defaultEncoding"/>.</returns>
             public static Encoding GetEncoding(FileStream stream, Encoding defaultEncoding)
             {
                 if (stream == null || stream.Length < 2)
                 {
                     return defaultEncoding;
                 }

                 // Capture the caller's position BEFORE seeking; Seek(0, Begin) would
                 // always report 0 and the real position would be lost.
                 long originalPosition = stream.Position;
                 try
                 {
                     stream.Seek(0, SeekOrigin.Begin);

                     byte[] header = new byte[3];
                     int count = ReadUpTo(stream, header);

                     Encoding detected = DetectBom(header, count);
                     return detected != null ? detected : defaultEncoding;
                 }
                 finally
                 {
                     stream.Seek(originalPosition, SeekOrigin.Begin);
                 }
             }

             /// <summary>
             /// Detects the encoding of a stream, including UTF-8 content without a BOM.
             /// The stream is read from its current position and is closed before the
             /// method returns.
             /// </summary>
             /// <param name="fs">The stream to inspect.</param>
             /// <returns>
             /// The encoding indicated by the BOM if one is present; otherwise
             /// <see cref="Encoding.UTF8"/> when the whole content is well-formed UTF-8
             /// (pure ASCII and empty content included), otherwise
             /// <see cref="Encoding.Default"/>.
             /// </returns>
             /// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
             public static System.Text.Encoding GetEncoding(Stream fs)
             {
                 if (fs == null)
                 {
                     throw new ArgumentNullException("fs");
                 }

                 try
                 {
                     byte[] header = new byte[3];
                     int count = ReadUpTo(fs, header);

                     Encoding detected = DetectBom(header, count);
                     if (detected != null)
                     {
                         return detected;
                     }

                     // No BOM: validate the complete content. The header bytes already
                     // consumed must be part of the check, otherwise the scan could begin
                     // in the middle of a multi-byte character and wrongly reject the data.
                     using (MemoryStream content = new MemoryStream())
                     {
                         content.Write(header, 0, count);

                         byte[] chunk = new byte[4096];
                         int read;
                         while ((read = fs.Read(chunk, 0, chunk.Length)) > 0)
                         {
                             content.Write(chunk, 0, read);
                         }

                         return IsUTF8Bytes(content.ToArray()) ? Encoding.UTF8 : Encoding.Default;
                     }
                 }
                 finally
                 {
                     // Closing the stream is long-standing behaviour of this overload.
                     fs.Close();
                 }
             }

             /// <summary>
             /// Maps the leading bytes of a file to the encoding announced by its BOM.
             /// </summary>
             /// <param name="header">Buffer holding the first bytes of the data.</param>
             /// <param name="count">Number of valid bytes in <paramref name="header"/>.</param>
             /// <returns>The matching encoding, or null when no recognised BOM is present.</returns>
             private static Encoding DetectBom(byte[] header, int count)
             {
                 // UTF-8: EF BB BF
                 if (count >= 3 && header[0] == 0xEF && header[1] == 0xBB && header[2] == 0xBF)
                 {
                     return Encoding.UTF8;
                 }

                 // UTF-16 big endian: FE FF
                 if (count >= 2 && header[0] == 0xFE && header[1] == 0xFF)
                 {
                     return Encoding.BigEndianUnicode;
                 }

                 // UTF-16 little endian: FF FE
                 if (count >= 2 && header[0] == 0xFF && header[1] == 0xFE)
                 {
                     return Encoding.Unicode;
                 }

                 return null;
             }

             /// <summary>
             /// Fills <paramref name="buffer"/> from <paramref name="source"/>, stopping
             /// early only at end of stream (a single Read call may return fewer bytes
             /// than requested).
             /// </summary>
             /// <returns>The number of bytes actually placed in the buffer.</returns>
             private static int ReadUpTo(Stream source, byte[] buffer)
             {
                 int total = 0;
                 while (total < buffer.Length)
                 {
                     int read = source.Read(buffer, total, buffer.Length - total);
                     if (read <= 0)
                     {
                         break;
                     }
                     total += read;
                 }
                 return total;
             }

             /// <summary>
             /// Checks whether the data is structurally valid UTF-8 (used for content
             /// that carries no BOM).
             /// </summary>
             /// <param name="data">The bytes to validate.</param>
             /// <returns>
             /// True when every byte is either ASCII or part of a complete 2- to 4-byte
             /// sequence; false otherwise, including when the data ends in the middle
             /// of a sequence.
             /// </returns>
             private static bool IsUTF8Bytes(byte[] data)
             {
                 // Continuation bytes (10xxxxxx) still expected for the current character.
                 int pending = 0;

                 for (int i = 0; i < data.Length; i++)
                 {
                     byte current = data[i];

                     if (pending == 0)
                     {
                         if (current < 0x80)
                         {
                             continue; // single-byte (ASCII) character
                         }

                         if ((current & 0xE0) == 0xC0)
                         {
                             pending = 1; // 110xxxxx: two-byte sequence
                         }
                         else if ((current & 0xF0) == 0xE0)
                         {
                             pending = 2; // 1110xxxx: three-byte sequence
                         }
                         else if ((current & 0xF8) == 0xF0)
                         {
                             pending = 3; // 11110xxx: four-byte sequence
                         }
                         else
                         {
                             // A stray continuation byte, or a lead byte announcing more
                             // than four bytes, which UTF-8 no longer permits.
                             return false;
                         }
                     }
                     else
                     {
                         if ((current & 0xC0) != 0x80)
                         {
                             return false;
                         }
                         pending--;
                     }
                 }

                 // Data that stops mid-character is simply not valid UTF-8; report that
                 // instead of throwing from a detection routine.
                 return pending == 0;
             }
         }
总结
以上就是C#自动识别文件编码的全部内容了，希望本文的内容对大家的学习或者工作能带来一定的帮助，如果有疑问大家可以留言交流。
C#如何自动识别文件的编码

猜你喜欢