解决中文字符乱码问题,如何将gbk格式转换utf-8格式

中文乱码的原因是:数据的编码格式是gbk,而显示时却按utf-8格式解码,所以需要先将其转换为utf-8格式,才能正常显示出来。

1.gbk生成utf8程序需要经过两次转化。

  1.gbk->unicode

  2.unicode->utf8

2.gbkuni30_gen.h中是gbk转换为unicode的查找数组:数组下标是gbk编码,数组元素是对应的unicode编码,文件中只保存unicode部分。该文件可通过另一个程序生成(此文件可在我的下载资源中找到)

源码如下:

#include <string.h>

#include "gbkuni30_gen.h"

int gbk_to_unicode(unsigned short int* unicode, const char* gbk, int len)

{

    int i,j;

扫描二维码关注公众号,回复: 3582041 查看本文章

    i = 0;

    unsigned char* gb_temp = (unsigned char *)gbk;

    for(j = 0; i < len; j++)

    {

        if (gb_temp[i] <= 0x80)

        {

            unicode[j] = gb_temp[i];

            i++;

        }

        else

        {

            unsigned short int temp;

            temp = (gb_temp[i] << 8) + gb_temp[i+1];

            unicode[j] = gbkuni30[temp];

            i += 2;

        }

    }

    return j;

}

/*
 * Encode a single code point as a UTF-8 byte sequence (original 1..6 byte
 * scheme, covering values up to 0x7FFFFFFF).
 *
 * unic:    the code point to encode.
 * pOutput: destination buffer; up to 6 bytes are written, no terminator
 *          is appended.
 *
 * Returns the number of bytes written (1..6), or 0 when unic is larger
 * than 0x7FFFFFFF, in which case nothing is written.
 *
 *   U-00000000 - U-0000007F:  0xxxxxxx
 *   U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
 *   U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
 *   U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 */
int enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput)
{
    int nbytes;               // total length of the encoded sequence
    unsigned char lead_tag;   // fixed high bits of the first byte
    unsigned char lead_mask;  // payload bits carried by the first byte
    int k;

    // Plain ASCII is stored verbatim.
    if (unic <= 0x0000007FUL)
    {
        *pOutput = (unsigned char)(unic & 0x7F);
        return 1;
    }

    // Pick the sequence length and lead-byte layout from the value range.
    if (unic <= 0x000007FFUL)
    {
        nbytes = 2; lead_tag = 0xC0; lead_mask = 0x1F;
    }
    else if (unic <= 0x0000FFFFUL)
    {
        nbytes = 3; lead_tag = 0xE0; lead_mask = 0x0F;
    }
    else if (unic <= 0x001FFFFFUL)
    {
        nbytes = 4; lead_tag = 0xF0; lead_mask = 0x07;
    }
    else if (unic <= 0x03FFFFFFUL)
    {
        nbytes = 5; lead_tag = 0xF8; lead_mask = 0x03;
    }
    else if (unic <= 0x7FFFFFFFUL)
    {
        nbytes = 6; lead_tag = 0xFC; lead_mask = 0x01;
    }
    else
    {
        return 0;
    }

    // Fill continuation bytes from last to first, six payload bits each.
    for (k = nbytes - 1; k > 0; k--)
    {
        pOutput[k] = (unsigned char)((unic & 0x3F) | 0x80);
        unic >>= 6;
    }

    // Whatever bits remain go into the lead byte.
    pOutput[0] = (unsigned char)((unic & lead_mask) | lead_tag);

    return nbytes;
}

/*
 * Convert a NUL-terminated GBK string to a NUL-terminated UTF-8 string.
 *
 * gbk:  input string in GBK encoding.
 * utf8: output buffer supplied by the caller. No size is passed in, so the
 *       caller must make it large enough; 3 * strlen(gbk) + 1 bytes always
 *       suffices when unsigned short is 16 bits (each input byte yields at
 *       most one 16-bit unit, each unit encodes to at most 3 bytes).
 *
 * Does nothing if either pointer is null. The intermediate buffer is a
 * variable-length array on the stack sized by the input length, so very
 * long inputs may exhaust the stack.
 */
void bgk_to_utf8( const char* gbk, unsigned char *utf8)
{
    int len;
    int unicode_len;
    int i;
    unsigned char *pstr;

    if (!gbk || !utf8)
    {
        return;
    }

    len = (int)strlen(gbk);
    if (len == 0)
    {
        /* A zero-length VLA is undefined behaviour; emit an empty string. */
        utf8[0] = '\0';
        return;
    }

    {
        unsigned short unicode[len];

        /* Step 1: GBK -> 16-bit Unicode units. */
        unicode_len = gbk_to_unicode(unicode, gbk, len);

        /* Step 2: encode each unit as UTF-8, advancing by the bytes written. */
        pstr = utf8;
        for (i = 0; i < unicode_len; i++)
        {
            pstr += enc_unicode_to_utf8_one(unicode[i], pstr);
        }
    }

    /*
     * Terminate directly after the last encoded byte. (Pre-incrementing
     * here would leave one uninitialised byte inside the string.)
     */
    *pstr = '\0';
}

猜你喜欢

转载自blog.csdn.net/u012681014/article/details/69374841