字符串UTF-8和GBK之间的转换以及判定

一、判定字符串是否是UTF-8的编码

// Checks whether a null-terminated byte string is structurally well-formed
// UTF-8, using the original 1-6 byte scheme (lead byte announces the sequence
// length, every following byte must look like 10xxxxxx).
//
// str: null-terminated byte string; a null pointer is reported as not UTF-8.
// Returns true when every byte belongs to a complete sequence. Pure ASCII
// (including the empty string) is valid UTF-8 and returns true.
//
// Only the lead/continuation byte structure is checked: overlong encodings
// and surrogate code points are not rejected.
bool is_str_utf8(const char* str)
{
    if (!str)
    {
        return false;
    }

    // Number of continuation bytes still owed by the current sequence.
    unsigned int pending = 0;

    for (const unsigned char* p = reinterpret_cast<const unsigned char*>(str); *p != '\0'; ++p)
    {
        const unsigned char chr = *p;

        if (pending != 0)
        {
            // Inside a multi-byte sequence: byte must be 10xxxxxx.
            if ((chr & 0xC0) != 0x80)
            {
                return false;
            }
            --pending;
            continue;
        }

        if (chr < 0x80)
        {
            continue; // plain ASCII, 0xxxxxxx
        }

        if (chr >= 0xFE)
        {
            // 0xFE and 0xFF are never valid in UTF-8; previously they were
            // mistaken for 5-byte lead bytes.
            return false;
        }
        else if (chr >= 0xFC)
        {
            pending = 5; // 1111110x
        }
        else if (chr >= 0xF8)
        {
            pending = 4; // 111110xx
        }
        else if (chr >= 0xF0)
        {
            pending = 3; // 11110xxx
        }
        else if (chr >= 0xE0)
        {
            pending = 2; // 1110xxxx
        }
        else if (chr >= 0xC0)
        {
            pending = 1; // 110xxxxx
        }
        else
        {
            // 10xxxxxx where a lead byte was expected.
            return false;
        }
    }

    // A sequence cut off by the terminator violates the encoding rules.
    return pending == 0;
}

二、判定字符串是否是GBK的编码

// Checks whether a null-terminated byte string is structurally valid GBK.
//
// GBK uses one byte for ASCII (0x00-0x7F) and two bytes for everything else:
//   lead byte  : 0x81-0xFE
//   trail byte : 0x40-0x7E or 0x80-0xFE   (0x7F is excluded)
//
// str: null-terminated byte string; a null pointer is reported as not GBK.
// Returns true when every byte is ASCII or part of a complete lead/trail
// pair. Pure ASCII (including the empty string) returns true.
//
// This is a byte-range check only; it does not verify that a pair is an
// assigned GBK code point, and many UTF-8 strings also pass it.
bool is_str_gbk(const char* str)
{
    if (!str)
    {
        return false;
    }

    // True while the previous byte was a lead byte awaiting its trail byte.
    bool expectTrail = false;

    for (const unsigned char* p = reinterpret_cast<const unsigned char*>(str); *p != '\0'; ++p)
    {
        const unsigned char chr = *p;

        if (expectTrail)
        {
            // 0x7F is the one hole inside the 0x40-0xFE trail range.
            if (chr < 0x40 || chr == 0x7F || chr > 0xFE)
            {
                return false;
            }
            expectTrail = false;
            continue;
        }

        if (chr < 0x80)
        {
            continue; // single-byte ASCII
        }

        // 0x80 and 0xFF cannot start a GBK double-byte character.
        if (chr < 0x81 || chr > 0xFE)
        {
            return false;
        }
        expectTrail = true;
    }

    // A lead byte with no trail byte before the terminator is malformed.
    return !expectTrail;
}

三、字符串由GBK编码转换成UTF-8编码

// Converts strGBK in place from the system ANSI code page to UTF-8.
//
// The source code page is CP_ACP, i.e. whatever ANSI code page the system is
// configured with; the input is decoded as GBK only when that is code page 936.
// The LPCTSTR value is handed to MultiByteToWideChar as narrow text, so this
// is meant for multi-byte (non-UNICODE) builds.
//
// strGBK: in/out. On success it holds the UTF-8 bytes; if either conversion
//         step fails it is left unchanged.
void ConvertGBKToUtf8(CString &strGBK)
{
    // Step 1: ANSI -> UTF-16. With cbMultiByte == -1 the returned count
    // includes the terminating null.
    const int wideLen = MultiByteToWideChar(CP_ACP, 0, static_cast<LPCTSTR>(strGBK), -1, NULL, 0);
    if (wideLen <= 0)
    {
        return;
    }
    std::wstring wide(static_cast<std::wstring::size_type>(wideLen), L'\0');
    if (MultiByteToWideChar(CP_ACP, 0, static_cast<LPCTSTR>(strGBK), -1, &wide[0], wideLen) <= 0)
    {
        return;
    }

    // Step 2: UTF-16 -> UTF-8, again sized by a first query call.
    const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
    if (utf8Len <= 0)
    {
        return;
    }
    std::string utf8(static_cast<std::string::size_type>(utf8Len), '\0');
    if (WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), -1, &utf8[0], utf8Len, NULL, NULL) <= 0)
    {
        return;
    }

    // The buffers are owned by std::wstring/std::string, so nothing leaks
    // even if this assignment throws.
    strGBK = utf8.c_str();
}


string GBKToUTF8(const char* strGBK)  
{  
    int len = MultiByteToWideChar(CP_ACP, 0, strGBK, -1, NULL, 0);  
    wchar_t* wstr = new wchar_t[len+1];  
    memset(wstr, 0, len+1);  
    MultiByteToWideChar(CP_ACP, 0, strGBK, -1, wstr, len);  
    len = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, NULL, 0, NULL, NULL);  
    char* str = new char[len+1];  
    memset(str, 0, len+1);  
    WideCharToMultiByte(CP_UTF8, 0, wstr, -1, str, len, NULL, NULL);  
    string strTemp = str;  
    if(wstr) delete[] wstr;  
    if(str) delete[] str;  
    return strTemp;  
}

四、字符串由UTF-8编码转换成GBK编码

string UtfToGbk(const char* utf8)
{
    int len = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
    wchar_t* wstr = new wchar_t[len+1];
    memset(wstr, 0, len+1);
    MultiByteToWideChar(CP_UTF8, 0, utf8, -1, wstr, len);
    len = WideCharToMultiByte(CP_ACP, 0, wstr, -1, NULL, 0, NULL, NULL);
    char* str = new char[len+1];
    memset(str, 0, len+1);
    WideCharToMultiByte(CP_ACP, 0, wstr, -1, str, len, NULL, NULL);
    if(wstr) delete[] wstr;
    return str;
}

// Converts UTF-8 text to GBK using the C locale machinery
// (mbstowcs under "zh_CN.utf8", then wcstombs under "zh_CN.gbk").
//
// gbkStr: output; assigned only when the conversion succeeds.
// srcStr: UTF-8 input; converted up to its first embedded null, if any.
// Returns true on success (an empty input converts to an empty output),
// false when a required locale is unavailable or the text cannot be
// converted; a diagnostic is printed to stdout in that case.
//
// NOTE: setlocale(LC_ALL, ...) changes the process-wide locale and this
// function does not restore the previous value, so on return the locale is
// whichever one was set last. It is therefore not safe to call concurrently
// with other locale-dependent code.
bool Utf82gbk(std::string &gbkStr, std::string &srcStr)
{
    const size_t kConversionError = static_cast<size_t>(-1);

    // Stage 1: decode the UTF-8 bytes into wide characters.
    if (NULL == setlocale(LC_ALL, "zh_CN.utf8"))
    {
        printf("Bad Parameter\n");
        return false;
    }

    const size_t wideLen = mbstowcs(NULL, srcStr.c_str(), 0); // length without terminator
    if (wideLen == kConversionError)
    {
        printf("Can not Transfer!!!\n");
        return false;
    }
    // +1 leaves room for the terminator mbstowcs writes; the buffer is owned
    // by std::wstring so the early returns below cannot leak it.
    std::wstring wide(wideLen + 1, L'\0');
    if (mbstowcs(&wide[0], srcStr.c_str(), wideLen + 1) == kConversionError)
    {
        printf("Can not Transfer!!!\n");
        return false;
    }

    // Stage 2: encode the wide characters as GBK.
    if (NULL == setlocale(LC_ALL, "zh_CN.gbk"))
    {
        printf("Bad Parameter\n");
        return false;
    }

    const size_t gbkLen = wcstombs(NULL, wide.c_str(), 0); // length without terminator
    if (gbkLen == kConversionError)
    {
        printf("Can not Transfer!!!\n");
        return false;
    }
    // Sized from the measured length instead of a fixed 10 KB stack array,
    // so long inputs no longer overflow the buffer.
    std::string gbk(gbkLen + 1, '\0');
    if (wcstombs(&gbk[0], wide.c_str(), gbkLen + 1) == kConversionError)
    {
        printf("Can not Transfer!!!\n");
        return false;
    }
    gbk.resize(gbkLen); // drop the terminator slot

    gbkStr.swap(gbk);
    return true;
}


string UTF8ToGBK(const std::string& strUTF8)    
{    
    int len = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, NULL, 0);    
    WCHAR* wszGBK = new WCHAR[len+1];  
    memset(wszGBK, 0, len * 2 + 2);    
    MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)(LPCTSTR)strUTF8.c_str(), -1, wszGBK, len);    
  
    len = WideCharToMultiByte(CP_ACP, 0, wszGBK, -1, NULL, 0, NULL, NULL);    
    char *szGBK = new char[len + 1];    
    memset(szGBK, 0, len + 1);    
    WideCharToMultiByte(CP_ACP,0, wszGBK, -1, szGBK, len, NULL, NULL);     
    std::string strTemp(szGBK);    
    delete[]szGBK;    
    delete[]wszGBK;    
    return strTemp;    
}

字符串UTF-8和GBK之间的转换以及判定

猜你喜欢