c语言Unicode和utf-8互转

Unicode和utf-8编码范围

 字 |  Unicode符号范围      |  UTF-8编码方式
 节 |  (十六进制)           | (二进制)
---+-----------------------+------------------------------------------------------
 1 | 0000 0000 - 0000 007F |                                              0xxxxxxx
 2 | 0000 0080 - 0000 07FF |                                     110xxxxx 10xxxxxx
 3 | 0000 0800 - 0000 FFFF |                            1110xxxx 10xxxxxx 10xxxxxx
 4 | 0001 0000 - 001F FFFF |                   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 5 | 0020 0000 - 03FF FFFF |          111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 6 | 0400 0000 - 7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

常见的Unicode字符串采用UCS-2编码：每个字符固定占2个字节，转换成utf-8后占1～3个字节（常用汉字占3个字节）。

#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>


/**
 * @brief  Convert a UCS-2 (16-bit Unicode) buffer to UTF-8.
 * @note   UCS-4 / 4-byte UTF-8 sequences are not supported.
 *         Code units are read in the host's native byte order.
 *         Conversion stops at the first 0x0000 code unit or once slen bytes
 *         have been consumed. No NUL terminator is written to dst.
 * @param  *src: input buffer, interpreted as a sequence of unsigned short
 *               code units (no alignment requirement)
 * @param  slen: input length in bytes; must be a non-negative multiple of 2
 * @param  *dst: output UTF-8 buffer
 * @param  dlen: size of the output buffer in bytes
 * @retval <0: error (-1 invalid argument, -2 odd slen, -3 dst too small),
 *         >=0: number of UTF-8 bytes written
 */
int unicode2utf8(unsigned char *src, int slen, unsigned char *dst,int dlen)
{
	const int unit_size = (int)sizeof(unsigned short);
	int olen = 0;

	if (!src || !dst || dlen <= 0 || slen < 0)
		return -1;

	// The input is consumed as whole 16-bit code units.
	if (slen % 2)
		return -2;

	for (int off = 0; slen - off >= unit_size; off += unit_size) {
		unsigned short unicode;
		int need;

		// memcpy instead of a pointer cast: src may be unaligned, and
		// reading it through an unsigned short lvalue would break aliasing rules.
		memcpy(&unicode, src + off, sizeof unicode);
		if (unicode == 0)
			break;

		// Number of UTF-8 bytes this code unit expands to.
		if (unicode <= 0x007F)
			need = 1;
		else if (unicode <= 0x07FF)
			need = 2;
		else
			need = 3;

		// Only demand the space this character actually needs.
		if (need > dlen - olen)
			return -3;

		if (need == 1) {
			// U+0000 - U+007F:  0xxxxxxx
			*dst++ = (unsigned char)(unicode & 0x7F);
		} else if (need == 2) {
			// U+0080 - U+07FF:  110xxxxx 10xxxxxx
			*dst++ = (unsigned char)(((unicode >> 6) & 0x1F) | 0xC0);
			*dst++ = (unsigned char)((unicode & 0x3F) | 0x80);
		} else {
			// U+0800 - U+FFFF:  1110xxxx 10xxxxxx 10xxxxxx
			*dst++ = (unsigned char)(((unicode >> 12) & 0x0F) | 0xE0);
			*dst++ = (unsigned char)(((unicode >> 6) & 0x3F) | 0x80);
			*dst++ = (unsigned char)((unicode & 0x3F) | 0x80);
		}
		olen += need;
	}

	return olen;
}

/**
 * @brief  Convert a NUL-terminated UTF-8 string to UCS-2 (16-bit Unicode).
 * @note   UCS-4 / 4-byte UTF-8 sequences are not supported.
 *         Each code unit is written low byte first (little-endian), and the
 *         output is terminated with two zero bytes that are not counted in
 *         the return value.
 * @param  src: input UTF-8 string (NUL-terminated)
 * @param  dst: output Unicode buffer
 * @param  dlen: size of the output buffer in bytes; it must have room for
 *               the converted text plus the 2-byte terminator
 * @retval <0: error (NULL argument, buffer too small, malformed or
 *         unsupported sequence), >=0: number of Unicode bytes written
 */
int utf82unicode(char* src, char* dst, int dlen)
{
	if (!src || !dst)
		return -1;

	// Work on unsigned bytes: plain char may be signed, and shifting a
	// negative value left is undefined behaviour.
	const unsigned char *in = (const unsigned char *)src;
	unsigned char *out = (unsigned char *)dst;
	// Number of Unicode bytes produced so far.
	int olen = 0;

	while (*in) {
		unsigned int cp;

		// Each character needs 2 output bytes.
		if (olen > (dlen - 2))
			return -1;

		if (in[0] <= 0x7F) {
			// Single-byte sequence (ASCII).
			cp = in[0];
			in += 1;
		} else if ((in[0] & 0xE0) == 0xC0) {
			// Two-byte sequence: 110xxxxx 10xxxxxx
			if ((in[1] & 0xC0) != 0x80)
				return -1;
			cp = ((unsigned int)(in[0] & 0x1F) << 6) | (in[1] & 0x3F);
			in += 2;
		} else if ((in[0] & 0xF0) == 0xE0) {
			// Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
			// Bytes are validated in order, so a string truncated in the
			// middle of a sequence is rejected before reading past its NUL.
			if ((in[1] & 0xC0) != 0x80 || (in[2] & 0xC0) != 0x80)
				return -1;
			cp = ((unsigned int)(in[0] & 0x0F) << 12)
			   | ((unsigned int)(in[1] & 0x3F) << 6)
			   | (in[2] & 0x3F);
			in += 3;
		} else {
			// Longer sequences and stray continuation bytes are not handled.
			printf("[%s](%d) err\n",__func__,__LINE__);
			return -1;
		}

		// Little-endian: low byte at the lower address.
		*out++ = (unsigned char)(cp & 0xFF);
		*out++ = (unsigned char)((cp >> 8) & 0xFF);
		olen += 2;
	}

	// The Unicode string is terminated by two zero bytes.
	if (olen > (dlen - 2))
		return -1;
	out[0] = 0;
	out[1] = 0;

	return olen;
}


// Round-trip self-test: converts "你好" from UCS-2 to UTF-8 and back,
// printing the result of each direction.
// Returns 0 when both conversions succeed, -1 if either one fails.
int unicode_utf8_test(void)
{
	printf("[%s](%d) Unicode->utf-8\n",__func__,__LINE__);
	// "你好" as UCS-2 code units: U+4F60 U+597D
	unsigned short hello[] = {0x4f60, 0x597d};
	unsigned char utf8_temp[32];
	memset(utf8_temp,0,sizeof(utf8_temp));
	// unicode2utf8 does not NUL-terminate, so keep the last zeroed byte
	// out of its reach to make the buffer safe to print with %s.
	int ret = unicode2utf8((unsigned char*)hello,(int)sizeof(hello),
	                       utf8_temp,(int)sizeof(utf8_temp)-1);
	if (ret < 0) {
		printf("[%s](%d) unicode2utf8 failed: %d\n",__func__,__LINE__,ret);
		return -1;
	}

	printf("[%s](%d) ->utf8=%s\n",__func__,__LINE__,(char*)utf8_temp);

	printf("[%s](%d) utf-8->Unicode\n",__func__,__LINE__);
	char *p="你好";
	// Room for 8 code units, including the 2-byte terminator.
	unsigned char unicode_temp[16];
	memset(unicode_temp,0,sizeof(unicode_temp));
	ret = utf82unicode(p,(char*)unicode_temp,(int)sizeof(unicode_temp));
	if (ret < 0) {
		printf("[%s](%d) utf82unicode failed: %d\n",__func__,__LINE__,ret);
		return -1;
	}

	printf("[%s](%d) ->unicode:",__func__,__LINE__);
	for (int i = 0; i < ret/2; i++)
	{
		// utf82unicode emits each code unit low byte first; reassemble it
		// explicitly so the printed value does not depend on host byte order.
		unsigned int unit = (unsigned int)unicode_temp[2*i]
		                  | ((unsigned int)unicode_temp[2*i+1] << 8);
		printf("0x%x ",unit);
	}
	printf("\n");
	return 0;
}

测试：

[unicode_utf8_test](127) Unicode->utf-8
[unicode_utf8_test](134) ->utf8=你好

[unicode_utf8_test](136) utf-8->Unicode
[unicode_utf8_test](141) ->unicode:0x4f60 0x597d

c语言Unicode和utf-8互转

猜你喜欢