linux应用程序_1_文本浏览器_2_encoding_2_各种编码格式

linux应用程序_1_文本浏览器_2_encoding_2_各种编码格式

记事本默认以ANSI编码格式保存文件(英文字符按ascii编码,简体中文系统下中文字符按gbk编码)

英文:ascii编码,一个字符占一字节,编码小于128

中文:gbk编码,一个字占两字节,首字节(下文代码中作为低字节)编码大于等于128

utf-8是unicode字符集的一种编码实现,采用变长字节编码方式

第一个字节,从最高位开始,连续的N个1,决定这个字由几个字节表示,组合时舍去这N个1

N=0——————1个字节

N=2——————2个字节(N=1即10xxxxxx,是后续字节的标志,不会作为合法的首字节出现)

N=k(2<k<7)——k个字节

其余字节,最高位固定为1,次高位固定为0,组合时只取低六位

utf-16是以两字节为单位的编码格式(常用字符一个字占两字节),分为be(大端)和le(小端)两种

对于utf-16be,高字节在前,低字节在后

对于utf-16le,低字节在前,高字节在后

以ascii.c为例分析:

编码结构体:

/*
 * Encoding descriptor for the "ascii" decoder. AsciiEncodingInit() passes it
 * to AddFontOprForEncoding() and RegisterEncoding().
 */
static T_EncodingOpr g_tAsciiEncodingOpr = {
	.pcName   = "ascii",
	.iHeadLen = 0,	/* NOTE(review): presumably the number of header (BOM) bytes to skip - confirm in encoding_manager.h */
	.IsSuppot = AsciiIsSuppot,	/* returns 1 when the data carries no UTF-8/UTF-16 byte-order mark */
	.GetCode  = AsciiGetCode,	/* decodes one character code and returns the bytes it occupies */
};

编码格式解析:

如果不是utf-8、utf16le、utf16be,这里认为就是ascii

/*
 * Decide whether a buffer should be handled by the "ascii" decoder.
 *
 * pcBufHead: start of the file data; at most its first three bytes are
 *            compared against the known byte-order marks.
 *
 * Returns 0 when the data starts with a UTF-8, UTF-16LE or UTF-16BE
 * byte-order mark, and 1 otherwise.
 */
static int AsciiIsSuppot(char *pcBufHead)
{
	const char acBomUtf8[]    = {(char)0xEF, (char)0xBB, (char)0xBF};
	const char acBomUtf16le[] = {(char)0xFF, (char)0xFE};
	const char acBomUtf16be[] = {(char)0xFE, (char)0xFF};
	int iHasBom;

	/* Short-circuit evaluation keeps the UTF-8, LE, BE check order. */
	iHasBom = (strncmp(acBomUtf8, pcBufHead, 3) == 0)
	       || (strncmp(acBomUtf16le, pcBufHead, 2) == 0)
	       || (strncmp(acBomUtf16be, pcBufHead, 2) == 0);

	return iHasBom ? 0 : 1;
}

解析、获取编码:

以指针取回编码字符,函数返回字符占据字节数,以供后面处理字符使用

/*
 * Fetch one character code from an ASCII/GBK byte stream.
 *
 * pucBufEndStart: first byte to decode.
 * pucBufEndEnd:   one past the last valid byte of the buffer.
 * pdwCode:        receives the decoded code.
 *
 * Returns the number of bytes the character occupies:
 *   1 - a single byte below 0x80;
 *   2 - a double-byte character, stored as (second byte << 8) + first byte;
 *   1 - a lone byte >= 0x80 at the very end of the buffer (truncated
 *       character; the raw byte is still handed back);
 *   0 - no bytes left, *pdwCode is not written.
 */
static int AsciiGetCode(unsigned char *pucBufEndStart,unsigned char *pucBufEndEnd, unsigned int *pdwCode)
{
	unsigned char *pucBuf = pucBufEndStart;
	unsigned char ucOneChar;

	/* Check the bounds before reading anything from the buffer. */
	if(pucBuf >= pucBufEndEnd)
	{
		return 0;
	}

	ucOneChar = *pucBuf;

	if(ucOneChar < 0x80)
	{
		*pdwCode = ucOneChar;
		return 1;
	}
	if((pucBuf + 1) < pucBufEndEnd)
	{
		*pdwCode = ((unsigned int)pucBuf[1]<<8) + pucBuf[0];
		return 2;
	}

	/* Only one byte remains, so exactly one byte is consumed. */
	*pdwCode = ucOneChar;
	return 1;
}

ascii编码初始化:

添加支持的字库,注册编码结构体

/*
 * Set up the "ascii" encoding: attach the "freetype", "gbk" and "ascii" font
 * operations (in that order) to its descriptor, then register the descriptor.
 *
 * Returns whatever RegisterEncoding() returns.
 */
int AsciiEncodingInit(void)
{
	char *apcFontNames[] = {"freetype", "gbk", "ascii"};
	int iFontCnt = (int)(sizeof(apcFontNames) / sizeof(apcFontNames[0]));
	int i;

	for(i = 0; i < iFontCnt; i++)
	{
		AddFontOprForEncoding(&g_tAsciiEncodingOpr, GetFontOpr(apcFontNames[i]));
	}

	return RegisterEncoding(&g_tAsciiEncodingOpr);
}

(四种编码的)完整代码:

ascii.c

#include <config.h>
#include <encoding_manager.h>
#include <fonts_manager.h>
#include <string.h>

/* Forward declarations so the descriptor below can reference the callbacks. */
static int AsciiIsSuppot(char *pcBufHead);
static int AsciiGetCode(unsigned char *pucBufEndStart, unsigned char *pucBufEndEnd, unsigned int *pdwCode);


/*
 * Encoding descriptor for the "ascii" decoder. AsciiEncodingInit() passes it
 * to AddFontOprForEncoding() and RegisterEncoding().
 */
static T_EncodingOpr g_tAsciiEncodingOpr = {
	.pcName   = "ascii",
	.iHeadLen = 0,	/* NOTE(review): presumably the number of header (BOM) bytes to skip - confirm in encoding_manager.h */
	.IsSuppot = AsciiIsSuppot,	/* returns 1 when the data carries no UTF-8/UTF-16 byte-order mark */
	.GetCode  = AsciiGetCode,	/* decodes one character code and returns the bytes it occupies */
};
	
/*
 * Decide whether a buffer should be handled by the "ascii" decoder.
 *
 * pcBufHead: start of the file data; at most its first three bytes are
 *            compared against the known byte-order marks.
 *
 * Returns 0 when the data starts with a UTF-8, UTF-16LE or UTF-16BE
 * byte-order mark, and 1 otherwise.
 */
static int AsciiIsSuppot(char *pcBufHead)
{
	const char acBomUtf8[]    = {(char)0xEF, (char)0xBB, (char)0xBF};
	const char acBomUtf16le[] = {(char)0xFF, (char)0xFE};
	const char acBomUtf16be[] = {(char)0xFE, (char)0xFF};
	int iHasBom;

	/* Short-circuit evaluation keeps the UTF-8, LE, BE check order. */
	iHasBom = (strncmp(acBomUtf8, pcBufHead, 3) == 0)
	       || (strncmp(acBomUtf16le, pcBufHead, 2) == 0)
	       || (strncmp(acBomUtf16be, pcBufHead, 2) == 0);

	return iHasBom ? 0 : 1;
}

/*
 * Fetch one character code from an ASCII/GBK byte stream.
 *
 * pucBufEndStart: first byte to decode.
 * pucBufEndEnd:   one past the last valid byte of the buffer.
 * pdwCode:        receives the decoded code.
 *
 * Returns the number of bytes the character occupies:
 *   1 - a single byte below 0x80;
 *   2 - a double-byte character, stored as (second byte << 8) + first byte;
 *   1 - a lone byte >= 0x80 at the very end of the buffer (truncated
 *       character; the raw byte is still handed back);
 *   0 - no bytes left, *pdwCode is not written.
 */
static int AsciiGetCode(unsigned char *pucBufEndStart,unsigned char *pucBufEndEnd, unsigned int *pdwCode)
{
	unsigned char *pucBuf = pucBufEndStart;
	unsigned char ucOneChar;

	/* Check the bounds before reading anything from the buffer. */
	if(pucBuf >= pucBufEndEnd)
	{
		return 0;
	}

	ucOneChar = *pucBuf;

	if(ucOneChar < 0x80)
	{
		*pdwCode = ucOneChar;
		return 1;
	}
	if((pucBuf + 1) < pucBufEndEnd)
	{
		*pdwCode = ((unsigned int)pucBuf[1]<<8) + pucBuf[0];
		return 2;
	}

	/* Only one byte remains, so exactly one byte is consumed. */
	*pdwCode = ucOneChar;
	return 1;
}

/*
 * Set up the "ascii" encoding: attach the "freetype", "gbk" and "ascii" font
 * operations (in that order) to its descriptor, then register the descriptor.
 *
 * Returns whatever RegisterEncoding() returns.
 */
int AsciiEncodingInit(void)
{
	char *apcFontNames[] = {"freetype", "gbk", "ascii"};
	int iFontCnt = (int)(sizeof(apcFontNames) / sizeof(apcFontNames[0]));
	int i;

	for(i = 0; i < iFontCnt; i++)
	{
		AddFontOprForEncoding(&g_tAsciiEncodingOpr, GetFontOpr(apcFontNames[i]));
	}

	return RegisterEncoding(&g_tAsciiEncodingOpr);
}

utf8.c

#include <config.h>
#include <encoding_manager.h>
#include <fonts_manager.h>
#include <string.h>
#include <stdio.h>

/* Forward declarations so the descriptor below can reference the callbacks. */
static int Utf8IsSuppot(char *pcBufHead);
static int Utf8GetCode(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode);


/*
 * Encoding descriptor for the "utf8" decoder. Utf8EncodingInit() passes it
 * to AddFontOprForEncoding() and RegisterEncoding().
 */
static T_EncodingOpr g_tUtf8EncodingOpr = {
	.pcName   = "utf8",
	.iHeadLen = 3,	/* same length as the EF BB BF mark Utf8IsSuppot() looks for; NOTE(review): presumably header bytes to skip - confirm */
	.IsSuppot = Utf8IsSuppot,	/* returns 1 when the data starts with the UTF-8 byte-order mark */
	.GetCode  = Utf8GetCode,	/* decodes one code point and returns the bytes it occupies */
};
	
/*
 * Report whether a buffer starts with the UTF-8 byte-order mark EF BB BF.
 *
 * pcBufHead: start of the file data; at most three bytes are compared.
 *
 * Returns 1 when the mark is present, 0 otherwise.
 */
static int Utf8IsSuppot(char *pcBufHead)
{
	const char acBom[] = {(char)0xEF, (char)0xBB, (char)0xBF};

	return (strncmp(acBom, pcBufHead, 3) == 0) ? 1 : 0;
}

/*
 * Count the consecutive 1 bits of ucVal, starting from the most significant
 * bit and stopping at the first 0 bit.
 *
 * Returns a value from 0 (top bit clear) to 8 (ucVal == 0xFF).
 */
static int GetHeadBits(unsigned char ucVal)
{
	unsigned int dwMask = 0x80;
	int iOnes = 0;

	/* Walk the mask from bit 7 down to bit 0. */
	while((dwMask != 0) && (ucVal & dwMask))
	{
		iOnes++;
		dwMask >>= 1;
	}

	return iOnes;
}

/*
 * Decode one UTF-8 sequence into a code value.
 *
 * pucBufStart: first byte of the sequence.
 * pucBufEnd:   one past the last valid byte of the buffer.
 * pdwCode:     receives the decoded value.
 *
 * The count of leading 1 bits in the first byte gives the sequence length.
 * Those bits are dropped from the first byte, and each following byte
 * contributes its low six bits. Following bytes are not validated.
 *
 * Returns the number of bytes consumed (1 when the first byte has no leading
 * 1 bits), or 0 when the buffer is exhausted or the sequence would run past
 * pucBufEnd.
 */
static int Utf8GetCode(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode)
{
	int iLeadOnes;
	int iIdx;
	unsigned int dwAcc;

	if(pucBufStart >= pucBufEnd)
	{
		DBG_PRINT("End\r\n");
		return 0;
	}

	iLeadOnes = GetHeadBits(pucBufStart[0]);

	if(pucBufStart + iLeadOnes > pucBufEnd)
	{
		DBG_PRINT("Error at Utf8GetCode : pucBufStart + iBits > pucBufEnd\r\n");
		return 0;
	}

	/* Plain single-byte character: hand it back unchanged. */
	if(iLeadOnes == 0)
	{
		*pdwCode = pucBufStart[0];
		return 1;
	}

	/* Mask off the length-marker bits of the first byte. */
	dwAcc = pucBufStart[0] & (0xffu >> iLeadOnes);

	/* Append the low six bits of every following byte. */
	for(iIdx = 1; iIdx < iLeadOnes; iIdx++)
	{
		dwAcc = (dwAcc << 6) | (pucBufStart[iIdx] & 0x3f);
	}

	*pdwCode = dwAcc;
	return iLeadOnes;
}

/*
 * Set up the "utf8" encoding: attach the "freetype", "gbk" and "ascii" font
 * operations (in that order) to its descriptor, then register the descriptor.
 *
 * Returns whatever RegisterEncoding() returns.
 */
int Utf8EncodingInit(void)
{
	char *apcFontNames[] = {"freetype", "gbk", "ascii"};
	int iFontCnt = (int)(sizeof(apcFontNames) / sizeof(apcFontNames[0]));
	int i;

	for(i = 0; i < iFontCnt; i++)
	{
		AddFontOprForEncoding(&g_tUtf8EncodingOpr, GetFontOpr(apcFontNames[i]));
	}

	return RegisterEncoding(&g_tUtf8EncodingOpr);
}




utf16le.c

#include <config.h>
#include <encoding_manager.h>
#include <fonts_manager.h>
#include <string.h>

/* Forward declarations so the descriptor below can reference the callbacks. */
static int Utf16leIsSuppot(char *pcBufHead);
static int Utf16leGetCode(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode);


/*
 * Encoding descriptor for the "utf16le" decoder. Utf16leEncodingInit() passes
 * it to AddFontOprForEncoding() and RegisterEncoding().
 *
 * iHeadLen is 2 to match the two-byte FF FE mark that Utf16leIsSuppot()
 * requires, the same way the utf8 descriptor uses 3 for its three-byte mark.
 * NOTE(review): assumes iHeadLen is the number of header bytes skipped before
 * decoding starts - confirm against encoding_manager.
 */
static T_EncodingOpr g_tUtf16leEncodingOpr = {
	.pcName   = "utf16le",
	.iHeadLen = 2,
	.IsSuppot = Utf16leIsSuppot,
	.GetCode  = Utf16leGetCode,
};
	
/*
 * Report whether a buffer starts with the UTF-16 little-endian byte-order
 * mark FF FE.
 *
 * pcBufHead: start of the file data; at most two bytes are compared.
 *
 * Returns 1 when the mark is present, 0 otherwise.
 */
static int Utf16leIsSuppot(char *pcBufHead)
{
	const char acBom[] = {(char)0xFF, (char)0xFE};

	return (strncmp(acBom, pcBufHead, 2) == 0) ? 1 : 0;
}

/*
 * Decode one 16-bit little-endian code unit.
 *
 * pucBufStart: first byte of the unit (low byte).
 * pucBufEnd:   one past the last valid byte of the buffer.
 * pdwCode:     receives (second byte << 8) + first byte.
 *
 * Returns 2 when two bytes were available, otherwise 0 with *pdwCode
 * left untouched.
 */
static int Utf16leGetCode(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode)
{
	unsigned int dwLow;
	unsigned int dwHigh;

	/* Fewer than two bytes left: nothing to decode. */
	if (pucBufStart + 1 >= pucBufEnd)
	{
		return 0;
	}

	dwLow  = pucBufStart[0];
	dwHigh = pucBufStart[1];
	*pdwCode = (dwHigh << 8) + dwLow;

	return 2;
}

/*
 * Set up the "utf16le" encoding: attach the "gbk" and "ascii" font operations
 * (in that order) to its descriptor, then register the descriptor.
 *
 * Returns whatever RegisterEncoding() returns.
 */
int Utf16leEncodingInit(void)
{
	char *apcFontNames[] = {"gbk", "ascii"};
	int iFontCnt = (int)(sizeof(apcFontNames) / sizeof(apcFontNames[0]));
	int i;

	for(i = 0; i < iFontCnt; i++)
	{
		AddFontOprForEncoding(&g_tUtf16leEncodingOpr, GetFontOpr(apcFontNames[i]));
	}

	return RegisterEncoding(&g_tUtf16leEncodingOpr);
}


utf16be.c

#include <config.h>
#include <encoding_manager.h>
#include <fonts_manager.h>
#include <string.h>

/* Forward declarations so the descriptor below can reference the callbacks. */
static int Utf16beIsSuppot(char *pcBufHead);
static int Utf16beGetCode(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode);


/*
 * Encoding descriptor for the "utf16be" decoder. Utf16beEncodingInit() passes
 * it to AddFontOprForEncoding() and RegisterEncoding().
 *
 * iHeadLen is 2 to match the two-byte FE FF mark that Utf16beIsSuppot()
 * requires, the same way the utf8 descriptor uses 3 for its three-byte mark.
 * NOTE(review): assumes iHeadLen is the number of header bytes skipped before
 * decoding starts - confirm against encoding_manager.
 */
static T_EncodingOpr g_tUtf16beEncodingOpr = {
	.pcName   = "utf16be",
	.iHeadLen = 2,
	.IsSuppot = Utf16beIsSuppot,
	.GetCode  = Utf16beGetCode,
};
	
/*
 * Report whether a buffer starts with the UTF-16 big-endian byte-order
 * mark FE FF.
 *
 * pcBufHead: start of the file data; at most two bytes are compared.
 *
 * Returns 1 when the mark is present, 0 otherwise.
 */
static int Utf16beIsSuppot(char *pcBufHead)
{
	const char acBom[] = {(char)0xFE, (char)0xFF};

	return (strncmp(acBom, pcBufHead, 2) == 0) ? 1 : 0;
}

/*
 * Decode one 16-bit big-endian code unit.
 *
 * pucBufStart: first byte of the unit (high byte).
 * pucBufEnd:   one past the last valid byte of the buffer.
 * pdwCode:     receives (first byte << 8) + second byte.
 *
 * Returns 2 when two bytes were available, otherwise 0 with *pdwCode
 * left untouched.
 */
static int Utf16beGetCode(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode)
{
	/*
	 * Both pucBufStart[0] and pucBufStart[1] are read below, so two bytes
	 * must lie before pucBufEnd; a single leftover byte is not decodable.
	 */
	if(pucBufStart + 1 >= pucBufEnd)
	{
		return 0;
	}
	*pdwCode = ((unsigned int)pucBufStart[0]<<8) + pucBufStart[1];

	return 2;
}

/*
 * Set up the "utf16be" encoding: attach the "gbk" and "ascii" font operations
 * (in that order) to its descriptor, then register the descriptor.
 *
 * Returns whatever RegisterEncoding() returns.
 */
int Utf16beEncodingInit(void)
{
	char *apcFontNames[] = {"gbk", "ascii"};
	int iFontCnt = (int)(sizeof(apcFontNames) / sizeof(apcFontNames[0]));
	int i;

	for(i = 0; i < iFontCnt; i++)
	{
		AddFontOprForEncoding(&g_tUtf16beEncodingOpr, GetFontOpr(apcFontNames[i]));
	}

	return RegisterEncoding(&g_tUtf16beEncodingOpr);
}

发布了71 篇原创文章 · 获赞 4 · 访问量 7233

猜你喜欢

转载自blog.csdn.net/floatinglong/article/details/86635036