流式校验UTF8格式

由于某个工程需要校验数据是否为 UTF-8 格式,我翻了翻 RFC,发现这确实是一种挺简单的编码格式,于是直接动手写了。但是越写越感觉不对:流式输入时如果字节不够,真的需要把残缺的字节 cache 起来吗?我想肯定还有更简单的方法,果不其然,找到了这篇 paper:
http://bjoern.hoehrmann.de/utf-8/decoder/dfa/

流式校验 utf8 源码:

/*
 * Byte -> character-class table for the UTF-8 validation automaton.
 *
 * Indexed by the raw byte value (0x00..0xFF); each entry is a class number
 * in the range 0..11 that is added to the current state to index STATES.
 * The table is read-only, hence `const`. One row per 16 byte values.
 */
static const unsigned char TYPES[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    /* 0xA0 */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    /* 0xB0 */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    /* 0xC0 */ 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
    /* 0xF0 */ 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
};

/*
 * State-transition table for the UTF-8 validation automaton.
 *
 * A state is stored as the offset of its row (a multiple of 12), and a row
 * holds one next-state entry per character class 0..11, so the next state
 * is STATES[state + class]. State 0 is the accepting state and state 12 is
 * the rejecting state; every entry in row 12 leads back to 12.
 * The table is read-only, hence `const`.
 */
static const unsigned char STATES[108] = {
    /* class:       0   1   2   3   4   5   6   7   8   9  10  11 */
    /* state  0 */  0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72,
    /* state 12 */ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
    /* state 24 */ 12,  0, 12, 12, 12, 12, 12,  0, 12,  0, 12, 12,
    /* state 36 */ 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
    /* state 48 */ 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12,
    /* state 60 */ 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
    /* state 72 */ 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
    /* state 84 */ 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
    /* state 96 */ 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
};

#define UTF8_REJECT 12
#define UTF8_ACCEPT 0

/*
 * Validator context carried across successive utf8_check() calls.
 *
 * utf8_check() reads `state` before writing it, so the caller must set it
 * to UTF8_ACCEPT before the first call.
 */
typedef struct {
    /* Current automaton state; added to a character class to index STATES. */
    unsigned int  state;
}utf8_ctx_t;

/*
 * Feed one chunk of a byte stream through the UTF-8 validation automaton.
 *
 * ctx   validator context; ctx->state is read on entry and updated as
 *       bytes are consumed, so it must hold a valid state (UTF8_ACCEPT
 *       before the first chunk).
 * data  the bytes of this chunk.
 * len   number of bytes in data.
 *
 * Returns 0 as soon as a byte drives the automaton to UTF8_REJECT; the
 * context is then reset to UTF8_ACCEPT and the rest of the chunk is not
 * examined. Returns 1 if every byte was consumed without a rejection.
 * A return of 1 does not imply ctx->state == UTF8_ACCEPT: the chunk may
 * have ended in an intermediate state, and the state is kept in ctx for
 * the next call.
 */
int utf8_check(utf8_ctx_t *ctx, unsigned char *data, unsigned int len)
{
    const unsigned char *cursor = data;
    unsigned int         remaining = len;

    while (remaining-- > 0) {
        /* Classify the byte, then look up the transition for that class. */
        unsigned int klass = TYPES[(*cursor++) & 0xFF];
        unsigned int next = STATES[ctx->state + klass];

        if (next == UTF8_REJECT) {
            /* Invalid byte: leave the context ready for fresh input. */
            ctx->state = UTF8_ACCEPT;
            return 0;
        }

        ctx->state = next;
    }

    return 1;
}

猜你喜欢

转载自blog.csdn.net/mrpre/article/details/80932692