// C: parsing UTF-8 into uint8_t / uint16_t / uint32_t values for use with stb_truetype
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
//===============================================================
//
// UTF-8
//
//===============================================================
// MY_ASSERT may be supplied by the including code; when it is not, it falls
// back to the standard assert().
#ifndef MY_ASSERT
#include <assert.h>
#define MY_ASSERT(expr) assert(expr)
#endif
// Value stored in place of any code point that fails decoding or validation
// (U+FFFD, the Unicode replacement character).
#define MY_UTF8_INVALID 0xFFFD
// Maximum number of bytes a single encoded code point may occupy.
#define MY_UTF8_SIZE 4
// Element count of a true array (must not be used on a pointer).
#define MY_LEN(a) (sizeof(a)/sizeof(a)[0])
// Half-open range test: true when a <= x < b. Note that b itself is excluded
// and that x is evaluated twice.
#define MY_BETWEEN(x, a, b) ((a) <= (x) && (x) < (b))
// The four tables below share one index: 0 describes a continuation byte
// (10xxxxxx) and n = 1..MY_UTF8_SIZE describes the lead byte of an n-byte sequence.
// A byte belongs to class n when (byte & MY_UTF8MASK[n]) == MY_UTF8BYTE[n].
static const uint8_t MY_UTF8BYTE[MY_UTF8_SIZE + 1] = { 0x80, 0, 0xC0, 0xE0, 0xF0 };
static const uint8_t MY_UTF8MASK[MY_UTF8_SIZE + 1] = { 0xC0, 0x80, 0xE0, 0xF0, 0xF8 };
// Smallest and largest code point representable by an n-byte sequence; index 0
// spans the whole range 0..0x10FFFF and is used when no length is known yet.
static const uint32_t MY_UTF8MIN[MY_UTF8_SIZE + 1] = { 0, 0, 0x80, 0x800, 0x10000 };
static const uint32_t MY_UTF8MAX[MY_UTF8_SIZE + 1] = { 0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF };
static int my_utf8_validate(uint32_t* u, int i)
{
MY_ASSERT(u);
if (!u)
{
return 0;
}
if (!MY_BETWEEN(*u, MY_UTF8MIN[i], MY_UTF8MAX[i]) || MY_BETWEEN(*u, 0xD800, 0xDFFF))
{
*u = MY_UTF8_INVALID;
}
for (i = 1; *u > MY_UTF8MAX[i]; ++i);
return i;
}
// Classifies a single UTF-8 byte and extracts its payload bits.
//
// c: the byte to inspect.
// i: receives the matching table index: 0 for a continuation byte, n for the
//    lead byte of an n-byte sequence, or MY_LEN(MY_UTF8MASK) when the byte
//    matches no pattern.
//
// Returns the bits of c left after removing the marker bits, or 0 when no
// pattern matches or i is NULL.
static uint32_t my_utf8_decode_byte(char c, int* i)
{
    const uint8_t byte = (uint8_t)c;
    const int kinds = (int)MY_LEN(MY_UTF8MASK);
    int kind = 0;

    MY_ASSERT(i);
    if (!i)
    {
        return 0;
    }
    while (kind < kinds)
    {
        if ((byte & MY_UTF8MASK[kind]) == MY_UTF8BYTE[kind])
        {
            *i = kind;
            return (uint8_t)(byte & ~MY_UTF8MASK[kind]);
        }
        ++kind;
    }
    // No pattern matched: report the one-past-the-end index.
    *i = kind;
    return 0;
}
// Decodes one code point from the start of a UTF-8 byte buffer.
//
// c:    input bytes.
// u:    receives the decoded code point, or MY_UTF8_INVALID when the bytes do
//       not form a valid sequence. Left untouched when 0 is returned because
//       of a NULL argument or clen <= 0.
// clen: number of readable bytes at c.
//
// Returns the number of bytes consumed:
//   - 0 when an argument is NULL, clen <= 0, or the sequence is cut off by
//     the end of the buffer;
//   - 1 when the first byte is not a valid lead byte (a stray continuation
//     byte or a byte matching no pattern);
//   - the number of bytes read so far when a non-continuation byte interrupts
//     the sequence;
//   - otherwise the full sequence length (1..MY_UTF8_SIZE).
static int my_utf8_decode(const char* c, uint32_t* u, int clen)
{
    int j;
    int len = 0;
    int type = 0;
    uint32_t udecoded;

    MY_ASSERT(c);
    MY_ASSERT(u);
    // A negative length must not reach the c[0] read below.
    if (!c || !u || clen <= 0)
    {
        return 0;
    }
    *u = MY_UTF8_INVALID;
    udecoded = my_utf8_decode_byte(c[0], &len);
    // Inclusive upper bound: 4-byte sequences (len == MY_UTF8_SIZE) are valid.
    if (len < 1 || len > MY_UTF8_SIZE)
    {
        return 1;
    }
    for (j = 1; j < clen && j < len; ++j)
    {
        udecoded = (udecoded << 6) | my_utf8_decode_byte(c[j], &type);
        if (type != 0)
        {
            // Not a continuation byte: stop before it so the caller can
            // resynchronize there.
            return j;
        }
    }
    if (j < len)
    {
        // Ran out of input in the middle of a sequence.
        return 0;
    }
    *u = udecoded;
    my_utf8_validate(u, len);
    return len;
}
// Builds one encoded byte: the marker bits for table index i combined with the
// low bits of u that the marker leaves free.
//
// u: value whose low bits supply the payload.
// i: table index (0 = continuation byte, n = lead byte of an n-byte sequence).
static char my_utf8_encode_byte(uint32_t u, int i)
{
    const uint8_t marker = MY_UTF8BYTE[i];
    const uint8_t payload = (uint8_t)((uint8_t)u & ~MY_UTF8MASK[i]);

    return (char)(marker | payload);
}
// Encodes one code point as UTF-8.
//
// u:    code point to encode; a value rejected by my_utf8_validate() is
//       encoded as MY_UTF8_INVALID instead.
// c:    output buffer; no terminating NUL is written.
// clen: capacity of c in bytes.
//
// Returns the number of bytes written (1..MY_UTF8_SIZE), or 0 when c is NULL
// or the buffer is too small.
static int my_utf8_encode(uint32_t u, char* c, int clen)
{
    int len, i;

    // Reject a missing output buffer instead of writing through NULL.
    if (!c)
    {
        return 0;
    }
    len = my_utf8_validate(&u, 0);
    if (clen < len || !len || len > MY_UTF8_SIZE)
    {
        return 0;
    }
    // Fill continuation bytes from the end, consuming 6 bits each.
    for (i = len - 1; i != 0; --i)
    {
        c[i] = my_utf8_encode_byte(u, 0);
        u >>= 6;
    }
    // The remaining high bits go into the lead byte.
    c[0] = my_utf8_encode_byte(u, len);
    return len;
}
// Counts the glyphs (decoded code points) in a UTF-8 buffer.
//
// str: input bytes.
// len: number of bytes at str.
//
// Each byte run the decoder consumes counts as one glyph, including runs it
// reports as invalid. Counting stops at the first position where the decoder
// consumes nothing, e.g. a sequence cut off by the end of the buffer.
//
// Returns the glyph count, or 0 when str is NULL or len <= 0.
static int my_utf8_len(const char* str, int len)
{
    int glyphs = 0;
    int consumed = 0;
    uint32_t unicode;

    MY_ASSERT(str);
    // A negative length is treated as empty so that no byte is ever read.
    if (!str || len <= 0)
    {
        return 0;
    }
    while (consumed < len)
    {
        const int glyph_len = my_utf8_decode(str + consumed, &unicode, len - consumed);

        if (!glyph_len)
        {
            break;
        }
        glyphs++;
        consumed += glyph_len;
    }
    return glyphs;
}
// Locates the glyph with the given zero-based index in a UTF-8 buffer.
//
// buffer:  input bytes.
// length:  number of bytes at buffer; values <= 0 are treated as empty.
// index:   zero-based glyph index.
// unicode: receives the code point of the glyph found, otherwise
//          MY_UTF8_INVALID.
// len:     receives the byte length of the glyph found, otherwise 0.
//
// Returns a pointer to the first byte of the glyph. When index equals the
// number of glyphs that could be decoded, the position right after the last
// decoded glyph is returned with *len == 0 and *unicode == MY_UTF8_INVALID.
// Returns NULL when an argument is NULL, index is negative, or index lies
// beyond that position.
static const char* my_utf8_at(const char* buffer, int length, int index, uint32_t* unicode, int* len)
{
    int i = 0;
    int src_len = 0;
    int text_len;

    MY_ASSERT(buffer);
    MY_ASSERT(unicode);
    MY_ASSERT(len);
    if (!buffer || !unicode || !len)
    {
        return NULL;
    }
    // Give both outputs a defined value on every path.
    *unicode = MY_UTF8_INVALID;
    *len = 0;
    if (index < 0)
    {
        return NULL;
    }
    text_len = (length > 0) ? length : 0;
    while (src_len < text_len)
    {
        const int glyph_len = my_utf8_decode(buffer + src_len, unicode, text_len - src_len);

        if (!glyph_len)
        {
            // Sequence cut off by the end of the buffer.
            break;
        }
        if (i == index)
        {
            *len = glyph_len;
            return buffer + src_len;
        }
        i++;
        src_len += glyph_len;
    }
    // No glyph at the requested index: discard whatever the last decode left.
    *unicode = MY_UTF8_INVALID;
    if (i != index)
    {
        return NULL;
    }
    return buffer + src_len;
}