C语言UTF8解析为uint8_t uint16_t uint32_t处理stb_truetype

xingyun86 2021-12-21 1526
C语言UTF8解析为uint8_t uint16_t uint32_t处理stb_truetype
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
//===============================================================
//
//                              UTF-8
//
//===============================================================
// Assertion hook. Define MY_ASSERT before this point to override it;
// otherwise it falls back to the standard assert().
#ifndef MY_ASSERT
#include <assert.h>
#define MY_ASSERT(expr) assert(expr)
#endif

// Code point stored in place of anything that cannot be decoded or encoded.
#define MY_UTF8_INVALID 0xFFFD
// Largest number of bytes a single encoded glyph may occupy.
#define MY_UTF8_SIZE 4
// Element count of an array (must be a real array, not a pointer).
#define MY_LEN(a) (sizeof(a) / sizeof((a)[0]))
// Half-open range test: true when a <= x < b (the upper bound is excluded).
#define MY_BETWEEN(x, a, b) (((x) >= (a)) && ((x) < (b)))

// The four tables below share one index, the "byte class":
//   0    = continuation byte (also used as the whole-range entry in MIN/MAX)
//   1..4 = lead byte of a sequence that is 1..4 bytes long

// Fixed marker bits of each byte class.
static const uint8_t MY_UTF8BYTE[MY_UTF8_SIZE + 1] = {
    0x80, // class 0
    0x00, // class 1
    0xC0, // class 2
    0xE0, // class 3
    0xF0  // class 4
};

// Bits that must be compared against MY_UTF8BYTE to recognise each class;
// the remaining bits of the byte carry payload.
static const uint8_t MY_UTF8MASK[MY_UTF8_SIZE + 1] = {
    0xC0, // class 0
    0x80, // class 1
    0xE0, // class 2
    0xF0, // class 3
    0xF8  // class 4
};

// Smallest code point associated with each class.
static const uint32_t MY_UTF8MIN[MY_UTF8_SIZE + 1] = {
    0x0,    // class 0
    0x0,    // class 1
    0x80,   // class 2
    0x800,  // class 3
    0x10000 // class 4
};

// Largest code point associated with each class.
static const uint32_t MY_UTF8MAX[MY_UTF8_SIZE + 1] = {
    0x10FFFF, // class 0
    0x7F,     // class 1
    0x7FF,    // class 2
    0xFFFF,   // class 3
    0x10FFFF  // class 4
};
static int my_utf8_validate(uint32_t* u, int i)
{
    MY_ASSERT(u);
    if (!u) 
    {
        return 0;
    }
    if (!MY_BETWEEN(*u, MY_UTF8MIN[i], MY_UTF8MAX[i]) || MY_BETWEEN(*u, 0xD800, 0xDFFF))
    {
        *u = MY_UTF8_INVALID;
    }
    for (i = 1; *u > MY_UTF8MAX[i]; ++i);
    return i;
}
// Classifies one byte of a UTF-8 stream and extracts its payload bits.
//
// c: the byte to inspect.
// i: receives the byte class: 0 for a continuation byte, 1..4 for the lead
//    byte of a sequence of that length, or MY_LEN(MY_UTF8MASK) when the byte
//    matches no class.
//
// Returns the payload bits of the byte (marker bits cleared), or 0 when the
// byte matches no class or `i` is NULL.
static uint32_t my_utf8_decode_byte(char c, int* i)
{
    const uint8_t byte = (uint8_t)c;
    const int class_count = (int)MY_LEN(MY_UTF8MASK);
    int kind = 0;
    MY_ASSERT(i);
    if (!i)
    {
        return 0;
    }
    while (kind < class_count)
    {
        const uint8_t mask = MY_UTF8MASK[kind];
        if ((byte & mask) == MY_UTF8BYTE[kind])
        {
            *i = kind;
            // Keep only the bits that are not part of the class marker.
            return (uint32_t)(byte & (uint8_t)~mask);
        }
        ++kind;
    }
    // No class matched: report the out-of-range class to the caller.
    *i = kind;
    return 0;
}
static int my_utf8_decode(const char* c, uint32_t* u, int clen)
{
    int i, j, len, type = 0;
    uint32_t udecoded;
    MY_ASSERT(c);
    MY_ASSERT(u);
    if (!c || !u || !clen)
    {
        return 0;
    }
    *u = MY_UTF8_INVALID;
    udecoded = my_utf8_decode_byte(c[0], &len);
    if (!MY_BETWEEN(len, 1, MY_UTF8_SIZE))
    {
        return 1;
    }
    for (i = 1, j = 1; i < clen && j < len; ++i, ++j)
    {
        udecoded = (udecoded << 6) | my_utf8_decode_byte(c[i], &type);
        if (type != 0)
        {
            return j;
        }
    }
    if (j < len)
    {
        return 0;
    }
    *u = udecoded;
    my_utf8_validate(u, len);
    return len;
}
// Builds one byte of a UTF-8 sequence.
//
// u: source value; only the low bits that fit in the payload area of byte
//    class `i` are used.
// i: byte class (0 = continuation byte, 1..4 = lead byte of that length).
//
// Returns the class marker bits combined with the payload bits.
static char my_utf8_encode_byte(uint32_t u, int i)
{
    const uint8_t payload_bits = (uint8_t)~MY_UTF8MASK[i];
    const uint8_t payload = (uint8_t)((uint8_t)u & payload_bits);
    return (char)(MY_UTF8BYTE[i] | payload);
}
// Encodes one code point as UTF-8.
//
// u:    code point to encode. Values rejected by my_utf8_validate are encoded
//       as MY_UTF8_INVALID instead.
// c:    output buffer; no terminator is written.
// clen: capacity of `c` in bytes.
//
// Returns the number of bytes written, or 0 when `c` is NULL or the buffer
// is too small to hold the sequence.
static int my_utf8_encode(uint32_t u, char* c, int clen)
{
    int len, i;
    // Guard the output pointer before anything is written through it. This
    // is a plain check, not an assertion, so a NULL buffer always yields 0.
    if (!c)
    {
        return 0;
    }
    // Class 0 validates against the whole code point range.
    len = my_utf8_validate(&u, 0);
    if (clen < len || !len || len > MY_UTF8_SIZE)
    {
        return 0;
    }
    // Trailing bytes are filled from the end, six payload bits at a time.
    for (i = len - 1; i != 0; --i)
    {
        c[i] = my_utf8_encode_byte(u, 0);
        u >>= 6;
    }
    // The remaining high bits go into the lead byte.
    c[0] = my_utf8_encode_byte(u, len);
    return len;
}
// Counts the glyphs in a UTF-8 buffer.
//
// str: input bytes.
// len: number of bytes at `str`.
//
// Returns the number of glyphs decoded before the end of the buffer or before
// my_utf8_decode reports 0; returns 0 when `str` is NULL or `len` is 0.
static int my_utf8_len(const char* str, int len)
{
    uint32_t rune;
    int count = 0;
    int consumed = 0;
    int step;
    MY_ASSERT(str);
    if (!str || !len)
    {
        return 0;
    }
    // Each pass accounts for one glyph, then decodes the next one from the
    // position just past it.
    for (step = my_utf8_decode(str, &rune, len);
         step != 0 && consumed < len;
         step = my_utf8_decode(str + consumed, &rune, len - consumed))
    {
        ++count;
        consumed += step;
    }
    return count;
}
static const char* my_utf8_at(const char* buffer, int length, int index, uint32_t* unicode, int* len)
{
    int i = 0;
    int src_len = 0;
    int glyph_len = 0;
    const char* text;
    int text_len;
    MY_ASSERT(buffer);
    MY_ASSERT(unicode);
    MY_ASSERT(len);
    if (!buffer || !unicode || !len)
    {
        return 0;
    }
    if (index < 0) 
    {
        *unicode = MY_UTF8_INVALID;
        *len = 0;
        return 0;
    }
    text = buffer;
    text_len = length;
    glyph_len = my_utf8_decode(text, unicode, text_len);
    while (glyph_len) 
    {
        if (i == index) 
        {
            *len = glyph_len;
            break;
        }
        i++;
        src_len = src_len + glyph_len;
        glyph_len = my_utf8_decode(text + src_len, unicode, text_len - src_len);
    }
    if (i != index)
    {
        return 0;
    }
    return buffer + src_len;
}
最新回复 (0)
只看楼主
全部楼主
C语言UTF8解析为uint8_t uint16_t uint32_t处理stb_truetype

xingyun86

作者最近主题：