C++判断字符串编码格式(ANSI\UTF16_LE\UTF16_BE\UTF8\UTF8_BOM)
// Text encodings reported by the detection helpers.
enum Encode
{
    ANSI     = 1, // no BOM found and the bytes were not recognized as UTF-8
    UTF16_LE = 2, // starts with the byte-order mark FF FE
    UTF16_BE = 3, // starts with the byte-order mark FE FF
    UTF8_BOM = 4, // starts with the byte-order mark EF BB BF
    UTF8     = 5  // no BOM, but the bytes were recognized as UTF-8
};
/// Classifies a BOM-less byte buffer as UTF-8 or ANSI.
///
/// @param data  Pointer to the first byte of the buffer (may be null only when the
///              buffer is to be treated as empty/ANSI).
/// @param size  Number of bytes available at @p data.
/// @return Encode::UTF8 when the whole buffer is well-formed UTF-8 AND contains at
///         least one non-ASCII byte; Encode::ANSI otherwise (including empty input,
///         pure 7-bit ASCII, truncated sequences and any malformed sequence).
///
/// Validation follows RFC 3629: only 1- to 4-byte sequences are accepted, and
/// overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and code points above
/// U+10FFFF are rejected. Being strict here matters because legacy double-byte
/// code pages can otherwise produce byte runs that merely look like UTF-8.
static inline
Encode IsUtf8Data(const uint8_t* data, size_t size)
{
    if (data == nullptr)
    {
        return Encode::ANSI;
    }

    bool sawNonAscii = false;
    size_t i = 0;
    while (i < size)
    {
        const uint8_t lead = data[i];
        if (lead < 0x80)
        {
            // Plain ASCII byte: valid in both ANSI and UTF-8, decides nothing.
            ++i;
            continue;
        }
        sawNonAscii = true;

        // Number of continuation bytes that must follow the lead byte, and the
        // allowed range of the FIRST continuation byte (narrower than 80..BF for
        // a few lead bytes in order to exclude overlongs/surrogates/out-of-range).
        size_t trailing = 0;
        uint8_t firstMin = 0x80;
        uint8_t firstMax = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF)
        {
            trailing = 1;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            trailing = 2;
            if (lead == 0xE0)
            {
                firstMin = 0xA0; // below A0 would be an overlong 3-byte form
            }
            else if (lead == 0xED)
            {
                firstMax = 0x9F; // A0..BF would encode a UTF-16 surrogate
            }
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            trailing = 3;
            if (lead == 0xF0)
            {
                firstMin = 0x90; // below 90 would be an overlong 4-byte form
            }
            else if (lead == 0xF4)
            {
                firstMax = 0x8F; // above 8F would exceed U+10FFFF
            }
        }
        else
        {
            // Stray continuation byte (80..BF), overlong lead (C0/C1),
            // or a byte that never appears in UTF-8 (F5..FF).
            return Encode::ANSI;
        }

        // The sequence must fit entirely inside the buffer.
        if (size - i <= trailing)
        {
            return Encode::ANSI;
        }

        const uint8_t first = data[i + 1];
        if (first < firstMin || first > firstMax)
        {
            return Encode::ANSI;
        }
        for (size_t k = 2; k <= trailing; ++k)
        {
            if ((data[i + k] & 0xC0) != 0x80)
            {
                return Encode::ANSI;
            }
        }

        i += trailing + 1;
    }

    // Pure ASCII is reported as ANSI: no conversion is needed for it.
    return sawNonAscii ? Encode::UTF8 : Encode::ANSI;
}
/// Detects the text encoding of a byte buffer.
///
/// A leading byte-order mark is checked first; if none is present the decision
/// is delegated to IsUtf8Data().
///
/// @param data  Pointer to the first byte of the buffer.
/// @param size  Number of bytes available at @p data.
/// @return Encode::UTF16_LE for a leading FF FE, Encode::UTF16_BE for FE FF,
///         Encode::UTF8_BOM for EF BB BF, otherwise whatever IsUtf8Data() returns.
///
/// A buffer that consists of nothing but the BOM is still reported as that
/// encoding (the length checks are ">= BOM length", not "> BOM length").
///
/// NOTE(review): only the first two bytes are inspected for the UTF-16 marks, so a
/// UTF-32 LE BOM (FF FE 00 00) is reported as UTF16_LE; Encode has no UTF-32 value.
static inline
Encode DetectEncode(const uint8_t* data, size_t size)
{
    if (size >= 2 && data[0] == 0xFF && data[1] == 0xFE)
    {
        return Encode::UTF16_LE;
    }
    if (size >= 2 && data[0] == 0xFE && data[1] == 0xFF)
    {
        return Encode::UTF16_BE;
    }
    if (size >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
    {
        return Encode::UTF8_BOM;
    }
    return IsUtf8Data(data, size);
}
调用例子:
// Load the file as raw bytes, detect its encoding and normalize the text in `s`.
// NOTE(review): FILE_READER is assumed to return a std::string-like byte buffer,
// and WToA / UTF8ToW are assumed to convert wide->ANSI and UTF-8->wide; confirm.
auto s = FILE_READER(sv.begin()->c_str(), std::ios::binary);

// Assembles UTF-16 code units from raw bytes in the requested byte order, one
// wchar_t per code unit. Building each unit explicitly (instead of casting the
// byte buffer to wchar_t*) makes big-endian input decode correctly and does not
// depend on the host byte order or buffer alignment. A trailing odd byte is ignored.
const auto decodeUtf16 = [](const uint8_t* bytes, size_t size, bool bigEndian) -> std::wstring
{
    std::wstring wide;
    wide.reserve(size / 2);
    for (size_t i = 0; i + 1 < size; i += 2)
    {
        const unsigned first = bytes[i];
        const unsigned second = bytes[i + 1];
        const unsigned unit = bigEndian ? ((first << 8) | second)
                                        : ((second << 8) | first);
        wide.push_back(static_cast<wchar_t>(unit));
    }
    return wide;
};

const uint8_t* const raw = reinterpret_cast<const uint8_t*>(s.data());
switch (DetectEncode(raw, s.size()))
{
case ANSI:
    // Already in the target form: leave the bytes untouched.
    break;
case UTF16_LE:
    // Skip the 2-byte BOM; the wide string is fully built before `s` is reassigned.
    s = StringConvertUtils::Instance()->WToA(decodeUtf16(raw + 2, s.size() - 2, false));
    break;
case UTF16_BE:
    // Same as above, but each code unit is stored high byte first.
    s = StringConvertUtils::Instance()->WToA(decodeUtf16(raw + 2, s.size() - 2, true));
    break;
case UTF8_BOM:
    // Drop the 3-byte BOM in one erase, then convert like plain UTF-8.
    s.erase(s.begin(), s.begin() + 3);
    s = StringConvertUtils::Instance()->WToA(StringConvertUtils::Instance()->UTF8ToW(s));
    break;
case UTF8:
    s = StringConvertUtils::Instance()->WToA(StringConvertUtils::Instance()->UTF8ToW(s));
    break;
default:
    break;
}
// Assembles UTF-16 code units from raw bytes in the requested byte order, one
// wchar_t per code unit. A trailing odd byte is ignored.
static std::wstring DecodeUtf16Bytes(const uint8_t* bytes, size_t size, bool bigEndian)
{
    std::wstring wide;
    wide.reserve(size / 2);
    for (size_t i = 0; i + 1 < size; i += 2)
    {
        const unsigned first = bytes[i];
        const unsigned second = bytes[i + 1];
        const unsigned unit = bigEndian ? ((first << 8) | second)
                                        : ((second << 8) | first);
        wide.push_back(static_cast<wchar_t>(unit));
    }
    return wide;
}

/// Reads a file as raw bytes, detects its encoding with DetectEncode() and returns
/// the content after conversion through StringConvertUtils.
///
/// @param fileName  Path handed to FILE_READER (opened with std::ios::binary).
/// @return The file content; data classified as ANSI is returned unchanged, any
///         BOM is stripped from the other encodings before conversion.
///
/// UTF-16 input is decoded byte-by-byte in the detected byte order, so big-endian
/// files are handled correctly and the result does not depend on reinterpreting
/// the byte buffer as wchar_t.
///
/// NOTE(review): FILE_READER is assumed to return a std::string-like byte buffer,
/// and WToA / UTF8ToW are assumed to convert wide->ANSI and UTF-8->wide; confirm.
std::string AnsiReadFile(const std::string & fileName)
{
    auto s = FILE_READER(fileName, std::ios::binary);
    const uint8_t* const raw = reinterpret_cast<const uint8_t*>(s.data());

    switch (DetectEncode(raw, s.size()))
    {
    case ANSI:
        // Already in the target form: leave the bytes untouched.
        break;
    case UTF16_LE:
        // Skip the 2-byte BOM; the wide string is fully built before `s` is reassigned.
        s = StringConvertUtils::Instance()->WToA(DecodeUtf16Bytes(raw + 2, s.size() - 2, false));
        break;
    case UTF16_BE:
        // Same as above, but each code unit is stored high byte first.
        s = StringConvertUtils::Instance()->WToA(DecodeUtf16Bytes(raw + 2, s.size() - 2, true));
        break;
    case UTF8_BOM:
        // Drop the 3-byte BOM in one erase, then convert like plain UTF-8.
        s.erase(s.begin(), s.begin() + 3);
        s = StringConvertUtils::Instance()->WToA(StringConvertUtils::Instance()->UTF8ToW(s));
        break;
    case UTF8:
        s = StringConvertUtils::Instance()->WToA(StringConvertUtils::Instance()->UTF8ToW(s));
        break;
    default:
        break;
    }
    return s;
}