如何检测用标准C UTF-8? [英] How to detect UTF-8 in plain C?
问题描述
我要寻找一个在普通的旧的C一code片段,检测到给定的字符串是UTF-8编码。我知道,与正则表达式的解决方案,但由于种种原因,效果会更好,以避免在这种特殊情况下使用什么,但纯C
与正则表达式的解决方案看起来像这样(警告:各种检查省略):
的#define UTF8_DETECT_REGEXP \"^([\\x09\\x0A\\x0D\\x20-\\x7E]|[\\xC2-\\xDF][\\x80-\\xBF]|\\xE0[\\xA0-\\xBF][\\x80-\\xBF]|[\\xE1-\\xEC\\xEE\\xEF][\\x80-\\xBF]{2}|\\xED[\\x80-\\x9F][\\x80-\\xBF]|\\xF0[\\x90-\\xBF][\\x80-\\xBF]{2}|[\\xF1-\\xF3][\\x80-\\xBF]{3}|\\xF4[\\x80-\\x8F][\\x80-\\xBF]{2})*$\"为const char *的错误;
INT error_off;
INT RC;
INT VECT [100];utf8_re = pcre_compile(UTF8_DETECT_REGEXP,PCRE_CASELESS,&安培;错误,&放大器; error_off,NULL);
utf8_pe = pcre_study(utf8_re,0,&安培;错误);RC = pcre_exec(utf8_re,utf8_pe,STR,LEN,0,0,VECT,sizeof的(VECT)/ sizeof的(VECT [0]));如果(RC大于0){
的printf(字符串是UTF8 \\ n);
}其他{
的printf(字符串不是UTF8 \\ n)
}
下面是一个(希望没有bug的)实现的纯C语言这个前pression :
_Bool is_utf8(为const char *字符串)
{
如果(!字符串)
返回0; const的无符号字符*字节=(const的无符号字符*)的字符串;
而(*字节)
{
如果((// ASCII
//使用字节[0]< = 0x7F的允许ASCII控制字符
字节[0] == 0×09 ||
字节[0] ==的0x0A ||
字节[0] == || 0X0D
(0x20的&下; =字节[0]&放大器;&放大器;字节[0]&下; = 0x7E的)
)
){
字节+ = 1;
继续;
} 如果((//非超长2字节
(为0xC2&下; =字节[0]&放大器;&放大器;字节[0]&下; = 0xDF)及&放大器;
(0x80的&下; =字节[1]&放大器;&放大器;字节[1] = 0xBF时才)
)
){
字节+ = 2;
继续;
} 如果((不包括// overlongs
字节[0] ==取0xE0&放大器;&放大器;
(0XA0&下; =字节[1]&放大器;&放大器;字节[1] = 0xBF时才)及&放大器;
(0x80的&下; =字节[2]&放大器;&放大器;字节[2]&下; = 0xBF时才)
)||
(//直3字节
((0xE1&下; =字节[0]&放大器;&放大器;字节[0]&下; = 0xEC)||
字节[0] == 0xEE ||
字节[0] == 0xEF)及&放大器;
(0x80的&下; =字节[1]&放大器;&放大器;字节[1] = 0xBF时才)及&放大器;
(0x80的&下; =字节[2]&放大器;&放大器;字节[2]&下; = 0xBF时才)
)||
(不包括//代理人
字节[0] == 0xED&放大器;&放大器;
(0x80的&下; =字节[1]&放大器;&放大器;字节[1] = 0x9F)及&放大器;
(0x80的&下; =字节[2]&放大器;&放大器;字节[2]&下; = 0xBF时才)
)
){
字节+ = 3;
继续;
} 如果((// 1-3架
字节[0] == 0XF0&放大器;&放大器;
(0×90&下; =字节[1]&放大器;&放大器;字节[1] = 0xBF时才)及&放大器;
(0x80的&下; =字节[2]&放大器;&放大器;字节[2]&下; = 0xBF时才)及&放大器;
(0x80的&下; =字节[3]及放大器;&放大器;字节[3]及下; = 0xBF时才)
)||
(//飞机4-15
(的0xf1&下; =字节[0]&放大器;&放大器;字节[0]&下; = 0xF3)及&放大器;
(0x80的&下; =字节[1]&放大器;&放大器;字节[1] = 0xBF时才)及&放大器;
(0x80的&下; =字节[2]&放大器;&放大器;字节[2]&下; = 0xBF时才)及&放大器;
(0x80的&下; =字节[3]及放大器;&放大器;字节[3]及下; = 0xBF时才)
)||
(//面16
字节[0] == 0xF4中&放大器;&放大器;
(0x80的&下; =字节[1]&放大器;&放大器;字节[1] = 0x8F)及&放大器;
(0x80的&下; =字节[2]&放大器;&放大器;字节[2]&下; = 0xBF时才)及&放大器;
(0x80的&下; =字节[3]及放大器;&放大器;字节[3]及下; = 0xBF时才)
)
){
字节+ = 4;
继续;
} 返回0;
} 返回1;
}
请注意,这是由W3C的表单验证,建议定期EX pression,这确实拒绝了一些有效的UTF-8序列的忠实翻译(特别是那些包含ASCII控制字符)。
此外,即使通过在注释中改变修复此之后,它仍然假设零终端,其中prevents嵌入NULL字符,但它应该在技术上是合法的。
当我在创造我自己的字符串库涉足,我跟UTF-8修改了(即编码NUL作为超长两个字节) - 随意使用的该标题的作为用于提供验证例程不从上述缺点影响的模板。
I am looking for a code snippet in plain old C that detects that the given string is in UTF-8 encoding. I know the solution with regex, but for various reasons it would be better to avoid using anything but plain C in this particular case.
Solution with regex looks like this (warning: various checks omitted):
#define UTF8_DETECT_REGEXP "^([\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF]{2}|[\xF1-\xF3][\x80-\xBF]{3}|\xF4[\x80-\x8F][\x80-\xBF]{2})*$"
const char *error;
int error_off;
int rc;
int vect[100];
utf8_re = pcre_compile(UTF8_DETECT_REGEXP, PCRE_CASELESS, &error, &error_off, NULL);
utf8_pe = pcre_study(utf8_re, 0, &error);
rc = pcre_exec(utf8_re, utf8_pe, str, len, 0, 0, vect, sizeof(vect)/sizeof(vect[0]));
if (rc > 0) {
printf("string is in UTF8\n");
} else {
printf("string is not in UTF8\n")
}
Here's a (hopefully bug-free) implementation of this expression in plain C:
_Bool is_utf8(const char * string)
{
if(!string)
return 0;
const unsigned char * bytes = (const unsigned char *)string;
while(*bytes)
{
if( (// ASCII
// use bytes[0] <= 0x7F to allow ASCII control characters
bytes[0] == 0x09 ||
bytes[0] == 0x0A ||
bytes[0] == 0x0D ||
(0x20 <= bytes[0] && bytes[0] <= 0x7E)
)
) {
bytes += 1;
continue;
}
if( (// non-overlong 2-byte
(0xC2 <= bytes[0] && bytes[0] <= 0xDF) &&
(0x80 <= bytes[1] && bytes[1] <= 0xBF)
)
) {
bytes += 2;
continue;
}
if( (// excluding overlongs
bytes[0] == 0xE0 &&
(0xA0 <= bytes[1] && bytes[1] <= 0xBF) &&
(0x80 <= bytes[2] && bytes[2] <= 0xBF)
) ||
(// straight 3-byte
((0xE1 <= bytes[0] && bytes[0] <= 0xEC) ||
bytes[0] == 0xEE ||
bytes[0] == 0xEF) &&
(0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
(0x80 <= bytes[2] && bytes[2] <= 0xBF)
) ||
(// excluding surrogates
bytes[0] == 0xED &&
(0x80 <= bytes[1] && bytes[1] <= 0x9F) &&
(0x80 <= bytes[2] && bytes[2] <= 0xBF)
)
) {
bytes += 3;
continue;
}
if( (// planes 1-3
bytes[0] == 0xF0 &&
(0x90 <= bytes[1] && bytes[1] <= 0xBF) &&
(0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
(0x80 <= bytes[3] && bytes[3] <= 0xBF)
) ||
(// planes 4-15
(0xF1 <= bytes[0] && bytes[0] <= 0xF3) &&
(0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
(0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
(0x80 <= bytes[3] && bytes[3] <= 0xBF)
) ||
(// plane 16
bytes[0] == 0xF4 &&
(0x80 <= bytes[1] && bytes[1] <= 0x8F) &&
(0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
(0x80 <= bytes[3] && bytes[3] <= 0xBF)
)
) {
bytes += 4;
continue;
}
return 0;
}
return 1;
}
Please note that this is a faithful translation of the regular expression recommended by W3C for form validation, which does indeed reject some valid UTF-8 sequences (in particular those containing ASCII control characters).
Also, even after fixing this by making the change mentioned in the comment, it still assumes zero-termination, which prevents embedding NUL characters, although it should technically be legal.
When I dabbled in creating my own string library, I went with modified UTF-8 (ie encoding NUL as an overlong two-byte sequence) - feel free to use this header as a template for providing a validation routine which doesn't suffer from the above shortcomings.
这篇关于如何检测用标准C UTF-8?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!