http://www.51099.com/comp/prcj/20101226/369895.html
首先考慮utf-8編碼的判斷
utf-8編碼的判斷格式如下:
1字節 0xxxxxxx
2字節 110xxxxx 10xxxxxx
3字節 1110xxxx 10xxxxxx 10xxxxxx
4字節 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5字節 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6字節 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
這是標準的utf-8編碼格式,所以如果網頁是utf-8網頁,那么必然遵循這個規律
函數實現:
// Tells whether a byte is a UTF-8 continuation byte, i.e. has the bit
// pattern 10xxxxxx.
//
// c: the byte to inspect.
// Returns 1 when the two most significant bits of `c` are binary 10,
// otherwise 0.
int Encoder::is_utf8_special_byte(unsigned char c)
{
    // Keep only the two most significant bits and compare them with 10000000.
    const unsigned top_two_bits = c & 0xC0u;
    return (top_two_bits == 0x80u) ? 1 : 0;
}
int Encoder::is_utf8_code(const string& str)
{
unsigned one_byte = 0X00; //binary 00000000
unsigned two_byte = 0X06; //binary 00000110
unsigned three_byte = 0X0E; //binary 00001110
unsigned four_byte = 0X1E; //binary 00011110
unsigned five_byte = 0X3E; //binary 00111110
unsigned six_byte = 0X7E; //binary 01111110
int utf8_yes = 0;
int utf8_no = 0;
unsigned char k = 0;
unsigned char m = 0;
unsigned char n = 0;
unsigned char p = 0;
unsigned char q = 0;
unsigned char c = 0;
for (uint i=0; i<str.size();) {
c = (unsigned char)str[i];
if (c>>7 == one_byte) {
i++;
continue;
} else if (c>>5 == two_byte) {
k = (unsigned char)str[i+1];
if ( is_utf8_special_byte(k) ) {
utf8_yes++;
i += 2;
continue;
}
} else if (c>>4 == three_byte) {
m = (unsigned char)str[i+1];
n = (unsigned char)str[i+2];
if ( is_utf8_special_byte(m)
&& is_utf8_special_byte(n) ) {
utf8_yes++;
i += 3;
continue;
}
} else if (c>>3 == four_byte) {
k = (unsigned char)str[i+1];
m = (unsigned char)str[i+2];
n = (unsigned char)str[i+3];
if ( is_utf8_special_byte(k)
&& is_utf8_special_byte(m)
&& is_utf8_special_byte(n) ) {
utf8_yes++;
i += 4;
continue;
}
} else if (c>>2 == five_byte) {
unsigned char k = (unsigned char)str[i+1];
unsigned char m = (unsigned char)str[i+2];
unsigned char n = (unsigned char)str[i+3];
unsigned char p = (unsigned char)str[i+4];
if ( is_utf8_special_byte(k)
&& is_utf8_special_byte(m)
&& is_utf8_special_byte(n)
&& is_utf8_special_byte(p) ) {
utf8_yes++;
i += 5;
continue;
}
} else if (c>>1 == six_byte) {
k = (unsigned char)str[i+1];
m = (unsigned char)str[i+2];
n = (unsigned char)str[i+3];
p = (unsigned char)str[i+4];
q = (unsigned char)str[i+5];
if ( is_utf8_special_byte(k)
&& is_utf8_special_byte(m)
&& is_utf8_special_byte(n)
&& is_utf8_special_byte(p)
&& is_utf8_special_byte(q) ) {
utf8_yes++;
i += 6;
continue;
}
}
utf8_no++;
i++;
}
printf("%d %d\n", utf8_yes, utf8_no);
int ret = (100*utf8_yes)/(utf8_yes + utf8_no);
if (ret > 90) {
return 1;
} else {
return 0;
}
}
實現原理:判斷網頁文本中符合utf-8規則的字數和不符合utf-8規則的字數
如果符合的字數超過90%,則判斷為utf-8編碼
其次應該是gb2312編碼的判斷,由于gb2312相對gbk和big5的編碼范圍要小,所以
在gb2312和gbk和big5之間,應該首先判斷該網頁文本是否是gb2312
函數實現:
int Encoder::is_gb2312_code(const string& str)
{
unsigned one_byte = 0X00; //binary 00000000
int gb2312_yes = 0;
int gb2312_no = 0;
unsigned char k = 0;
unsigned char c = 0;
for (uint i=0; i<str.size();) {
c = (unsigned char)str[i];
if (c>>7 == one_byte) {
i++;
continue;
} else if (c >= 0XA1 && c <= 0XF7) {
k = (unsigned char)str[i+1];
if (k >= 0XA1 && k <= 0XFE) {
gb2312_yes++;
i += 2;
continue;
}
}
gb2312_no++;
i += 2;
}
printf("%d %d\n", gb2312_yes, gb2312_no);
int ret = (100*gb2312_yes)/(gb2312_yes+gb2312_no);
if (ret > 90) {
return 1;
} else {
return 0;
}
}
實現原理:統計符合gb2312編碼特征的字數和不符合gb2312編碼特征的字數
如果符合的字數超過90%,則判斷該網頁文本為gb2312
再者應該判斷big5編碼,原因是因為gbk的范圍要比big5的范圍廣
函數實現
int Encoder::is_big5_code(const string& str)
{
unsigned one_byte = 0X00; //binary 00000000
int big5_yes = 0;
int big5_no = 0;
unsigned char k = 0;
unsigned char c = 0;
for (uint i=0; i<str.size();) {
c = (unsigned char)str[i];
if (c>>7 == one_byte) {
i++;
continue;
} else if (c >= 0XA1 && c <= 0XF9) {
k = (unsigned char)str[i+1];
if ( k >= 0X40 && k <= 0X7E
|| k >= 0XA1 && k <= 0XFE) {
big5_yes++;
i += 2;
continue;
}
}
big5_no++;
i += 2;
}
printf("%d %d\n", big5_yes, big5_no);
int ret = (100*big5_yes)/(big5_yes+big5_no);
if (ret > 90) {
return 1;
} else {
return 0;
}
}
實現原理同gb2312
最后是gbk的判斷
函數實現
int Encoder::is_gbk_code(const string& str)
{
unsigned one_byte = 0X00; //binary 00000000
int gbk_yes = 0;
int gbk_no = 0;
unsigned char k = 0;
unsigned char c = 0;
for (uint i=0; i<str.size();) {
c = (unsigned char)str[i];
if (c>>7 == one_byte) {
i++;
continue;
} else if (c >= 0X81 && c <= 0XFE) {
k = (unsigned char)str[i+1];
if (k >= 0X40 && k <= 0XFE) {
gbk_yes++;
i += 2;
continue;
}
}
gbk_no++;
i += 2;
}
printf("%d %d\n", gbk_yes, gbk_no);
int ret = (100*gbk_yes)/(gbk_yes+gbk_no);
if (ret > 90) {
return 1;
} else {
return 0;
}
}
實現原理同gb2312和big5
最后關于gb18030:
好像我暫時未看到有用gb18030做網頁編碼的,所以對于gb18030的編碼判斷現在暫時忽略掉,如果以后遇到gb18030做網頁編碼的,再做進一步考慮。
(#)