一、編碼范圍
1. GBK (GB2312/GB18030)
\x00-\xff GBK雙字節(jié)編碼范圍
\x20-\x7f ASCII
\xa1-\xff 中文
\x80-\xff 中文
2. UTF-8 (Unicode)
\u4e00-\u9fa5 (中文)
\x3130-\x318F (韓文
\xAC00-\xD7A3 (韓文)
\u0800-\u4e00 (日文)
ps: 韓文是大于[\u9fa5]的字符
正則例子:
preg_replace("/([\x80-\xff])/","",$str);
preg_replace("/([u4e00-u9fa5])/","",$str);
二、代碼例子
//判斷是否有中文字符-GBK (JavaScript)
function check_chinese_char(s){
return (s.length != s.replace(/[^\x00-\xff]/g,"**").length);
三、參考文檔
http://www.unicode.org/
http://examples.oreilly.com/cjkvinfo/doc/cjk.inf
http://www.ansell-uebersetzungen.com/gbuni.html
http://www.haiyan.com/steelk/navigator/ref/gbk/gbindex.htm
http://baike.baidu.com/view/40801.htm
http://www.chedong.com/tech/hello_unicode.html
另:
/*
* 中文截取,支持gb2312,gbk,utf-8,big5
*
* @param string $str 要截取的字串
* @param int $start 截取起始位置
* @param int $length 截取長度
* @param string $charset utf-8|gb2312|gbk|big5 編碼
* @param $suffix 是否加尾綴
*/
public function csubstr($str, $start=0, $length, $charset="utf-8", $suffix=true)
{
if(function_exists("mb_substr"))
return mb_substr($str, $start, $length, $charset);
$re['utf-8'] = "/[\x01-\x7f]|[\xc2-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xff][\x80-\xbf]{3}/";
$re['gb2312'] = "/[\x01-\x7f]|[\xb0-\xf7][\xa0-\xfe]/";
$re['gbk'] = "/[\x01-\x7f]|[\x81-\xfe][\x40-\xfe]/";
$re['big5'] = "/[\x01-\x7f]|[\x81-\xfe]([\x40-\x7e]|\xa1-\xfe])/";
preg_match_all($re[$charset], $str, $match);
$slice = join("",array_slice($match[0], $start, $length));
if($suffix) return $slice."…";
return $slice;
}