编码问题... windows-1255到utf 8? [英] Encoding issues ... windows-1255 to utf 8?

查看:223
本文介绍了编码问题... windows-1255到utf 8?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述


我知道之前已经有人问过从 windows-1255 转换为 utf-8 的编码问题,但我仍然得到不同的结果,无法解决这个问题。

第一个问题是:PHP 的 iconv() 或 mb_convert_encoding() 是否支持 windows-1255?
测试时,它会返回几种不同的输出(我尝试了 //ignore 和 //translate),
但效果一点也不理想。



我查看了 mb_list_encodings() 的输出,其中并不包含 windows-1255……
我又用 windows-1255 编码的输入(从网上抓取)测试了 mb_detect_encoding(),
它也没有返回正确的字符集……

解决方案

你应该可以直接使用 strtr,并传入一个由待转换字符组成的关联数组(数据可从 MSDN 获得,下面已转换为 PHP 数组)。请注意,在此代码中,保留的字节值会被替换为 U+FFFD 替换字符("\xef\xbf\xbd")。

 /**
  * Convert a Windows-1255 (Hebrew) byte string to UTF-8.
  *
  * Bytes below 0x80 are left untouched. Every byte in 0x80-0xFF is replaced
  * by the UTF-8 sequence listed for it in the table below; reserved byte
  * values map to U+FFFD REPLACEMENT CHARACTER ("\xef\xbf\xbd").
  *
  * @param string $str Windows-1255 encoded input.
  * @return string The input with each high byte replaced by its UTF-8 sequence.
  */
 function win1255ToUtf8($str) {
     // Built once on first call and reused afterwards.
     static $tbl = null;
     if (!$tbl) {
         // Keys are the single bytes 0x80..0xFF, values the matching UTF-8 sequences.
         $tbl = array_combine(range("\x80", "\xff"), array(
             "\xe2\x82\xac", "\xef\xbf\xbd", "\xe2\x80\x9a", "\xc6\x92",
             "\xe2\x80\x9e", "\xe2\x80\xa6", "\xe2\x80\xa0", "\xe2\x80\xa1",
             "\xcb\x86", "\xe2\x80\xb0", "\xef\xbf\xbd", "\xe2\x80\xb9",
             "\xef\xbf\xbd", "\xef\xbf\xbd", "\xef\xbf\xbd", "\xef\xbf\xbd",
             "\xef\xbf\xbd", "\xe2\x80\x98", "\xe2\x80\x99", "\xe2\x80\x9c",
             "\xe2\x80\x9d", "\xe2\x80\xa2", "\xe2\x80\x93", "\xe2\x80\x94",
             "\xcb\x9c", "\xe2\x84\xa2", "\xef\xbf\xbd", "\xe2\x80\xba",
             "\xef\xbf\xbd", "\xef\xbf\xbd", "\xef\xbf\xbd", "\xef\xbf\xbd",
             "\xc2\xa0", "\xc2\xa1", "\xc2\xa2", "\xc2\xa3", "\xe2\x82\xaa",
             "\xc2\xa5", "\xc2\xa6", "\xc2\xa7", "\xc2\xa8", "\xc2\xa9",
             "\xc3\x97", "\xc2\xab", "\xc2\xac", "\xc2\xad", "\xc2\xae",
             "\xc2\xaf", "\xc2\xb0", "\xc2\xb1", "\xc2\xb2", "\xc2\xb3",
             "\xc2\xb4", "\xc2\xb5", "\xc2\xb6", "\xc2\xb7", "\xc2\xb8",
             "\xc2\xb9", "\xc3\xb7", "\xc2\xbb", "\xc2\xbc", "\xc2\xbd",
             "\xc2\xbe", "\xc2\xbf", "\xd6\xb0", "\xd6\xb1", "\xd6\xb2",
             "\xd6\xb3", "\xd6\xb4", "\xd6\xb5", "\xd6\xb6", "\xd6\xb7",
             "\xd6\xb8", "\xd6\xb9", "\xef\xbf\xbd", "\xd6\xbb", "\xd6\xbc",
             "\xd6\xbd", "\xd6\xbe", "\xd6\xbf", "\xd7\x80", "\xd7\x81",
             "\xd7\x82", "\xd7\x83", "\xd7\xb0", "\xd7\xb1", "\xd7\xb2",
             "\xd7\xb3", "\xd7\xb4", "\xef\xbf\xbd", "\xef\xbf\xbd",
             "\xef\xbf\xbd", "\xef\xbf\xbd", "\xef\xbf\xbd", "\xef\xbf\xbd",
             "\xef\xbf\xbd", "\xd7\x90", "\xd7\x91", "\xd7\x92", "\xd7\x93",
             "\xd7\x94", "\xd7\x95", "\xd7\x96", "\xd7\x97", "\xd7\x98",
             "\xd7\x99", "\xd7\x9a", "\xd7\x9b", "\xd7\x9c", "\xd7\x9d",
             "\xd7\x9e", "\xd7\x9f", "\xd7\xa0", "\xd7\xa1", "\xd7\xa2",
             "\xd7\xa3", "\xd7\xa4", "\xd7\xa5", "\xd7\xa6", "\xd7\xa7",
             "\xd7\xa8", "\xd7\xa9", "\xd7\xaa", "\xef\xbf\xbd", "\xef\xbf\xbd",
             "\xe2\x80\x8e", "\xe2\x80\x8f", "\xef\xbf\xbd",
         ));
     }
     // With single-byte keys, strtr performs a plain byte-for-sequence substitution.
     return strtr($str, $tbl);
 }

我用此PHP脚本生成了上述代码:

 function win1255ToUtf8($str) {
    static $tbl = null;
    if (!$tbl) {
        $tbl = array_combine(range("\x80", "\xff"), array(
            <?php

        // Everything outside the PHP tags is emitted verbatim; the code below
        // prints the 128 table entries that go between the literal header and
        // footer of the generated function.

        // Render a byte string as a double-quoted PHP literal made of \xNN escapes.
        function encodeString($str) {
            return '"' . preg_replace('/../', '\x$0', bin2hex($str)) . '"';
        }

        // Encode a code point as UTF-8 by packing it as UTF-32LE and converting.
        function codepointToUtf8($n) {
            return mb_convert_encoding(pack('V', $n), 'UTF-8', 'UTF-32LE');
        }

        // The chart lists entries as "XX = U+YYYY" (byte value = code point).
        $text = strip_tags( file_get_contents( 'http://msdn.microsoft.com/en-us/goglobal/cc305148.aspx') );
        preg_match_all('/([0-9A-F]{2}) = U\+([0-9A-F]{4})/', $text, $matches, PREG_SET_ORDER);

        // Start with U+FFFD for all 128 high bytes, then fill in the ones the chart defines.
        $table = array_fill(0, 128, "\xef\xbf\xbd");
        foreach ($matches as $match) {
            $input = hexdec($match[1]) - 128;
            if ($input >= 0) {
                $table[$input] = codepointToUtf8(hexdec($match[2]));
            }
        }

        $buf = '';
        foreach ($table as $from => $to) {
            $buf .= encodeString($to) . ', ';
        }
        // Drop the final space and wrap so continuation lines align with the array body.
        echo wordwrap(substr($buf, 0, -1), 68, "\n            "), "\n";

?>
        ));
    }
    return strtr($str, $tbl);
}


I know that converting from windows-1255 to utf-8 has been asked about before, but I'm still getting different results and I can't solve it.

The first issue is "does PHP's iconv() or mb_convert_encoding() support windows-1255?" While testing, it returns several different outputs (playing with //ignore & //translate), but it's not working well at all.

I was looking at the mb_list_encodings() output and it doesn't include windows-1255... Playing with and testing mb_detect_encoding() on a windows-1255 input (crawled from the net) doesn't return the correct charset either...

解决方案

You should be able to just use strtr with an associative array of characters to convert (the data is available from MSDN, and converted into a PHP array below). Note that in this code, reserved byte values are replaced with the U+FFFD replacement character ("\xef\xbf\xbd").

/**
 * Convert a Windows-1255 (Hebrew) byte string to UTF-8.
 *
 * Bytes below 0x80 are passed through unchanged. Each byte in 0x80-0xFF is
 * replaced by the UTF-8 sequence listed for it below; reserved byte values
 * become U+FFFD REPLACEMENT CHARACTER.
 *
 * @param string $str Windows-1255 encoded input.
 * @return string The input with each high byte replaced by its UTF-8 sequence.
 */
function win1255ToUtf8($str) {
    static $tbl = null;

    if ($tbl === null) {
        // UTF-8 encoding of U+FFFD, used for every reserved byte value.
        $undef = "\xef\xbf\xbd";

        // UTF-8 sequences for bytes 0x80..0xFF in order, four per row.
        $utf8 = array(
            "\xe2\x82\xac", $undef,         "\xe2\x80\x9a", "\xc6\x92",      // 0x80
            "\xe2\x80\x9e", "\xe2\x80\xa6", "\xe2\x80\xa0", "\xe2\x80\xa1",  // 0x84
            "\xcb\x86",     "\xe2\x80\xb0", $undef,         "\xe2\x80\xb9",  // 0x88
            $undef,         $undef,         $undef,         $undef,          // 0x8C
            $undef,         "\xe2\x80\x98", "\xe2\x80\x99", "\xe2\x80\x9c",  // 0x90
            "\xe2\x80\x9d", "\xe2\x80\xa2", "\xe2\x80\x93", "\xe2\x80\x94",  // 0x94
            "\xcb\x9c",     "\xe2\x84\xa2", $undef,         "\xe2\x80\xba",  // 0x98
            $undef,         $undef,         $undef,         $undef,          // 0x9C
            "\xc2\xa0",     "\xc2\xa1",     "\xc2\xa2",     "\xc2\xa3",      // 0xA0
            "\xe2\x82\xaa", "\xc2\xa5",     "\xc2\xa6",     "\xc2\xa7",      // 0xA4
            "\xc2\xa8",     "\xc2\xa9",     "\xc3\x97",     "\xc2\xab",      // 0xA8
            "\xc2\xac",     "\xc2\xad",     "\xc2\xae",     "\xc2\xaf",      // 0xAC
            "\xc2\xb0",     "\xc2\xb1",     "\xc2\xb2",     "\xc2\xb3",      // 0xB0
            "\xc2\xb4",     "\xc2\xb5",     "\xc2\xb6",     "\xc2\xb7",      // 0xB4
            "\xc2\xb8",     "\xc2\xb9",     "\xc3\xb7",     "\xc2\xbb",      // 0xB8
            "\xc2\xbc",     "\xc2\xbd",     "\xc2\xbe",     "\xc2\xbf",      // 0xBC
            "\xd6\xb0",     "\xd6\xb1",     "\xd6\xb2",     "\xd6\xb3",      // 0xC0
            "\xd6\xb4",     "\xd6\xb5",     "\xd6\xb6",     "\xd6\xb7",      // 0xC4
            "\xd6\xb8",     "\xd6\xb9",     $undef,         "\xd6\xbb",      // 0xC8
            "\xd6\xbc",     "\xd6\xbd",     "\xd6\xbe",     "\xd6\xbf",      // 0xCC
            "\xd7\x80",     "\xd7\x81",     "\xd7\x82",     "\xd7\x83",      // 0xD0
            "\xd7\xb0",     "\xd7\xb1",     "\xd7\xb2",     "\xd7\xb3",      // 0xD4
            "\xd7\xb4",     $undef,         $undef,         $undef,          // 0xD8
            $undef,         $undef,         $undef,         $undef,          // 0xDC
            "\xd7\x90",     "\xd7\x91",     "\xd7\x92",     "\xd7\x93",      // 0xE0
            "\xd7\x94",     "\xd7\x95",     "\xd7\x96",     "\xd7\x97",      // 0xE4
            "\xd7\x98",     "\xd7\x99",     "\xd7\x9a",     "\xd7\x9b",      // 0xE8
            "\xd7\x9c",     "\xd7\x9d",     "\xd7\x9e",     "\xd7\x9f",      // 0xEC
            "\xd7\xa0",     "\xd7\xa1",     "\xd7\xa2",     "\xd7\xa3",      // 0xF0
            "\xd7\xa4",     "\xd7\xa5",     "\xd7\xa6",     "\xd7\xa7",      // 0xF4
            "\xd7\xa8",     "\xd7\xa9",     "\xd7\xaa",     $undef,          // 0xF8
            $undef,         "\xe2\x80\x8e", "\xe2\x80\x8f", $undef,          // 0xFC
        );

        // Key each sequence by the single source byte it replaces.
        $tbl = array();
        foreach ($utf8 as $offset => $sequence) {
            $tbl[chr(0x80 + $offset)] = $sequence;
        }
    }

    // With single-byte keys, strtr performs a plain byte-for-sequence substitution.
    return strtr($str, $tbl);
}

I generated the above code with this PHP script:

function win1255ToUtf8($str) {
    static $tbl = null;
    if (!$tbl) {
        $tbl = array_combine(range("\x80", "\xff"), array(
            <?php

        /**
         * Render a byte string as a double-quoted PHP string literal in which
         * every byte is written as a lowercase \xNN escape.
         *
         * @param string $str Raw bytes to encode.
         * @return string PHP source text for the literal, including the quotes.
         */
        function encodeString($str) {
            // bin2hex gives two hex digits per byte; prefix each pair with \x.
            $hex = bin2hex($str);
            $escaped = preg_replace('/../', '\x$0', $hex);

            return '"' . $escaped . '"';
        }

        /**
         * Encode a single Unicode code point as a UTF-8 byte string.
         *
         * @param int $n Code point number.
         * @return string UTF-8 encoding produced by mb_convert_encoding.
         */
        function codepointToUtf8($n) {
            // pack('V') writes the value as 4 little-endian bytes, which is
            // then interpreted as one UTF-32LE code unit.
            $utf32le = pack('V', $n);

            return mb_convert_encoding($utf32le, 'UTF-8', 'UTF-32LE');
        }

        // Fetch the code page chart. A failed download must stop the script:
        // otherwise the loop below would quietly emit a table consisting of
        // 128 replacement characters.
        // NOTE(review): this URL may no longer be live -- confirm before re-running.
        $html = file_get_contents('http://msdn.microsoft.com/en-us/goglobal/cc305148.aspx');
        if ($html === false) {
            throw new RuntimeException('Unable to download the Windows-1255 code page chart.');
        }
        $text = strip_tags($html);

        // Chart entries are matched as "XX = U+YYYY" (byte value = code point).
        $found = preg_match_all('/([0-9A-F]{2}) = U\+([0-9A-F]{4})/', $text, $matches, PREG_SET_ORDER);
        if (!$found) {
            throw new RuntimeException('No "XX = U+YYYY" mappings found in the downloaded chart.');
        }

        // Default all 128 high bytes to U+FFFD, then overwrite the ones the chart defines.
        $table = array_fill(0, 128, "\xef\xbf\xbd");
        foreach ($matches as $match) {
            // Index 0 corresponds to byte 0x80; entries below 0x80 are skipped.
            $input = hexdec($match[1]) - 128;
            if ($input >= 0) {
                $table[$input] = codepointToUtf8(hexdec($match[2]));
            }
        }

        // Same text as appending ", " after every entry and dropping the final
        // space: entries separated by ", " with a trailing comma.
        $buf = implode(', ', array_map('encodeString', $table)) . ',';

        // Wrap so continuation lines align with the array body of the generated function.
        echo wordwrap($buf, 68, "\n            "), "\n";

?>
        ));
    }
    return strtr($str, $tbl);
}

这篇关于编码问题... windows-1255到utf 8?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆