如何读取utf16文本文件在golang中的字符串? [英] How to read utf16 text file to string in golang?
问题描述
我可以将文件读取到字节数组,
但是当我将它转换为字符串时,
它会把 UTF-16 字节当作 ASCII 处理。
如何正确转换它?
package main
import ("fmt"
"os"
"bufio"
)
func main(){
//读取整个文件
f,err:= os.Open(test.txt)
if err!= nil {
fmt.Printf(error opening file:%v \ n,err)
os.Exit(1)
}
r:= bufio.NewReader(f)
var s,b,e = r.ReadLine()
if e == nil {
fmt.Println(b)
fmt.Println(s)
fmt.Println(string(s))
}
输出:
false
[255 254 91 0 83 0 99 0 114 0 105 0 112 0 116 0 32 0 73 0 110 0 102 0 111 0 93 0
13 0]
S cript I nfo]
更新:
经过我的测试编辑这两个例子,我已经详细说明了现在的确切问题。
在 Windows 中,如果我在行尾添加换行符(CR + LF),CR 会被当作该行的一部分读入,因为 ReadLine 函数无法正确处理 UTF-16([0D 0A] = ok,[0D 00 0A 00] = not ok)。
如果 ReadLine 函数可以识别 Unicode,它应该理解 [0D 00 0A 00] 并返回 []uint16 而不是 []byte。所以我认为我不应该使用 bufio.NewReader,因为它不能读取 UTF-16;我也没有看到 bufio.Reader.ReadLine 可以接受参数作为标志来指示所读文本是 utf8、utf16le/be 还是 utf32。Go 库中有针对 Unicode 文本的 readline 函数吗?
UTF16、UTF8 和字节顺序标记(Byte Order Marks)由 Unicode 联盟(unicode.org)定义:UTF-16 常见问题解答、UTF-8 常见问题解答和字节顺序标记(BOM)常见问题解答。
读取文件中的行太麻烦了。
人们经常被引用到bufio.Reader .ReadLine由于其名称
,但它有一个奇怪的签名,返回(line []字节,isPrefix bool,
err错误),并且需要很多工作。
ReadSlice和ReadString需要一个分隔符字节,它几乎
总是显而易见且难看的'\n',并且还可以同时返回
一行和EOF
bufio:新的扫描仪界面
添加一个新的简单的扫描界面(可能是文本)数据,
基于称为扫描仪的新类型。它有自己的内部
缓冲,所以即使没有注入
bufio.Reader也应该是高效的。输入的格式是由split
函数定义的,默认情况下分成几行。
您可以从通常的位置下载二进制和源代码发布:
https://code.google.com/p/go/downloads/list?q=go1.1beta1
下面是一个使用Unicode规则将UTF16文本文件行转换为Go UTF8编码字符串的程序。该代码已被修改以利用Go 1.1中新的 bufio.Scanner
界面。
package main
import (
"bufio"
"bytes"
"encoding/binary"
"fmt"
"os"
"runtime"
"unicode/utf16"
"unicode/utf8"
)
// UTF16BytesToString将UTF-16以big或little endian字节顺序编码的字节,
//转换为UTF-8编码的字符串。
func UTF16BytesToString(b [] byte,o binary.ByteOrder)string {
utf := make([]uint16, (len(b)+(2-1))/2)
for i := 0; i+(2-1) < len(b); i += 2 {
utf[i/2] = o.Uint16(b[i:])
}
if len(b)/2 < len(utf) {
utf[len(utf)-1] = utf8.RuneError
}
return string(utf16.Decode(utf))
}
// UTF-16 endian byte order
const(
unknownEndian = iota
bigEndian
littleEndian
)
// dropCREndian从endian数据中删除终端\ r。
func dropCREndian(data [] byte,t1,t2 byte)[] byte {
if len(data)> 1 {
if data [len(data)-2] == t1&& data [len(data)-1] == t2 {
return data [0:len(data)-2]
}
}
返回数据
}
// dropCRBE从大端数据中删除终端\ r。
func dropCRBE(data [] byte)[] byte {
return dropCREndian(data,'\x00','\r')
}
// dropCRLE从小端数据中删除终端\ r。
func dropCRLE(data [] byte)[] byte {
return dropCREndian(data,'\r','\x00')
}
// dropCR从数据中删除终端\ r。
func dropCR(data [] byte)([] byte,int){
var endian = unknownEndian
switch ld:= len(data); {
case ld!= len(dropCRLE(data)):
endian = littleEndian
case ld!= len(dropCRBE(data)):
endian = bigEndian
}
返回数据,endian
}
// SplitFunc是一个扫描器的分离函数,它返回
// text的每一行,去掉任何尾随行尾标记。返回的行可能为
//为空。行尾标记是一个可选的回车符,后跟一个强制换行符
//。在正则表达式中,它是`\r?\n`。
//即使没有
// newline,也会返回最后一个非空输入行。
func ScanUTF16LinesFunc(byteOrder binary.ByteOrder)(bufio.SplitFunc,func()binary.ByteOrder){
//函数闭包变量
var endian = unknownEndian
switch byteOrder {
case binary.BigEndian:
endian = bigEndian
case binary.LittleEndian:
endian = littleEndian
}
const bom = 0xFEFF
var checkBOM bool = endian == unknownEndian
//扫描器分割函数
splitFunc:= func(data [] byte,atEOF bool)(advance int,token [] byte,err错误){
如果atEOF&& len(data)== 0 {
return 0,nil,nil
}
checkBOM {
checkBOM = false
len(data) > 1 {
switch uint16(bom){
case uint16(data [0])<< 8 | uint16(data [1]):
endian = bigEndian
return 2,nil,nil
case uint16(data [1])<<< 8 | uint16(data [0]):
endian = littleEndian
return 2,nil,nil
}
}
}
//扫描换行符终止的行。
i:= 0
for {
j:= bytes.IndexByte(data [i:],'\\\
')
if j< 0 {
break
}
i + = j
switch e:= i%2; e {
case 1:// UTF-16BE
if endian!= littleEndian {
if i> 1 {
if data [i-1] =='\x00'{
endian = bigEndian
//我们有一个完整的换行符终止的行。
return i + 1,dropCRBE(data [0:i-1]),nil
}
}
}
case 0:// UTF-16LE
if endian!= bigEndian {
if i + 1 < len(data){
i ++
if data [i] =='\x00'{
endian = littleEndian
//我们有一个完整的换行符终止行。
返回i + 1,dropCRLE(data [0:i-1]),nil
}
}
}
}
i ++
}
//如果我们在EOF,我们有一个最终的非终止线。把它返还。
如果atEOF {
// drop CR。
advance = len(data)
switch endian {
case bigEndian:
data = dropCRBE(data)
case littleEndian:
data = dropCRLE )
default:
data,endian = dropCR(data)
}
if endian == unknownEndian {
if runtime.GOOS ==windows{
endian = littleEndian
} else {
endian = bigEndian
}
}
return advance,data,nil
}
//请求更多数据。
return 0,nil,nil
}
// Endian字节顺序函数
orderFunc:= func()(byteOrder binary.ByteOrder){
switch endian {
case bigEndian:
byteOrder = binary.BigEndian
case littleEndian:
byteOrder = binary.LittleEndian
}
return byteOrder
}
return splitFunc,orderFunc
}
func main(){
file,err:= os.Open(utf16.le.txt )
如果err!= nil {
fmt.Println(err)
os.Exit(1)
}
推迟file.Close()
fmt.Println(file.Name())
rdr:= bufio.NewReader(file)
scanner:= bufio.NewScanner(rdr)
var bo binary.ByteOrder //未知,从数据推断
// bo = binary.LittleEndian // windows
splitFunc,orderFunc:= ScanUTF16LinesFunc(bo)
scanner.Split(splitFunc)
for scanner.Scan(){
b:= scan.Bytes()
s:= UTF16BytesToString(b,orderFunc())
fmt.Println(len(s),s)
fmt.Println(len(b),b)
}
fmt.Println(orderFunc())
如果err:= scanner.Err(); err!= nil {
fmt.Println(err)
}
}
输出:
utf16.le.txt
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
0
0 []
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
LittleEndian
utf16.be.txt
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
0
0 []
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
BigEndian
I can read the file into a byte array,
but when I convert it to a string,
it treats the UTF-16 bytes as ASCII.
How do I convert it correctly?
package main
import ("fmt"
"os"
"bufio"
)
// main opens test.txt, reads its first line as raw bytes with
// bufio.Reader.ReadLine, and prints the isPrefix flag, the raw bytes, and the
// bytes converted directly to a string.
func main() {
	// Open the file; only its first line is read below.
	f, err := os.Open("test.txt")
	if err != nil {
		fmt.Printf("error opening file: %v\n", err)
		os.Exit(1)
	}
	// Release the file handle when main returns.
	defer f.Close()
	r := bufio.NewReader(f)
	// ReadLine returns (line, isPrefix, err): s is the line, b is isPrefix.
	var s, b, e = r.ReadLine()
	if e == nil {
		fmt.Println(b)
		fmt.Println(s)
		// string(s) performs no UTF-16 decoding; the bytes are used as-is.
		fmt.Println(string(s))
	}
}
output:
false
[255 254 91 0 83 0 99 0 114 0 105 0 112 0 116 0 32 0 73 0 110 0 102 0 111 0 93 0 13 0]
S c r i p t I n f o ]
Update:
After testing the two examples, I now understand exactly what the problem is.
In Windows, if I add a line break (CR+LF) at the end of the line, the CR is read as part of the line, because the ReadLine function cannot handle UTF-16 correctly ([0D 0A] = ok, [0D 00 0A 00] = not ok).
If the ReadLine function could recognize Unicode, it should understand [0D 00 0A 00] and return []uint16 rather than []byte.
So I think I should not use bufio.NewReader, as it is not able to read UTF-16; I don't see any way for bufio.Reader.ReadLine to accept a parameter as a flag indicating whether the text being read is utf8, utf16le/be or utf32. Is there any readline function for Unicode text in the Go library?
UTF16, UTF8, and Byte Order Marks are defined by the Unicode Consortium: UTF-16 FAQ, UTF-8 FAQ, and Byte Order Mark (BOM) FAQ.
Issue 4802: bufio: reading lines is too cumbersome
Reading lines from a file is too cumbersome in Go.
People are often drawn to bufio.Reader.ReadLine because of its name, but it has a weird signature, returning (line []byte, isPrefix bool, err error), and requires a lot of work.
ReadSlice and ReadString require a delimiter byte, which is almost always the obvious and unsightly '\n', and also can return both a line and an EOF
bufio: new Scanner interface
Add a new, simple interface for scanning (probably textual) data, based on a new type called Scanner. It does its own internal buffering, so should be plausibly efficient even without injecting a bufio.Reader. The format of the input is defined by a "split function", by default splitting into lines.
You can download binary and source distributions from the usual place: https://code.google.com/p/go/downloads/list?q=go1.1beta1
Here's a program which uses the Unicode rules to convert UTF16 text file lines to Go UTF8 encoded strings. The code has been revised to take advantage of the new bufio.Scanner
interface in Go 1.1.
package main
import (
"bufio"
"bytes"
"encoding/binary"
"fmt"
"os"
"runtime"
"unicode/utf16"
"unicode/utf8"
)
// UTF16BytesToString decodes b, a sequence of UTF-16 code units stored in the
// byte order o, and returns the text as a UTF-8 encoded Go string.
//
// Bytes are consumed two at a time. If b has an odd length, the dangling final
// byte cannot form a code unit and is replaced by utf8.RuneError (U+FFFD).
// o is only consulted when b holds at least one complete two-byte unit.
func UTF16BytesToString(b []byte, o binary.ByteOrder) string {
	whole := len(b) / 2
	units := make([]uint16, 0, whole+len(b)%2)
	for k := 0; k < whole; k++ {
		units = append(units, o.Uint16(b[2*k:2*k+2]))
	}
	if len(b)%2 != 0 {
		// Odd trailing byte: stand in the replacement character for it.
		units = append(units, uint16(utf8.RuneError))
	}
	return string(utf16.Decode(units))
}
// Internal markers for the UTF-16 byte order of the data being scanned.
const (
	unknownEndian = 0 // byte order not determined
	bigEndian     = 1 // high-order byte first
	littleEndian  = 2 // low-order byte first
)
// dropCREndian returns data without its final two bytes when those bytes are
// exactly t1 followed by t2; otherwise data is returned unchanged. The result
// shares its backing array with data.
func dropCREndian(data []byte, t1, t2 byte) []byte {
	n := len(data)
	if n < 2 || data[n-2] != t1 || data[n-1] != t2 {
		return data
	}
	return data[:n-2]
}
// dropCRBE removes a trailing carriage return encoded as big-endian UTF-16
// (the bytes 0x00, '\r') from data, if one is present.
func dropCRBE(data []byte) []byte {
	const hi, lo byte = 0x00, '\r'
	return dropCREndian(data, hi, lo)
}
// dropCRLE removes a trailing carriage return encoded as little-endian UTF-16
// (the bytes '\r', 0x00) from data, if one is present.
func dropCRLE(data []byte) []byte {
	const lo, hi byte = '\r', 0x00
	return dropCREndian(data, lo, hi)
}
// dropCR drops a terminal \r from the data.
func dropCR(data []byte) ([]byte, int) {
var endian = unknownEndian
switch ld := len(data); {
case ld != len(dropCRLE(data)):
endian = littleEndian
case ld != len(dropCRBE(data)):
endian = bigEndian
}
return data, endian
}
// ScanUTF16LinesFunc returns a split function for a bufio.Scanner that yields
// each line of UTF-16 encoded text, and a function that reports the byte order
// currently in effect.
//
// Each token is the raw UTF-16 bytes of one line, stripped of any trailing
// end-of-line marker. The returned line may be empty. The end-of-line marker
// is one optional carriage return followed by one mandatory newline. In
// regular expression notation, it is `\r?\n`. The last non-empty line of input
// will be returned even if it has no newline.
//
// byteOrder may be binary.BigEndian or binary.LittleEndian to fix the byte
// order up front. Any other value (including nil) means "unknown": the split
// function then consumes a leading byte order mark if there is one, and
// otherwise infers the order from the first newline it recognizes. If the
// order is still unknown at end of input, it falls back to little endian on
// Windows and big endian elsewhere.
//
// The second return value reports the byte order determined so far; it
// returns nil while the order is still unknown.
func ScanUTF16LinesFunc(byteOrder binary.ByteOrder) (bufio.SplitFunc, func() binary.ByteOrder) {
	// State shared by the two returned closures.
	var endian = unknownEndian
	switch byteOrder {
	case binary.BigEndian:
		endian = bigEndian
	case binary.LittleEndian:
		endian = littleEndian
	}
	const bom = 0xFEFF
	// Look for a BOM only when the caller did not specify the byte order.
	checkBOM := endian == unknownEndian

	// Scanner split function
	splitFunc := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		if atEOF && len(data) == 0 {
			return 0, nil, nil
		}
		if checkBOM {
			if len(data) < 2 && !atEOF {
				// A BOM is two bytes; wait for both before deciding, so a
				// short first read does not skip BOM detection for good.
				return 0, nil, nil
			}
			checkBOM = false
			if len(data) > 1 {
				switch uint16(bom) {
				case uint16(data[0])<<8 | uint16(data[1]):
					endian = bigEndian
					return 2, nil, nil
				case uint16(data[1])<<8 | uint16(data[0]):
					endian = littleEndian
					return 2, nil, nil
				}
			}
		}
		// Scan for newline-terminated lines.
		i := 0
		for {
			j := bytes.IndexByte(data[i:], '\n')
			if j < 0 {
				break
			}
			i += j
			switch i % 2 {
			case 1: // UTF-16BE: the '\n' byte is the low (second) byte.
				// i is odd here, so data[i-1] always exists. Accepting i == 1
				// lets an empty line at the start of data (00 0A) be found.
				if endian != littleEndian && data[i-1] == '\x00' {
					endian = bigEndian
					// We have a full newline-terminated line.
					return i + 1, dropCRBE(data[0 : i-1]), nil
				}
			case 0: // UTF-16LE: the '\n' byte is the low (first) byte.
				if endian != bigEndian && i+1 < len(data) {
					// Step onto the high byte of this code unit.
					i++
					if data[i] == '\x00' {
						endian = littleEndian
						// We have a full newline-terminated line.
						return i + 1, dropCRLE(data[0 : i-1]), nil
					}
				}
			}
			i++
		}
		// If we're at EOF, we have a final, non-terminated line. Return it.
		if atEOF {
			// drop CR.
			advance = len(data)
			switch endian {
			case bigEndian:
				data = dropCRBE(data)
			case littleEndian:
				data = dropCRLE(data)
			default:
				data, endian = dropCR(data)
			}
			if endian == unknownEndian {
				// Nothing in the input revealed the byte order; fall back to
				// a per-platform default.
				if runtime.GOOS == "windows" {
					endian = littleEndian
				} else {
					endian = bigEndian
				}
			}
			return advance, data, nil
		}
		// Request more data.
		return 0, nil, nil
	}

	// Endian byte order function
	orderFunc := func() (byteOrder binary.ByteOrder) {
		switch endian {
		case bigEndian:
			byteOrder = binary.BigEndian
		case littleEndian:
			byteOrder = binary.LittleEndian
		}
		return byteOrder
	}
	return splitFunc, orderFunc
}
// main scans utf16.le.txt line by line. For every line it prints the decoded
// string with its length, then the raw token bytes with their length. After
// the scan it prints the byte order that was in effect and any scanner error.
func main() {
	f, err := os.Open("utf16.le.txt")
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	defer f.Close()
	fmt.Println(f.Name())

	// Left nil so that the byte order is inferred from the data.
	var order binary.ByteOrder
	// order = binary.LittleEndian // windows
	split, detected := ScanUTF16LinesFunc(order)

	sc := bufio.NewScanner(bufio.NewReader(f))
	sc.Split(split)
	for sc.Scan() {
		raw := sc.Bytes()
		text := UTF16BytesToString(raw, detected())
		fmt.Println(len(text), text)
		fmt.Println(len(raw), raw)
	}
	fmt.Println(detected())
	if scanErr := sc.Err(); scanErr != nil {
		fmt.Println(scanErr)
	}
}
Output:
utf16.le.txt
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
0
0 []
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
LittleEndian
utf16.be.txt
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
0
0 []
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
BigEndian
这篇关于如何读取utf16文本文件在golang中的字符串?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!