如何读取utf16文本文件在golang中的字符串? [英] How to read utf16 text file to string in golang?

查看:899
本文介绍了如何读取utf16文本文件在golang中的字符串?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我可以将文件读取到字节数组,

但是当我将它转换为字符串时,

它会把 utf16 字节当作 ascii 来处理。



如何正确转换它?

 包主







$ os $
$ func main(){
//读取整个文件
f,err:= os.Open(test.txt)
if err!= nil {
fmt.Printf(error opening file:%v \ n,err)
os.Exit(1)
}
r:= bufio.NewReader(f)
var s,b,e = r.ReadLine()
if e == nil {
fmt.Println(b)
fmt.Println(s)
fmt.Println(string(s))
}






输出:



false

[255 254 91 0 83 0 99 0 114 0 105 0 112 0 116 0 32 0 73 0 110 0 102 0 111 0 93 0
13 0]

S c r i p t I n f o ]




更新:

在测试了这两个例子之后,我现在明白了确切的问题所在。



在windows中,如果我在行尾添加换行符(CR + LF),CR会被读入到该行中。这是因为readline函数无法正确处理unicode([0D 0A] = ok,[0D 00 0A 00] = not ok)。

如果readline函数可以识别unicode,它应该能理解[0D 00 0A 00]并返回[]uint16而不是[]byte。所以我认为我不应该使用bufio.NewReader,因为它无法读取utf16,而且我没有看到bufio.NewReader.ReadLine可以接受参数作为标志来指示所读取的文本是utf8、utf16le/be还是utf32。Go库中有针对unicode文本的readline函数吗?

解决方案

UTF16、UTF8 和字节顺序标记(Byte Order Mark)由 Unicode 联盟(unicode.org)定义:UTF-16 常见问题解答、UTF-8 常见问题解答以及字节顺序标记(BOM)常见问题解答。

问题4802:bufio:按行读取太麻烦



读取文件中的行太麻烦了。

人们常常因为名字而选用 bufio.Reader.ReadLine,
但它的签名很奇怪,返回 (line []byte, isPrefix bool,
err error),并且用起来需要做很多工作。



ReadSlice和ReadString需要一个分隔符字节,它几乎
总是显而易见且难看的 '\n',并且还可以同时返回
一行数据和EOF





修订版:f685026a2d38

bufio:新的 Scanner 接口

添加一个新的、简单的接口,用于扫描(可能是文本的)数据,
它基于一个名为 Scanner 的新类型。它自己进行内部
缓冲,所以即使不注入
bufio.Reader 也应该足够高效。输入的格式由“split
函数”定义,默认情况下按行分割。







go1.1beta1发布



您可以从通常的位置下载二进制和源代码发布:
https://code.google.com/p/go/downloads/list?q=go1.1beta1







下面是一个使用Unicode规则将UTF16文本文件的各行转换为Go UTF8编码字符串的程序。该代码已经过修改,以利用Go 1.1中新的 bufio.Scanner 接口。

 












$ $ bos
运行时
unicode / utf16
unicode / utf8


// UTF16BytesToString将UTF-16以big或little endian字节顺序编码的字节,
//转换为UTF-8编码的字符串。
func UTF16BytesToString(b [] byte,o binary.ByteOrder)string {
utf:= make([] uint16,(len(b)+(2-1))/ 2)
代表i:= 0; i +(2-1)< LEN(B);如果len(b)/ 2 utf [i / 2] = o.Uint16(b [i:])
}
。 len(utf){
utf [len(utf)-1] = utf8.RuneError
}
返回字符串(utf16.Decode(utf))
}

// UTF-16 endian byte order
const(
unknownEndian = iota
bigEndian
littleEndian


// dropCREndian从endian数据中删除终端\ r。
func dropCREndian(data [] byte,t1,t2 byte)[] byte {
if len(data)> 1 {
if data [len(data)-2] == t1&& data [len(data)-1] == t2 {
return data [0:len(data)-2]
}
}
返回数据
}

// dropCRBE从大端数据中删除终端\ r。
func dropCRBE(data [] byte)[] byte {
return dropCREndian(data,'\x00','\r')
}

// dropCRLE从小端数据中删除终端\ r。
func dropCRLE(data [] byte)[] byte {
return dropCREndian(data,'\r','\x00')
}

// dropCR从数据中删除终端\ r。
func dropCR(data [] byte)([] byte,int){
var endian = unknownEndian
switch ld:= len(data); {
case ld!= len(dropCRLE(data)):
endian = littleEndian
case ld!= len(dropCRBE(data)):
endian = bigEndian
}
返回数据,endian
}

// SplitFunc是一个扫描器的分离函数,它返回
// text的每一行,去掉任何尾随行尾标记。返回的行可能为
//为空。行尾标记是一个可选的回车符,后跟一个强制换行符
//。在正则表达式中,它是`\r?\\\
`。
//即使没有
// newline,也会返回最后一个非空输入行。
func ScanUTF16LinesFunc(byteOrder binary.ByteOrder)(bufio.SplitFunc,func()binary.ByteOrder){

//函数闭包变量
var endian = unknownEndian
switch byteOrder {
case binary.BigEndian:
endian = bigEndian
case binary.LittleEndian:
endian = littleEndian
}
const bom = 0xFEFF
var checkBOM bool = endian == unknownEndian

//扫描器分割函数
splitFunc:= func(data [] byte,atEOF bool)(advance int,token [] byte,err错误){

如果atEOF&& len(data)== 0 {
return 0,nil,nil
}

checkBOM {
checkBOM = false
len(data) > 1 {
switch uint16(bom){
case uint16(data [0])<< 8 | uint16(data [1]):
endian = bigEndian
return 2,nil,nil
case uint16(data [1])<<< 8 | uint16(data [0]):
endian = littleEndian
return 2,nil,nil
}
}
}

//扫描换行符终止的行。
i:= 0
for {
j:= bytes.IndexByte(data [i:],'\\\
')
if j< 0 {
break
}
i + = j
switch e:= i%2; e {
case 1:// UTF-16BE
if endian!= littleEndian {
if i> 1 {
if data [i-1] =='\x00'{
endian = bigEndian
//我们有一个完整的换行符终止的行。
return i + 1,dropCRBE(data [0:i-1]),nil
}
}
}
case 0:// UTF-16LE
if endian!= bigEndian {
if i + 1 < len(data){
i ++
if data [i] =='\x00'{
endian = littleEndian
//我们有一个完整的换行符终止行。
返回i + 1,dropCRLE(data [0:i-1]),nil
}
}
}
}
i ++
}

//如果我们在EOF,我们有一个最终的非终止线。把它返还。
如果atEOF {
// drop CR。
advance = len(data)
switch endian {
case bigEndian:
data = dropCRBE(data)
case littleEndian:
data = dropCRLE )
default:
data,endian = dropCR(data)
}
if endian == unknownEndian {
if runtime.GOOS ==windows{
endian = littleEndian
} else {
endian = bigEndian
}
}
return advance,data,nil
}

//请求更多数据。
return 0,nil,nil
}

// Endian字节顺序函数
orderFunc:= func()(byteOrder binary.ByteOrder){
switch endian {
case bigEndian:
byteOrder = binary.BigEndian
case littleEndian:
byteOrder = binary.LittleEndian
}
return byteOrder
}

return splitFunc,orderFunc
}

func main(){
file,err:= os.Open(utf16.le.txt )
如果err!= nil {
fmt.Println(err)
os.Exit(1)
}
推迟file.Close()
fmt.Println(file.Name())

rdr:= bufio.NewReader(file)
scanner:= bufio.NewScanner(rdr)
var bo binary.ByteOrder //未知,从数据推断
// bo = binary.LittleEndian // windows
splitFunc,orderFunc:= ScanUTF16LinesFunc(bo)
scanner.Split(splitFunc)

for scanner.Scan(){
b:= scan.Bytes()
s:= UTF16BytesToString(b,orderFunc())
fmt.Println(len(s),s)
fmt.Println(len(b),b)
}
fmt.Println(orderFunc())

如果err:= scanner.Err(); err!= nil {
fmt.Println(err)
}
}

输出:

utf16.le.txt
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
0
0 []
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
LittleEndian

utf16.be.txt
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
0
0 []
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
BigEndian


I can read the file to bytes array

but when I convert it to string

it treats the utf16 bytes as ascii

How to convert it correctly?

package main

import ("fmt"
"os"
"bufio"
)

// main opens test.txt, reads its first line with bufio.Reader.ReadLine and
// prints, in order: the isPrefix flag, the raw bytes of the line, and those
// bytes converted directly to a string. Nothing is printed if the read fails.
func main() {
    in, openErr := os.Open("test.txt")
    if openErr != nil {
        fmt.Printf("error opening file: %v\n", openErr)
        os.Exit(1)
    }

    line, isPrefix, readErr := bufio.NewReader(in).ReadLine()
    if readErr != nil {
        return
    }
    fmt.Println(isPrefix)
    fmt.Println(line)
    // string(line) reinterprets the raw bytes; it performs no UTF-16 decoding.
    fmt.Println(string(line))
}


output:

false

[255 254 91 0 83 0 99 0 114 0 105 0 112 0 116 0 32 0 73 0 110 0 102 0 111 0 93 0 13 0]

S c r i p t I n f o ]


Update:

After testing the two examples, I now understand what the exact problem is.

On Windows, if I add a line break (CR+LF) at the end of the line, the CR is included in the line that is read, because the readline function cannot handle unicode correctly ([0D 0A]=ok, [0D 00 0A 00]=not ok).

If the readline function could recognize unicode, it should understand [0D 00 0A 00] and return []uint16 rather than []byte.

So I think I should not use bufio.NewReader, as it is not able to read utf16; I don't see that bufio.Reader.ReadLine accepts a parameter or flag to indicate whether the text being read is utf8, utf16le/be or utf32. Is there any readline function for unicode text in the Go library?

解决方案

UTF16, UTF8, and Byte Order Marks are defined by the Unicode Consortium: UTF-16 FAQ, UTF-8 FAQ, and Byte Order Mark (BOM) FAQ.


Issue 4802: bufio: reading lines is too cumbersome

Reading lines from a file is too cumbersome in Go.

People are often drawn to bufio.Reader.ReadLine because of its name, but it has a weird signature, returning (line []byte, isPrefix bool, err error), and requires a lot of work.

ReadSlice and ReadString require a delimiter byte, which is almost always the obvious and unsightly '\n', and also can return both a line and an EOF


Revision: f685026a2d38

bufio: new Scanner interface

Add a new, simple interface for scanning (probably textual) data, based on a new type called Scanner. It does its own internal buffering, so should be plausibly efficient even without injecting a bufio.Reader. The format of the input is defined by a "split function", by default splitting into lines.


go1.1beta1 released

You can download binary and source distributions from the usual place: https://code.google.com/p/go/downloads/list?q=go1.1beta1


Here's a program which uses the Unicode rules to convert UTF16 text file lines to Go UTF8 encoded strings. The code has been revised to take advantage of the new bufio.Scanner interface in Go 1.1.

package main

import (
    "bufio"
    "bytes"
    "encoding/binary"
    "fmt"
    "os"
    "runtime"
    "unicode/utf16"
    "unicode/utf8"
)

// UTF16BytesToString decodes b, a sequence of UTF-16 code units stored in the
// byte order o, and returns the text as a UTF-8 encoded Go string. If b has an
// odd length, the dangling final byte is replaced by the code unit U+FFFD.
func UTF16BytesToString(b []byte, o binary.ByteOrder) string {
    pairs := len(b) / 2
    units := make([]uint16, 0, pairs+1)
    for k := 0; k < pairs; k++ {
        units = append(units, o.Uint16(b[2*k:]))
    }
    if len(b)%2 != 0 {
        // A lone trailing byte cannot form a code unit.
        units = append(units, utf8.RuneError)
    }
    return string(utf16.Decode(units))
}

// UTF-16 endian byte order states used by the CR-dropping helpers and the
// line split function below.
const (
    unknownEndian = iota // byte order has not been determined
    bigEndian            // big endian byte order
    littleEndian         // little endian byte order
)

// dropCREndian removes the final two bytes of data when they are exactly
// t1 followed by t2 (the two bytes of an encoded \r in some byte order).
// Data that is shorter than two bytes, or that ends differently, is returned
// unchanged.
func dropCREndian(data []byte, t1, t2 byte) []byte {
    n := len(data)
    if n < 2 {
        return data
    }
    if data[n-2] != t1 || data[n-1] != t2 {
        return data
    }
    return data[:n-2]
}

// dropCRBE strips one trailing big-endian UTF-16 carriage return (the byte
// pair 0x00 0x0D) from data; data not ending in that pair is returned as is.
func dropCRBE(data []byte) []byte {
    const hi, lo byte = 0x00, 0x0D // U+000D, high-order byte first
    return dropCREndian(data, hi, lo)
}

// dropCRLE strips one trailing little-endian UTF-16 carriage return (the byte
// pair 0x0D 0x00) from data; data not ending in that pair is returned as is.
func dropCRLE(data []byte) []byte {
    const lo, hi byte = 0x0D, 0x00 // U+000D, low-order byte first
    return dropCREndian(data, lo, hi)
}

// dropCR drops a terminal \r from the data.
func dropCR(data []byte) ([]byte, int) {
    var endian = unknownEndian
    switch ld := len(data); {
    case ld != len(dropCRLE(data)):
        endian = littleEndian
    case ld != len(dropCRBE(data)):
        endian = bigEndian
    }
    return data, endian
}

// ScanUTF16LinesFunc returns a split function for a bufio.Scanner that yields
// each line of UTF-16 encoded text, stripped of any trailing end-of-line
// marker, together with a function reporting the byte order in use. The
// returned line may be empty. The end-of-line marker is one optional carriage
// return followed by one mandatory newline; in regular expression notation it
// is `\r?\n`, with every character stored as a 16-bit code unit. The last
// non-empty line of input is returned even if it has no newline.
//
// byteOrder may be binary.BigEndian or binary.LittleEndian to fix the byte
// order up front. Any other value (including nil) means the order is unknown:
// the split function then consumes a leading byte order mark if there is one,
// and otherwise infers the order from the first line terminator it finds or
// from a trailing carriage return at end of input. If the input ends while
// the order is still unknown, little endian is assumed on Windows and big
// endian elsewhere.
//
// The second return value reports the byte order determined so far, or nil
// while it is still unknown. The two returned functions share state, so they
// must be used as a pair.
func ScanUTF16LinesFunc(byteOrder binary.ByteOrder) (bufio.SplitFunc, func() binary.ByteOrder) {

    // State shared by the two returned closures.
    var endian = unknownEndian
    switch byteOrder {
    case binary.BigEndian:
        endian = bigEndian
    case binary.LittleEndian:
        endian = littleEndian
    }
    const bom = 0xFEFF
    // A byte order mark is only looked for when the caller did not fix the order.
    checkBOM := endian == unknownEndian

    // Scanner split function
    splitFunc := func(data []byte, atEOF bool) (advance int, token []byte, err error) {

        if atEOF && len(data) == 0 {
            return 0, nil, nil
        }

        if checkBOM {
            if len(data) < 2 && !atEOF {
                // A byte order mark is two bytes long; wait for more input
                // rather than giving up on detecting it.
                return 0, nil, nil
            }
            checkBOM = false
            if len(data) > 1 {
                switch uint16(bom) {
                case uint16(data[0])<<8 | uint16(data[1]):
                    endian = bigEndian
                    // Consume the mark without emitting a token.
                    return 2, nil, nil
                case uint16(data[1])<<8 | uint16(data[0]):
                    endian = littleEndian
                    return 2, nil, nil
                }
            }
        }

        // Scan for newline-terminated lines. A '\n' byte only counts as a
        // line terminator when it is paired with a zero byte on the side
        // dictated by the byte order.
        for i := 0; ; i++ {
            j := bytes.IndexByte(data[i:], '\n')
            if j < 0 {
                break
            }
            i += j
            if i%2 == 1 {
                // Odd offset: '\n' is the low byte of a big-endian code unit.
                // i is at least 1 here, so data[i-1] is always valid; this
                // also covers a newline at offset 1 (an empty line).
                if endian != littleEndian && data[i-1] == '\x00' {
                    endian = bigEndian
                    // We have a full newline-terminated line.
                    return i + 1, dropCRBE(data[:i-1]), nil
                }
                continue
            }
            // Even offset: '\n' is the low byte of a little-endian code unit,
            // whose high byte must already be in the buffer to decide.
            if endian != bigEndian && i+1 < len(data) && data[i+1] == '\x00' {
                endian = littleEndian
                // We have a full newline-terminated line.
                return i + 2, dropCRLE(data[:i]), nil
            }
        }

        // If we're at EOF, we have a final, non-terminated line. Return it.
        if atEOF {
            advance = len(data)
            // Drop a trailing CR using whatever byte order is known.
            switch endian {
            case bigEndian:
                data = dropCRBE(data)
            case littleEndian:
                data = dropCRLE(data)
            default:
                data, endian = dropCR(data)
            }
            if endian == unknownEndian {
                // Nothing in the input revealed the order; fall back to a
                // per-platform guess.
                if runtime.GOOS == "windows" {
                    endian = littleEndian
                } else {
                    endian = bigEndian
                }
            }
            return advance, data, nil
        }

        // Request more data.
        return 0, nil, nil
    }

    // Endian byte order function
    orderFunc := func() binary.ByteOrder {
        switch endian {
        case bigEndian:
            return binary.BigEndian
        case littleEndian:
            return binary.LittleEndian
        }
        return nil
    }

    return splitFunc, orderFunc
}

// main opens utf16.le.txt, prints its name, then scans it line by line with
// the UTF-16 split function, printing each line as a UTF-8 string (preceded
// by its byte length) followed by the raw UTF-16 bytes. It finishes by
// printing the byte order reported by the order function and any scanner
// error.
func main() {
    f, err := os.Open("utf16.le.txt")
    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }
    defer f.Close()
    fmt.Println(f.Name())

    // The zero value (nil) leaves the byte order unknown so that it is
    // inferred from the data; assign binary.LittleEndian to force it.
    var order binary.ByteOrder
    split, detected := ScanUTF16LinesFunc(order)

    sc := bufio.NewScanner(bufio.NewReader(f))
    sc.Split(split)
    for sc.Scan() {
        raw := sc.Bytes()
        text := UTF16BytesToString(raw, detected())
        fmt.Println(len(text), text)
        fmt.Println(len(raw), raw)
    }
    fmt.Println(detected())

    if scanErr := sc.Err(); scanErr != nil {
        fmt.Println(scanErr)
    }
}

Output:

utf16.le.txt
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
0 
0 []
15 "Hello, 世界"
22 [34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 0 22 78 76 117 34 0]
LittleEndian

utf16.be.txt
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
0 
0 []
15 "Hello, 世界"
22 [0 34 0 72 0 101 0 108 0 108 0 111 0 44 0 32 78 22 117 76 0 34]
BigEndian

这篇关于如何读取utf16文本文件在golang中的字符串?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆