在C中读取unicode文件时出错 [英] Getting error while reading unicode file in C

查看:418
本文介绍了在C中读取unicode文件时出错的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想使用以下代码读取C(Cygwin / GCC)中的unicode文件:

#include <stdio.h>
#include <stdlib.h>
#include <glib.h>


/*
 * Split a "key=value" line at the first '=' and print both halves.
 *
 * text - NUL-terminated line to parse; NULL is ignored.
 *
 * A missing key or value is printed as an empty string. The vector returned
 * by g_strsplit() is owned by this function and released before returning.
 */
void split_parse(char* text){
    if (text == NULL) {
        return;
    }

    char **res = g_strsplit(text, "=", 2);
    if (res == NULL) {
        return;
    }

    /* g_strsplit() yields an empty vector for "", so res[1] may only be
     * inspected once res[0] is known to be non-NULL. */
    const char *key = res[0] ? res[0] : "";
    const char *value = (res[0] && res[1]) ? res[1] : "";

    printf("Key = %s : ", key);
    printf("Value = %s", value);
    printf("\n");

    g_strfreev(res);
}

int main(int argc,char ** argv)
{
setenv(CYGWIN,nodosfilewarning,1);

GIOChannel * channel;
GError * err = NULL;
int reading = 0;
const gchar * enc;
guchar magic [2] = {0};
gsize bytes_read = 0;

const char * filename =C:\\CONFIG;


channel = g_io_channel_new_file(filename,r,& err);

if(!channel){
g_print(%s,err-> message);
return 1;
}

if(g_io_channel_set_encoding(channel,NULL,& err)!= G_IO_STATUS_NORMAL){
g_print(g_io_channel_set_encoding:%s\\\
,err->信息);
return 1;
}

if(g_io_channel_read_chars(channel,(gchar *)magic,2,& bytes_read,& err)!= G_IO_STATUS_NORMAL){
g_print(g_io_channel_read_chars:% s\\\
,err-> message);
return 1;
}

if(magic [0] == 0xFF&& magic [1] == 0xFE)
{
enc =UTF-16LE ;
}
else if(magic [0] == 0xFE&& magic [1] == 0xFF)
{
enc =UTF-16BE;
}
else
{
enc =UTF-8;
if(g_io_channel_seek_position(channel,0,G_SEEK_CUR,& err)== G_IO_STATUS_ERROR)
{
g_print(g_io_channel_seek:failed\\\
);
return 1;
}
}

if(g_io_channel_set_encoding(channel,enc,& err)!= G_IO_STATUS_NORMAL){
g_print(%s,err->信息);
return 1;
}

reading = 1;
GIOStatus status;
char * str = NULL;
size_t len;

while(阅读){

status = g_io_channel_read_line(channel,& str,& len,NULL,& err);
switch(status){
case G_IO_STATUS_EOF:
reading = 0;
break;
case G_IO_STATUS_NORMAL:
if(len == 0)continue;
split_parse(str);
break;
case G_IO_STATUS_AGAIN:continue;
case G_IO_STATUS_ERROR:
默认值:
// throw error;
reading = 0;
break;
}
}

g_free(str);
g_io_channel_unref(channel);

return(EXIT_SUCCESS);
}

文件(C:\CONFIG)的内容如下:

h-debug="1"
name=ME
ÃÆÿЮ©=2¾1¼

在读取它时,我总是在while循环中的g_io_channel_read_line获取以下错误消息:


0x800474f8 "Invalid byte sequence in conversion input"(转换输入中的字节序列无效)


我做错了什么?如何使用glib在C中读取这样的文件?


编辑:文件的Hexdump

解决方案

您的文件包含3字节的UTF-8 BOM(字节顺序标记):EF BB BF。



您的代码默认为UTF8,但不消耗BOM。

channel, 0, G_SEEK_CUR, &err

应改为

channel, 3, G_SEEK_CUR, &err

此外,我建议扩展 magic 检测代码,读取4个字节并明确地识别BOM。



如果没有找到BOM,您可以假设编码为NULL(我认为这表示二进制);或者抛出错误;或者修复有问题的文本文件;如果您追求严谨,也可以依次尝试所有已知的编码类型。






UTF32BE "\x00\x00\xFE\xFF"
UTF32LE "\xFF\xFE\x00\x00"
UTF8 "\xEF\xBB\xBF"
UTF16BE "\xFE\xFF"
UTF16LE "\xFF\xFE"
二进制使用 NULL


I want to read a unicode file in C (Cygwin/GCC) using the following code:

#include <stdio.h>
#include <stdlib.h>
#include <glib.h>


/*
 * Split a "key=value" line at the first '=' and print both halves.
 *
 * text - NUL-terminated line to parse; NULL is ignored.
 *
 * A missing key or value is printed as an empty string. The vector returned
 * by g_strsplit() is owned by this function and released before returning.
 */
void split_parse(char* text){
    if (text == NULL) {
        return;
    }

    char **res = g_strsplit(text, "=", 2);
    if (res == NULL) {
        return;
    }

    /* g_strsplit() yields an empty vector for "", so res[1] may only be
     * inspected once res[0] is known to be non-NULL. */
    const char *key = res[0] ? res[0] : "";
    const char *value = (res[0] && res[1]) ? res[1] : "";

    printf("Key = %s : ", key);
    printf("Value = %s", value);
    printf("\n");

    g_strfreev(res);
}

int main(int argc, char **argv)
{
    setenv ("CYGWIN", "nodosfilewarning", 1);

    GIOChannel *channel;
    GError *err = NULL;
    int reading = 0;
    const gchar* enc;
    guchar magic[2] = { 0 };
    gsize bytes_read = 0;

    const char* filename = "C:\\CONFIG";


    channel = g_io_channel_new_file (filename, "r", &err);

    if (!channel) {
        g_print("%s", err->message);
        return 1;
    }

    if (g_io_channel_set_encoding(channel, NULL, &err) != G_IO_STATUS_NORMAL) {
        g_print("g_io_channel_set_encoding: %s\n", err->message);
        return 1;
    }

    if (g_io_channel_read_chars(channel, (gchar*) magic, 2, &bytes_read, &err) != G_IO_STATUS_NORMAL) {
        g_print("g_io_channel_read_chars: %s\n", err->message);
        return 1;
    }

    if (magic[0] == 0xFF && magic[1] == 0xFE)
    {
        enc = "UTF-16LE";
    }
    else if (magic[0] == 0xFE && magic[1] == 0xFF)
    {
        enc = "UTF-16BE";
    }
    else
    {
        enc = "UTF-8";
        if (g_io_channel_seek_position(channel, 0, G_SEEK_CUR, &err) == G_IO_STATUS_ERROR)
        {
            g_print("g_io_channel_seek: failed\n");
            return 1;
        }
    }

    if (g_io_channel_set_encoding (channel, enc, &err) != G_IO_STATUS_NORMAL) {
        g_print("%s", err->message);
        return 1;
    }

    reading = 1;
    GIOStatus status;
    char* str = NULL;
    size_t len;

    while(reading){

        status = g_io_channel_read_line(channel, &str, &len, NULL, &err);
        switch(status){
            case G_IO_STATUS_EOF:
                reading = 0;
                break;
            case G_IO_STATUS_NORMAL:
                if(len == 0) continue;
                split_parse(str);
                break;
            case G_IO_STATUS_AGAIN: continue;
            case G_IO_STATUS_ERROR:
            default:
                //throw error;
                reading = 0;
                break;
        }
    }

    g_free(str);
    g_io_channel_unref(channel);

    return(EXIT_SUCCESS);
}

The file (C:\CONFIG) content is as follows:

h-debug="1"
name=ME
ÃÆÿЮ©=2¾1¼

While reading it I am always getting the following error message at "g_io_channel_read_line" inside the while loop:

0x800474f8 "Invalid byte sequence in conversion input"

What am I doing wrong? How to read a file like this in C using glib?

EDIT: Hexdump of the file

解决方案

Your file contains the 3-byte UTF-8 BOM (byte-order mark): EF BB BF.

Your code defaults to UTF8, but does not consume the BOM.

channel, 0, G_SEEK_CUR, &err

should be

channel, 3, G_SEEK_CUR, &err

Further, I would recommend extending your magic code to read 4 bytes and affirmatively discern the BOM.

If you do not find a BOM, you could assume encoding NULL, which I think is binary. Or throw an error, or fix the wayward text file, or, if you are pedantic, sequentially try all known encoding types.


UTF32BE "\x00\x00\xFE\xFF"
UTF32LE "\xFF\xFE\x00\x00"
UTF8 "\xEF\xBB\xBF"
UTF16BE "\xFE\xFF"
UTF16LE "\xFF\xFE"
NULL for binary

这篇关于在C中读取unicode文件时出错的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆