使用多行字段和转义的双引号解析CSV文件 [英] Parsing CSV file with multiline fields and escaped double quotes

查看:194
本文介绍了使用多行字段和转义的双引号解析CSV文件的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

使用多行字段和转义引号解析CSV文件的最佳方法是什么?



例如,此CSV

 第一行的第一个字段,此字段是多行

但是没关系,因为它包含在双重qoutes中,这个
是一个转义的double qoute,但是这个不是
这是第二行的第二个字段,但它不是多行
,因为它不启动
与立即双引号

在Excel中看起来像这样:



>



我是否只需以某种方式保存状态,例如用一个标志表示当前正在读取的字段是否以双引号开头?此外,如果双引号不在字段的开头(例如 , "ABC", 或 ,"item "" item" ""),会发生什么情况,这些是否被视为无效?另外,如果不在带引号的字段内,"" 是否仍被当作转义的双引号?Excel 似乎不会这样处理。



有没有任何其他角落我可能错过了?

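您的示例数据在“double qoute”(原文如此)之后有一个单独的双引号,它结束了以双引号开头的字段,但您仍须继续读取,直到下一个逗号或行尾。这是一个格式不正确的多行字段,而您看到的正是 Excel 对它的处理方式。您可以在(优秀的)《程序设计实践》(The Practice of Programming)一书中找到相关说明,该书包含一个用 C 编写的 CSV 解析库及其 C++ 重新实现,并讨论了这一层面的细节。RFC 4180“逗号分隔值(CSV)文件的通用格式和 MIME 类型”中也给出了 CSV 的标准,您还可以参考维基百科上的相关条目。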






在另一个答案是一些示例代码,仍在测试。它在它的限制内很有弹性。此处将其修改为SSCCE(简短,自我包含,正确的示例)。

  #include< stdbool.h> 
#include< wchar.h>
#include< wctype.h>

extern const wchar_t * nextCsvField(const wchar_t * p,wchar_t sep,bool * newline);

//返回一个指向下一个字段的开始的指针
//或者如果这是CSV中的最后一个字段,则返回零
// p是开始位置字段
// sep是使用的分隔符,即逗号或分号
//换行符说明字段以换行符还是用逗号
const wchar_t * nextCsvField(const wchar_t * p ,wchar_t sep,bool * newline)
{
//解析引用的序列
if(''== p [0]){
p ++;
while (1){
//查找下一个双引号
p = wcschr(p,L'');
//如果我们没有找到它或它是最后一个符号
//那么这是最后一个字段
if(!p ||!p [1])$ ​​b $ b return 0;
//检查,它是一个转义的双引号
if(p [1]!='')
break;
//跳过转义的double -quote
p + = 2;
}
}

//查找下一个换行符或逗号
wchar_t newline_or_sep [4] = L\ n \r;
newline_or_sep [2] = sep;
p = wcspbrk(p,newline_or_sep);

//如果没有换行符或分隔符, 。
if(!p)
return 0;

//检查是否有换行符
* newline =(p [0] =='\ r'|| p [0] =='\\\
');

//处理\r\\\
,否则增加
if(p [0] =='\r'&& p [1] =='\\\
')
p + = 2;
else
p ++;

return b;
}

static void dissect(const wchar_t * line)
{
const wchar_t * start = line;
const wchar_t * next;
bool eol;
wprintf(LInput:%d [%。* ls] \\\
,wcslen(line),wcslen(line)-1,line);
while((next = nextCsvField(start,L',',& eol))!= 0)
{
wprintf(LField:[%。* ls] =%d)\\\
,(next-start-eol),start,eol);
start = next;
}
}

static const wchar_t multiline [] =
L第一行的第一个字段,\此字段为multiline\\\

\\\

但是没关系,因为它包含在双引号中,并且this \\\

是一个转义的\\double quote\ \\不是\\\

\这是第二行的第二个字段,但它不是multiline \\\

,因为它不开始\ n
与立即双引号\\\\

;

int main(void)
{
wchar_t line [1024];

while(fgetws(line,sizeof(line)/ sizeof(line [0]),stdin))
dissect(line);
dissect(multiline);

return 0;
}



示例输出



  $ cat csv.data 
a,bb,c,de,f
1,2,,,,
$ ./wcsv< csv.data
输入:16 [a,bb,c,de,f]
字段:[a,](eol = 0)
字段:[bb,] eol = 0)
字段:[c,](eol = 0)
字段:[e] )
输入:17 [1,2,,,,4]
字段:[1,](eol = 0)
字段: ,](eol = 0)
字段:[,](eol = 0) ](eol = 0)
字段:[4](eol = 1)
输入:296 [第一行的第一个字段,这个字段是多行

因为它包含在双引号中,而这个
是一个转义的双引号,但是这个不是
这是第二行的第二个字段,但它不是多行
因为它不用立即双引号
启动
字段:[第一行的第一个字段,](eol = 0)
字段:[此字段是多行

但是没关系,因为它包含在双引号中,而
是一个转义的双引号,但是这个不是](eol = 1)
字段: 这是第二行的第二个字段,](eol = 0)
字段:[但它不是多行](eol = 1)
字段:[因为它不开始] 1)
字段:[立即双引号](eol = 1)
$

我说在其限度内;它的局限性是什么?



主要是它隔离原始字段,而不是转换字段。因此,它隔离的字段必须被修改以产生真实值,带有包围的双引号被去掉,并且内部双重双引号被单引号替代。将原始字段转换为实际值模仿了 nextCsvField()函数中的很多代码。输入是字段的开始和字段的结束(分隔符)。这里是第二个具有额外函数 csvFieldData()和上面显示的 dissect()函数的SSCCE 。解剖输出的格式略有不同,因此看起来更好:

  #include< stdbool.h> 
#include< wchar.h>
#include< wctype.h>

extern const wchar_t * nextCsvField(const wchar_t * p,wchar_t sep,bool * newline);

//返回一个指向下一个字段的开始的指针
//或者如果这是CSV中的最后一个字段,则返回零
// p是开始位置字段
// sep是使用的分隔符,即逗号或分号
//换行符说明字段以换行符还是用逗号
const wchar_t * nextCsvField(const wchar_t * p ,wchar_t sep,bool * newline)
{
//解析引用的序列
if(''== p [0]){
p ++;
while (1){
//查找下一个双引号
p = wcschr(p,L'');
//如果我们没有找到它或它是最后一个符号
//那么这是最后一个字段
if(!p ||!p [1])$ ​​b $ b return 0;
//检查,它是一个转义的双引号
if(p [1]!='')
break;
//跳过转义的double -quote
p + = 2;
}
}

//查找下一个换行符或逗号
wchar_t newline_or_sep [4] = L\ n \r;
newline_or_sep [2] = sep;
p = wcspbrk(p,newline_or_sep);

//如果没有换行符或分隔符, 。
if(!p)
return 0;

//检查是否有换行符
* newline =(p [0] =='\ r'|| p [0] =='\\\
');

//处理\r\\\
,否则增加
if(p [0] =='\r'&& p [1] =='\\\
')
p + = 2;
else
p ++;

return p;
}

static wchar_t * csvFieldData(const wchar_t * fld_s,const wchar_t * fld_e,wchar_t * buffer,size_t buflen)
{
wchar_t * dst = buffer;
wchar_t * end = buffer + buflen - 1;
const wchar_t * src = fld_s;

if(* src == L'')
{
const wchar_t * p = src + 1;
while(p< fld_e&&& ; dst {
if(p [0] == L''& p + 1< fld_s& p [1] == L' ')
{
* dst ++ = p [0];
p + = 2;
}
else if(p [0] == L'
{
p ++;
break;
}
else
* dst ++ = * p ++;
}
src = p;
}
while(src< fld_e&& dst< end)
* dst ++ = * src ++;
if(dst> = end)
return 0;
* dst = L'\0';
return(buffer);
}

static void dissect(const wchar_t * line)
{
const wchar_t * start = line;
const wchar_t * next;
bool eol;
wprintf(LInput%3zd:[%。* ls] \\\
,wcslen(line),wcslen(line)-1,line);
while((next = nextCsvField(start,L',',& eol))!= 0)
{
wchar_t buffer [1024]
wprintf(LRaw Field:[%。* ls](eol =%d)\\\
,(next - start - eol),start,eol);
if(csvFieldData(start,next-1,buffer,sizeof(buffer)/ sizeof(buffer [0]))!= 0)
wprintf(LField%3zd:[%ls] \\ n,wcslen(buffer),buffer);
start = next;
}
}

static const wchar_t multiline [] =
L第一行的第一个字段,\此字段为multiline\\\

\\\

但是没关系,因为它包含在双引号中,并且this \\\

是一个转义的\\double quote\ \\不是\\\

\这是第二行的第二个字段,但它不是multiline \\\

,因为它不开始\ n
与立即双引号\\\\

;

int main(void)
{
wchar_t line [1024];

while(fgetws(line,sizeof(line)/ sizeof(line [0]),stdin))
dissect(line);
dissect(multiline);

return 0;
}



示例输出



  $ ./wcsv< csv.data 
输入16:[a,bb,c,de,f]
原始字段:[a,](eol = 0)
字段1:[a]
字段2:[bb]
原始字段:[c,](eol = 0)
字段3:[bb, c]
原始字段:[d]e,](eol = 0)
字段4:[de] b $ b字段1:[f]
输入17:[1,2,,,,4]
原始字段:[1,](eol = 0)
字段1:[1]
原始字段:[2,](eol = 0)
字段1:[2] (eol = 0)
字段0:[]
原始字段:[,](eol = 0)
字段0:[]
原始字段: ,](eol = 0)
字段2:[]
原始字段:[4](eol = 1)
字段1:[4]
输入296: [第一行的第一个字段,这个字段是多行的

但是没关系,因为它包含在双引号中,这个
是一个转义的双引号,但这个是不是
这是第二行的第二个字段,但它不是多行
,因为它不启动
立即双引号]
原始字段:[第一个字段的第一行,](eol = 0)
字段24:[第一行的第一个字段]
原始字段:[此字段是多行

但是没关系,因为它并且这个
是一个转义的双引号,但是这个不是](eol = 1)
字段140:[此字段是多行的

但是没关系,因为它包含在双引号中,这个
是一个转义的双引号,但是这个不是]
Raw Field:[这是第二行的第二个字段, ](eol = 0)
字段38:[这是第二行的第二个字段]
原始字段:[但它不是多行](eol = 1)
字段24:但它不是多行]
原始字段:[因为它不启动](eol = 1)
字段28:[因为它不启动]
原始字段:立即双引号](eol = 1)
字段34:[立即双引号]
$

我还没有用 \r\n(或单独的 \r)行结束符进行测试。


What is the best way to parse a CSV file with multiline fields and escaped quotes?

For example, this CSV

First field of first row,"This field is multiline

but that's OK because it's enclosed in double qoutes, and this
is an escaped "" double qoute" but this one "" is not
   "This is second field of second row, but it is not multiline
because it doesn't start 
with an immediate double quote"

looks in Excel like this:

Do I just preserve the state somehow, e.g. keep a flag saying whether the field I'm reading now started with a quote? Also, what happens in cases where there are double quotes that are not at the start of the field, i.e. , "ABC", or ,"item "" item" "" — are those considered invalid? Also, are "" quotes treated as escaped if they are not inside a quoted field? Excel doesn't seem to treat them that way.

Are there any other corner cases I might have missed?

解决方案

Your sample data has a single double quote just after 'double qoute' (sic) which terminates the double quoted start of field, but you have to continue reading until the next comma or the end of line. This is a malformed multi-line field, but you're seeing what Excel does with it. You can find a description of this in the (excellent) book The Practice of Programming which includes a CSV parsing library in C and a reimplementation in C++, and which discusses this level of detail. There's also a standard for CSV in RFC 4180 "Common Format and MIME Type for Comma-Separated Values" and you can also study Wikipedia on the subject.


In the other answer is some sample code, still under test. It is pretty resilient within its limitations. Here it is modified into an SSCCE (Short, Self-Contained, Correct Example).

#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>

extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);

// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
    // Locate the end of the raw field that starts at p.
    //   p       - first character of the field (NUL-terminated text)
    //   sep     - field separator, e.g. L',' or L';'
    //   newline - set to true when the field was ended by '\r' or '\n',
    //             false when it was ended by sep; left untouched when
    //             the function returns 0
    // Returns the first character after the terminator ("\r\n" counts as
    // one terminator), or 0 when no further terminator exists, i.e. this
    // is the last field.
    const wchar_t *cursor = p;

    // A field that opens with a double quote runs to its closing quote;
    // a doubled quote ("") inside it is an escaped quote, not the end.
    if (cursor[0] == L'"') {
        const wchar_t *quote = wcschr(cursor + 1, L'"');
        while (quote != NULL && quote[1] == L'"')
            quote = wcschr(quote + 2, L'"');

        // Unterminated quote, or closing quote is the final character:
        // either way nothing follows this field.
        if (quote == NULL || quote[1] == L'\0')
            return 0;
        cursor = quote;
    }

    // The field ends at the first line break or separator from here on.
    const wchar_t stops[4] = { L'\n', L'\r', sep, L'\0' };
    const wchar_t *stop = wcspbrk(cursor, stops);
    if (stop == NULL)
        return 0;

    const bool at_cr = (stop[0] == L'\r');
    *newline = at_cr || stop[0] == L'\n';

    // Step over the terminator; a CR immediately followed by LF is skipped
    // as a pair.
    return stop + ((at_cr && stop[1] == L'\n') ? 2 : 1);
}

// Print every terminated field of `line` as found by nextCsvField().
// The input line is echoed first with its final character (expected to be
// the trailing newline) dropped.  A field that has no terminator after it
// (nextCsvField() returns 0) is not printed.
static void dissect(const wchar_t *line)
{
    const wchar_t *start = line;
    const wchar_t *next;
    bool     eol;
    const size_t len = wcslen(line);

    // %zu is the conversion for size_t; a '*' precision must be an int,
    // and an empty line must not wrap around to a huge precision.
    wprintf(L"Input: %zu [%.*ls]\n", len, (int)(len > 0 ? len - 1 : 0), line);
    while ((next = nextCsvField(start, L',', &eol)) != 0)
    {
        // Width of the line terminator to hide from the output: 2 for
        // "\r\n", otherwise 1.  A separator is deliberately left visible.
        int term = 1;
        if (eol && next - start >= 2 && next[-1] == L'\n' && next[-2] == L'\r')
            term = 2;
        wprintf(L"Field: [%.*ls] (eol = %d)\n",
                (int)(next - start) - (eol ? term : 0), start, (int)eol);
        start = next;
    }
}

// Built-in sample: two logical rows.  The second field of the first row is
// quoted and spans several lines; the later quote does not open its field,
// so that text is split at every comma and newline.
static const wchar_t multiline[] =
    L"First field of first row,\"This field is multiline\n"
    L"\n"
    L"but that's OK because it's enclosed in double quotes, and this\n"
    L"is an escaped \"\" double quote\" but this one \"\" is not\n"
    L"   \"This is second field of second row, but it is not multiline\n"
    L"   because it doesn't start \n"
    L"   with an immediate double quote\"\n";

// Dissect each line read from standard input, then the built-in
// multi-line sample.
// NOTE(review): stdin is declared by <stdio.h>, which this listing does not
// include directly - confirm it is pulled in on your platform.
int main(void)
{
    enum { LINE_CAPACITY = 1024 };
    wchar_t line[LINE_CAPACITY];

    for (;;) {
        if (fgetws(line, LINE_CAPACITY, stdin) == NULL)
            break;
        dissect(line);
    }

    dissect(multiline);
    return 0;
}

Example output

$ cat csv.data
a,bb, c ,d""e,f
1,"2","",,"""",4
$ ./wcsv < csv.data
Input: 16 [a,bb, c ,d""e,f]
Field: [a,] (eol = 0)
Field: [bb,] (eol = 0)
Field: [ c ,] (eol = 0)
Field: [d""e,] (eol = 0)
Field: [f] (eol = 1)
Input: 17 [1,"2","",,"""",4]
Field: [1,] (eol = 0)
Field: ["2",] (eol = 0)
Field: ["",] (eol = 0)
Field: [,] (eol = 0)
Field: ["""",] (eol = 0)
Field: [4] (eol = 1)
Input: 296 [First field of first row,"This field is multiline

but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not
   "This is second field of second row, but it is not multiline
   because it doesn't start 
   with an immediate double quote"]
Field: [First field of first row,] (eol = 0)
Field: ["This field is multiline

but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not] (eol = 1)
Field: [   "This is second field of second row,] (eol = 0)
Field: [ but it is not multiline] (eol = 1)
Field: [   because it doesn't start ] (eol = 1)
Field: [   with an immediate double quote"] (eol = 1)
$

I said 'within its limitations'; what are its limitations?

Primarily, it isolates the raw field, rather than the converted field. Thus, the field it isolates has to be modified to produce the 'real' value, with the enclosing double quotes stripped off, and each internal doubled double quote (`""`) replaced by a single double quote (`"`). Converting a raw field to the real value mimics quite a lot of the code in the nextCsvField() function. The inputs are the start of the field and the end of the field (the separator character). Here's a second SSCCE with an extra function csvFieldData(), and the dissect() function shown above revised to call it. The format of the dissected output is slightly different so it looks better:

#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>

extern const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline);

// Returns a pointer to the start of the next field,
// or zero if this is the last field in the CSV
// p is the start position of the field
// sep is the separator used, i.e. comma or semicolon
// newline says whether the field ends with a newline or with a comma
const wchar_t *nextCsvField(const wchar_t *p, wchar_t sep, bool *newline)
{
    // Locate the end of the raw field that starts at p.
    //   p       - first character of the field (NUL-terminated text)
    //   sep     - field separator, e.g. L',' or L';'
    //   newline - set to true when the field was ended by '\r' or '\n',
    //             false when it was ended by sep; left untouched when
    //             the function returns 0
    // Returns the first character after the terminator ("\r\n" counts as
    // one terminator), or 0 when no further terminator exists, i.e. this
    // is the last field.
    const wchar_t *cursor = p;

    // A field that opens with a double quote runs to its closing quote;
    // a doubled quote ("") inside it is an escaped quote, not the end.
    if (cursor[0] == L'"') {
        const wchar_t *quote = wcschr(cursor + 1, L'"');
        while (quote != NULL && quote[1] == L'"')
            quote = wcschr(quote + 2, L'"');

        // Unterminated quote, or closing quote is the final character:
        // either way nothing follows this field.
        if (quote == NULL || quote[1] == L'\0')
            return 0;
        cursor = quote;
    }

    // The field ends at the first line break or separator from here on.
    const wchar_t stops[4] = { L'\n', L'\r', sep, L'\0' };
    const wchar_t *stop = wcspbrk(cursor, stops);
    if (stop == NULL)
        return 0;

    const bool at_cr = (stop[0] == L'\r');
    *newline = at_cr || stop[0] == L'\n';

    // Step over the terminator; a CR immediately followed by LF is skipped
    // as a pair.
    return stop + ((at_cr && stop[1] == L'\n') ? 2 : 1);
}

// Convert a raw CSV field into its value.
//   fld_s  - first character of the raw field
//   fld_e  - one past the last character of the raw field (normally the
//            position of the separator or line terminator)
//   buffer - destination for the NUL-terminated value
//   buflen - capacity of buffer in wide characters, terminator included
// If the field starts with a double quote, that quote and its matching
// closing quote are dropped and every doubled quote ("") between them
// becomes one quote.  Anything after the closing quote, and the whole of
// an unquoted field, is copied unchanged.
// Returns buffer on success, or 0 when the value does not fit; in that
// case the buffer contents are unspecified and not NUL-terminated.
static wchar_t *csvFieldData(const wchar_t *fld_s, const wchar_t *fld_e, wchar_t *buffer, size_t buflen)
{
    // Without room for even the terminator there is nothing to write, and
    // computing buffer + buflen - 1 would point before the buffer.
    if (buflen == 0)
        return 0;

    wchar_t *dst = buffer;
    wchar_t *end = buffer + buflen - 1;     // slot reserved for the L'\0'
    const wchar_t *src = fld_s;

    if (src < fld_e && *src == L'"')
    {
        const wchar_t *p = src + 1;
        while (p < fld_e)
        {
            if (p[0] == L'"')
            {
                // The look-ahead is bounded by the END of the field, so a
                // quote that is the field's last character is the closing
                // quote and never half of an escaped pair.
                if (p + 1 < fld_e && p[1] == L'"')
                {
                    if (dst >= end)
                        return 0;
                    *dst++ = L'"';
                    p += 2;
                }
                else
                {
                    p++;            // closing quote: not part of the value
                    break;
                }
            }
            else
            {
                if (dst >= end)
                    return 0;
                *dst++ = *p++;
            }
        }
        src = p;
    }

    // Unquoted text (the whole field, or whatever trails a closing quote).
    while (src < fld_e)
    {
        if (dst >= end)
            return 0;
        *dst++ = *src++;
    }

    // Space is only checked when a character is about to be stored, so a
    // value of exactly buflen - 1 characters is accepted.
    *dst = L'\0';
    return buffer;
}

// Print every terminated field of `line`, first in raw form as found by
// nextCsvField() and then as the value produced by csvFieldData().
// The input line is echoed first with its final character (expected to be
// the trailing newline) dropped.  A field that has no terminator after it
// (nextCsvField() returns 0) is not printed, and a value that does not fit
// the local buffer is silently skipped.
static void dissect(const wchar_t *line)
{
    const wchar_t *start = line;
    const wchar_t *next;
    bool     eol;
    const size_t len = wcslen(line);

    // %zu is the conversion for size_t; a '*' precision must be an int,
    // and an empty line must not wrap around to a huge precision.
    wprintf(L"Input %3zu: [%.*ls]\n", len, (int)(len > 0 ? len - 1 : 0), line);
    while ((next = nextCsvField(start, L',', &eol)) != 0)
    {
        wchar_t buffer[1024];

        // Width of the terminator nextCsvField() stepped over: 2 for
        // "\r\n", otherwise 1 (separator, lone '\n' or lone '\r').
        int term = 1;
        if (eol && next - start >= 2 && next[-1] == L'\n' && next[-2] == L'\r')
            term = 2;

        // The raw display hides a line terminator but keeps a separator.
        wprintf(L"Raw Field: [%.*ls] (eol = %d)\n",
                (int)(next - start) - (eol ? term : 0), start, (int)eol);

        // The field data ends where the terminator begins, so no '\r' of a
        // "\r\n" pair leaks into the value.
        if (csvFieldData(start, next - term, buffer, sizeof(buffer)/sizeof(buffer[0])) != 0)
            wprintf(L"Field %3zu: [%ls]\n", wcslen(buffer), buffer);
        start = next;
    }
}

// Built-in sample: two logical rows.  The second field of the first row is
// quoted and spans several lines; the later quote does not open its field,
// so that text is split at every comma and newline.
static const wchar_t multiline[] =
    L"First field of first row,\"This field is multiline\n"
    L"\n"
    L"but that's OK because it's enclosed in double quotes, and this\n"
    L"is an escaped \"\" double quote\" but this one \"\" is not\n"
    L"   \"This is second field of second row, but it is not multiline\n"
    L"   because it doesn't start \n"
    L"   with an immediate double quote\"\n";

// Dissect each line read from standard input, then the built-in
// multi-line sample.
// NOTE(review): stdin is declared by <stdio.h>, which this listing does not
// include directly - confirm it is pulled in on your platform.
int main(void)
{
    enum { LINE_CAPACITY = 1024 };
    wchar_t line[LINE_CAPACITY];

    for (;;) {
        if (fgetws(line, LINE_CAPACITY, stdin) == NULL)
            break;
        dissect(line);
    }

    dissect(multiline);
    return 0;
}

Example output

$ ./wcsv < csv.data
Input  16: [a,bb, c ,d""e,f]
Raw Field: [a,] (eol = 0)
Field   1: [a]
Raw Field: [bb,] (eol = 0)
Field   2: [bb]
Raw Field: [ c ,] (eol = 0)
Field   3: [ c ]
Raw Field: [d""e,] (eol = 0)
Field   4: [d""e]
Raw Field: [f] (eol = 1)
Field   1: [f]
Input  17: [1,"2","",,"""",4]
Raw Field: [1,] (eol = 0)
Field   1: [1]
Raw Field: ["2",] (eol = 0)
Field   1: [2]
Raw Field: ["",] (eol = 0)
Field   0: []
Raw Field: [,] (eol = 0)
Field   0: []
Raw Field: ["""",] (eol = 0)
Field   2: [""]
Raw Field: [4] (eol = 1)
Field   1: [4]
Input 296: [First field of first row,"This field is multiline

but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not
   "This is second field of second row, but it is not multiline
   because it doesn't start 
   with an immediate double quote"]
Raw Field: [First field of first row,] (eol = 0)
Field  24: [First field of first row]
Raw Field: ["This field is multiline

but that's OK because it's enclosed in double quotes, and this
is an escaped "" double quote" but this one "" is not] (eol = 1)
Field 140: [This field is multiline

but that's OK because it's enclosed in double quotes, and this
is an escaped " double quote" but this one "" is not]
Raw Field: [   "This is second field of second row,] (eol = 0)
Field  38: [   "This is second field of second row]
Raw Field: [ but it is not multiline] (eol = 1)
Field  24: [ but it is not multiline]
Raw Field: [   because it doesn't start ] (eol = 1)
Field  28: [   because it doesn't start ]
Raw Field: [   with an immediate double quote"] (eol = 1)
Field  34: [   with an immediate double quote"]
$ 

I've not tested with \r\n (or plain \r) line endings.

这篇关于使用多行字段和转义的双引号解析CSV文件的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆