Ç标记生成器(它返回空字段缺少太多时,耶!) [英] C Tokenizer (and it returns empty too when fields are missing. yay!)

查看:162
本文介绍了Ç标记生成器(它返回空字段缺少太多时,耶!)的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

另请参阅:这是一个很好的SUBSTR()对于C?


的strtok()和朋友跳过空场,我不知道该怎么告诉它不要跳过,而是在这种情况下返回空。

这是最断词我可以看到,甚至不让我开始对 sscanf的类似行为()(但它从来没有表示将在空下地干活开头)。

我一直在滚,感觉昏昏欲睡一样,所以这里去审核:

 的char * SUBSTR(为const char *文本,诠释nStartingPos,诠释nRun)
{
    字符* emptyString =的strdup(); / *来吧!这不能不* /    如果(文字== NULL)回报emptyString;    INT在textlen = strlen的(文本);    --nStartingPos;    如果((nStartingPos℃,)||(nRun&下; = 0)||(在textlen == 0)||(在textlen&下; nStartingPos))返回emptyString;    字符* returnString =(字符*)释放calloc((1 + nRun)的sizeof(字符));    如果(returnString == NULL)回报emptyString;    strncat函数(returnString,(nStartingPos +文字),nRun);    / *我们不需要从这时开始了emptyString * /
    免费(emptyString);
    emptyString = NULL;    返回returnString;
}INT TokenizeC(为const char *文字,字符DELIM,焦炭***输出)
{
    如果(!(*输出)= NULL)返回-1; / *我将分配我自己的存储* /    INT nCountOfDelimiters = 0;
    INT DX = 0;
    INT nCountOfElements = 0;
    INT在textlen = strlen的(文本);    对于(; DX<在textlen ++ DX)
    {
        如果((文[DX] == DELIM)及及(DX =(在textlen - 1))!)++ nCountOfDelimiters;
        / *尾随分隔符需要,因为我们*不单独计算/
        / * 1 *始终递增计数/
    }    / *
    我们将尽可能多的数组元素nCountOfDelimiters + 1
    标记化一个空字符串应该返回同一个令牌会
    是空的(这不是多大多库行为?还是应该返回NULL?)
    * /    (*输出)=(字符**)的malloc((1 + nCountOfDelimiters)* sizeof的(字符*));    对于(DX = 0;&DX LT;在textlen; DX ++)
    {
    INT nStartOfString =(1 + DX);    //的printf(\\ n DX =%d个,nStartOfString =%d个[1!],DX,nStartOfString);    / *获取定界符之间运行* /
    而((DX<在textlen)及和放大器;!(文[DX] = DELIM))DX ++;    //的printf(\\ n DX =%d个,nStartOfString =%d个[2!],DX,nStartOfString);    (*输出)[nCountOfElements] =(1 + DX - nStartOfString)? SUBSTR(文字,nStartOfString,(1 + DX - nStartOfString))的strdup();    //的printf(\\ n SUBSTR(文字,%D,%D)=>'%s'的[!]nStartOfString,(1 + DX - nStartOfString),(*输出)[nCountOfElements]);    如果(NULL ==(*输出)[nCountOfElements])
    {
    // Woops!撤消所有
    // TODO:如何测试这个场景?    为(; nCountOfElements> = 0; --nCountOfElements)
    {
    如果((*输出)[nCountOfElements]!= NULL)免费((*输出)[nCountOfElements]);
    (*输出)[nCountOfElements] = NULL;
    }    返回-2;
    }    ++ nCountOfElements;
    }    返回nCountOfElements; / *返回如果SUCESSFUL令牌的数量* /
}无效reclaim2D(CHAR ***店,无符号整型ITEMCOUNT)
{
    对于(INT X = 0;&ITEMCOUNT LT; ITEMCOUNT ++ X)
    {
        如果((*店)[X]!= NULL)免费((*店)[X]);
        (*店)[X] = NULL;
    }    如果(!(*店)= NULL)免费((*店));
    (*店)= NULL;
}

下面的驱动程序:

  INT的main()
{
    //尾随 - 方案不给元素的正确的计数
    //(关闭1在过去的元素应是空的)    为const char *文字=1-2-3-6-7-8-9-10-11-,DELIM =' - '; // 10元    焦炭**输出= NULL;    INT C = TokenizeC(文字,DELIM,&安培;输出);    的printf(\\ n \\ n [*]%D,C);    为(中间体X = 0; X c为C ++ x)的
    {
        的printf(\\ n [主]'%s'的,输出[X]); //预计:1-2-3-6-7-8-9-10-11-<&空GT;
    }    reclaim2D(安培;输出,C);    文字=12-3-6-7-8-9-10-11; // 8元    C = TokenizeC(文字,DELIM,&安培;输出);    的printf(\\ n \\ n [*]%D,C);    为(中间体X = 0; X c为C ++ x)的
    {
        的printf(\\ n [主]'%s'的,输出[X]); //预期:12-3-6-7-8-9-10-11
    }    reclaim2D(安培;输出,C);    文字=----- 2--4--6-7100000000-8-9-10-11-100000000- // 17元    C = TokenizeC(文字,DELIM,&安培;输出);    的printf(\\ n \\ n [*]%D,C);    为(中间体X = 0; X c为C ++ x)的
    {
        的printf(\\ n [主]'%s'的,输出[X]);
        //预期<&空GT; - <&空GT; - <&空GT; - <&空GT;
        // - <&空GT;&-2- LT;空> -4-哌空> -6-7100000000
        // -8-9-10-11-100000000-<&空GT;
    }    reclaim2D(安培;输出,C);    文字=----- 2--4--6-7100000000-8-9-10-11-100000000 // 16元    C = TokenizeC(文字,DELIM,&安培;输出);    的printf(\\ n \\ n [*]%D,C);    为(中间体X = 0; X c为C ++ x)的
    {
        的printf(\\ n [主]'%s'的,输出[X]);
        //预期:其中,空> - <&空GT; - <&空GT; - <&空GT; - <&空GT;
        // - 2-<&空GT; -4-哌空> -6-7100000000-8-9-10-11-100000000
    }    reclaim2D(安培;输出,C);    返回0;
}

是的,你注意到它的权利;现在仅适用于一个单一的分隔符,当然,我们有这个偏离一个错误顾不上。


  

输出:


  [*] 9
[主]'1'
[主]'2'
[主]'3'
[主]'6'
[主]'7'
[主]'8'
[主]'9'
[主] '10'
[主] '11'[*] 8
[主] '12'
[主]'3'
[主]'6'
[主]'7'
[主]'8'
[主]'9'
[主] '10'
[主] '11'[*] 16
[主要]''
[主要]''
[主要]''
[主要]''
[主要]''
[主]'2'
[主要]''
[主]'4'
[主要]''
[主]'6'
[主]71亿
[主]'8'
[主]'9'
[主] '10'
[主] '11'
[主]'亿'[*] 16
[主要]''
[主要]''
[主要]''
[主要]''
[主要]''
[主]'2'
[主要]''
[主]'4'
[主要]''
[主]'6'
[主]71亿
[主]'8'
[主]'9'
[主] '10'
[主] '11'
[主]'亿'

我也使这一个wiki,因为我在网上看到很多类似的请求。


解决方案

在一些系统中,有一个函数调用strsep()。你可以找到谷歌的来源$ C ​​$ C。例如,的http://www.google.com/$c$csearch/p?hl=zh-TW#XAzRy8oK4zA/libc/string/strsep.c&q=strsep

See also: Is this a good substr() for C?


strtok() and friends skip over empty fields, and I do not know how to tell it not to skip but rather return empty in such cases.

Similar behavior from most tokenizers I could see, and don't even get me started on sscanf() (but then it never said it would work on empty fields to begin with).

I have been on a roll and feeling sleepy as well, so here it goes for review:

char* substr(const char* text, int nStartingPos, int nRun)
{
    char* emptyString = strdup(""); /* C'mon! This cannot fail */

    if(text == NULL) return emptyString;

    int textLen = strlen(text);

    --nStartingPos;

    if((nStartingPos < 0) || (nRun <= 0) || (textLen == 0) || (textLen < nStartingPos)) return emptyString;

    char* returnString = (char *)calloc((1 + nRun), sizeof(char));

    if(returnString == NULL) return emptyString;

    strncat(returnString, (nStartingPos + text), nRun);

    /* We do not need emptyString anymore from this point onwards */
    free(emptyString);
    emptyString = NULL;

    return returnString;
}

int TokenizeC(const char* text, char delim, char ***output)
{
    if((*output) != NULL) return -1; /* I will allocate my own storage */

    int nCountOfDelimiters = 0;
    int dx = 0;
    int nCountOfElements = 0;
    int textLen = strlen(text);

    for(; dx < textLen; ++dx)
    {
        if((text[dx] == delim) && (dx != (textLen - 1))) ++nCountOfDelimiters;
        /* trailing delimiter need not be counted separately as we are */
        /* incrementing the count always by 1 */
    }

    /*
    We will have as many array elements as nCountOfDelimiters + 1
    Tokenizing an empty string should return a single token that would
    be empty (Is this not how most libraries behave? Or should it return NULL?)
    */

    (*output) = (char **)malloc((1 + nCountOfDelimiters) * sizeof(char *));

    for(dx = 0; dx < textLen; dx++)
    {
    	int nStartOfString = (1 + dx);

    	//printf("\n[! 1]dx = %d, nStartOfString = %d", dx, nStartOfString);

    	/* Get the run between delimiters */
    	while((dx < textLen) && (text[dx] != delim)) dx++;

    	//printf("\n[! 2]dx = %d, nStartOfString = %d", dx, nStartOfString);

    	(*output)[nCountOfElements] = (1 + dx - nStartOfString) ? substr(text, nStartOfString, (1 + dx - nStartOfString)) : strdup("");

    	//printf("\n[!]substr(text, %d, %d) => '%s'", nStartOfString, (1 + dx - nStartOfString), (*output)[nCountOfElements]);

    	if(NULL == (*output)[nCountOfElements])
    	{
    		// Woops! Undo all
    		// TODO: How to test this scenario?!

    		for(; nCountOfElements >= 0; --nCountOfElements)
    		{
    		    if((*output)[nCountOfElements] != NULL) free((*output)[nCountOfElements]);
    		    (*output)[nCountOfElements] = NULL;
    		}

    		return -2; 
    	}

    	++nCountOfElements;
    }

    return nCountOfElements; /* Return the number of tokens if sucessful */
}

void reclaim2D(char ***store, unsigned int itemCount)
{
    for (int x = 0; itemCount < itemCount; ++x)
    {
        if((*store)[x] != NULL) free((*store)[x]);
        (*store)[x] = NULL;
    }

    if((*store) != NULL) free((*store));
    (*store) = NULL;
}

Here's the driver:

int main()
{
    // Trailing '-' scenarios not giving correct count of elements
    // (off by 1 for the last element that should come as empty)

    const char *text = "1-2-3-6-7-8-9-10-11-", delim = '-'; // 10 elements

    char **output = NULL;

    int c = TokenizeC(text, delim, &output);

    printf("\n\n[*]%d", c);

    for (int x = 0; x < c; ++x)
    {
        printf("\n[main]'%s'", output[x]); //Expected : 1-2-3-6-7-8-9-10-11-<empty>
    }

    reclaim2D(&output, c);

    text = "12-3-6-7-8-9-10-11";  // 8 elements

    c = TokenizeC(text, delim, &output);

    printf("\n\n[*]%d", c);

    for(int x = 0; x < c; ++x)
    {
        printf("\n[main]'%s'", output[x]); //Expected : 12-3-6-7-8-9-10-11
    }

    reclaim2D(&output, c);

    text = "-----2--4--6-7100000000-8-9-10-11-100000000-";  // 17 elements

    c = TokenizeC(text, delim, &output);

    printf("\n\n[*]%d", c);

    for(int x = 0; x < c; ++x)
    {
        printf("\n[main]'%s'", output[x]);
        //Expected <empty>-<empty>-<empty>-<empty>
        // -<empty>-2-<empty>-4-<empty>-6-7100000000
        // -8-9-10-11-100000000-<empty>
    }

    reclaim2D(&output, c);

    text = "-----2--4--6-7100000000-8-9-10-11-100000000";  // 16 elements

    c = TokenizeC(text, delim, &output);

    printf("\n\n[*]%d", c);

    for(int x = 0; x < c; ++x)
    {
        printf("\n[main]'%s'", output[x]);
        //Expected : <empty>-<empty>-<empty>-<empty>-<empty>
        //-2-<empty>-4-<empty>-6-7100000000-8-9-10-11-100000000
    }

    reclaim2D(&output, c);

    return 0;
}

Yes, you noticed it right; it works now only for a single delimiter, but of course, we have this off by one bug to attend to.

Outputs:

[*]9
[main]'1'
[main]'2'
[main]'3'
[main]'6'
[main]'7'
[main]'8'
[main]'9'
[main]'10'
[main]'11'

[*]8
[main]'12'
[main]'3'
[main]'6'
[main]'7'
[main]'8'
[main]'9'
[main]'10'
[main]'11'

[*]16
[main]''
[main]''
[main]''
[main]''
[main]''
[main]'2'
[main]''
[main]'4'
[main]''
[main]'6'
[main]'7100000000'
[main]'8'
[main]'9'
[main]'10'
[main]'11'
[main]'100000000'

[*]16
[main]''
[main]''
[main]''
[main]''
[main]''
[main]'2'
[main]''
[main]'4'
[main]''
[main]'6'
[main]'7100000000'
[main]'8'
[main]'9'
[main]'10'
[main]'11'
[main]'100000000'

I am also making this a wiki because I saw many similar requests on the net.

解决方案

On some system, there is a function called strsep(). And you can find its source code by google. For example, http://www.google.com/codesearch/p?hl=zh-TW#XAzRy8oK4zA/libc/string/strsep.c&q=strsep

这篇关于Ç标记生成器(它返回空字段缺少太多时,耶!)的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆