C 分词器(字段缺失时也会返回空字符串,耶!) [英] C Tokenizer (and it returns empty too when fields are missing. yay!)
问题描述
另请参阅:这是一个适用于 C 的好的 substr() 吗?
strtok()
及其同类函数会跳过空字段,我不知道怎样让它不要跳过,而是在这种情况下返回空字符串。
我见过的大多数分词器都有类似的行为,更别提 sscanf()
了(不过它本来也从未声称能处理空字段)。
我一直在连续写代码,也有些困了,所以把代码贴出来供大家审阅:
的char * SUBSTR(为const char *文本,诠释nStartingPos,诠释nRun)
{
字符* emptyString =的strdup(); / *来吧!这不能不* / 如果(文字== NULL)回报emptyString; INT在textlen = strlen的(文本); --nStartingPos; 如果((nStartingPos℃,)||(nRun&下; = 0)||(在textlen == 0)||(在textlen&下; nStartingPos))返回emptyString; 字符* returnString =(字符*)释放calloc((1 + nRun)的sizeof(字符)); 如果(returnString == NULL)回报emptyString; strncat函数(returnString,(nStartingPos +文字),nRun); / *我们不需要从这时开始了emptyString * /
免费(emptyString);
emptyString = NULL; 返回returnString;
}INT TokenizeC(为const char *文字,字符DELIM,焦炭***输出)
{
如果(!(*输出)= NULL)返回-1; / *我将分配我自己的存储* / INT nCountOfDelimiters = 0;
INT DX = 0;
INT nCountOfElements = 0;
INT在textlen = strlen的(文本); 对于(; DX<在textlen ++ DX)
{
如果((文[DX] == DELIM)及及(DX =(在textlen - 1))!)++ nCountOfDelimiters;
/ *尾随分隔符需要,因为我们*不单独计算/
/ * 1 *始终递增计数/
} / *
我们将尽可能多的数组元素nCountOfDelimiters + 1
标记化一个空字符串应该返回同一个令牌会
是空的(这不是多大多库行为?还是应该返回NULL?)
* / (*输出)=(字符**)的malloc((1 + nCountOfDelimiters)* sizeof的(字符*)); 对于(DX = 0;&DX LT;在textlen; DX ++)
{
INT nStartOfString =(1 + DX); //的printf(\\ n DX =%d个,nStartOfString =%d个[1!],DX,nStartOfString); / *获取定界符之间运行* /
而((DX<在textlen)及和放大器;!(文[DX] = DELIM))DX ++; //的printf(\\ n DX =%d个,nStartOfString =%d个[2!],DX,nStartOfString); (*输出)[nCountOfElements] =(1 + DX - nStartOfString)? SUBSTR(文字,nStartOfString,(1 + DX - nStartOfString))的strdup(); //的printf(\\ n SUBSTR(文字,%D,%D)=>'%s'的[!]nStartOfString,(1 + DX - nStartOfString),(*输出)[nCountOfElements]); 如果(NULL ==(*输出)[nCountOfElements])
{
// Woops!撤消所有
// TODO:如何测试这个场景? 为(; nCountOfElements> = 0; --nCountOfElements)
{
如果((*输出)[nCountOfElements]!= NULL)免费((*输出)[nCountOfElements]);
(*输出)[nCountOfElements] = NULL;
} 返回-2;
} ++ nCountOfElements;
} 返回nCountOfElements; / *返回如果SUCESSFUL令牌的数量* /
}无效reclaim2D(CHAR ***店,无符号整型ITEMCOUNT)
{
对于(INT X = 0;&ITEMCOUNT LT; ITEMCOUNT ++ X)
{
如果((*店)[X]!= NULL)免费((*店)[X]);
(*店)[X] = NULL;
} 如果(!(*店)= NULL)免费((*店));
(*店)= NULL;
}
下面的驱动程序:
INT的main()
{
//尾随 - 方案不给元素的正确的计数
//(关闭1在过去的元素应是空的) 为const char *文字=1-2-3-6-7-8-9-10-11-,DELIM =' - '; // 10元 焦炭**输出= NULL; INT C = TokenizeC(文字,DELIM,&安培;输出); 的printf(\\ n \\ n [*]%D,C); 为(中间体X = 0; X c为C ++ x)的
{
的printf(\\ n [主]'%s'的,输出[X]); //预计:1-2-3-6-7-8-9-10-11-<&空GT;
} reclaim2D(安培;输出,C); 文字=12-3-6-7-8-9-10-11; // 8元 C = TokenizeC(文字,DELIM,&安培;输出); 的printf(\\ n \\ n [*]%D,C); 为(中间体X = 0; X c为C ++ x)的
{
的printf(\\ n [主]'%s'的,输出[X]); //预期:12-3-6-7-8-9-10-11
} reclaim2D(安培;输出,C); 文字=----- 2--4--6-7100000000-8-9-10-11-100000000- // 17元 C = TokenizeC(文字,DELIM,&安培;输出); 的printf(\\ n \\ n [*]%D,C); 为(中间体X = 0; X c为C ++ x)的
{
的printf(\\ n [主]'%s'的,输出[X]);
//预期<&空GT; - <&空GT; - <&空GT; - <&空GT;
// - <&空GT;&-2- LT;空> -4-哌空> -6-7100000000
// -8-9-10-11-100000000-<&空GT;
} reclaim2D(安培;输出,C); 文字=----- 2--4--6-7100000000-8-9-10-11-100000000 // 16元 C = TokenizeC(文字,DELIM,&安培;输出); 的printf(\\ n \\ n [*]%D,C); 为(中间体X = 0; X c为C ++ x)的
{
的printf(\\ n [主]'%s'的,输出[X]);
//预期:其中,空> - <&空GT; - <&空GT; - <&空GT; - <&空GT;
// - 2-<&空GT; -4-哌空> -6-7100000000-8-9-10-11-100000000
} reclaim2D(安培;输出,C); 返回0;
}
是的,你没看错;它目前只支持单个分隔符,当然,我们还有这个差一(off-by-one)错误需要处理。
输出:
块引用>[*] 9
[主]'1'
[主]'2'
[主]'3'
[主]'6'
[主]'7'
[主]'8'
[主]'9'
[主] '10'
[主] '11'[*] 8
[主] '12'
[主]'3'
[主]'6'
[主]'7'
[主]'8'
[主]'9'
[主] '10'
[主] '11'[*] 16
[主要]''
[主要]''
[主要]''
[主要]''
[主要]''
[主]'2'
[主要]''
[主]'4'
[主要]''
[主]'6'
[主]71亿
[主]'8'
[主]'9'
[主] '10'
[主] '11'
[主]'亿'[*] 16
[主要]''
[主要]''
[主要]''
[主要]''
[主要]''
[主]'2'
[主要]''
[主]'4'
[主要]''
[主]'6'
[主]71亿
[主]'8'
[主]'9'
[主] '10'
[主] '11'
[主]'亿'我也使这一个wiki,因为我在网上看到很多类似的请求。
解决方案 在某些系统上,有一个名为 strsep() 的函数。你可以通过 Google 找到它的源代码。例如:http://www.google.com/codesearch/p?hl=zh-TW#XAzRy8oK4zA/libc/string/strsep.c&q=strsep
See also: Is this a good substr() for C?
strtok()
and friends skip over empty fields, and I do not know how to tell them not to skip but rather to return an empty string in such cases. I see similar behavior from most tokenizers, and don't even get me started on
sscanf()
(but then it never said it would work on empty fields to begin with). I have been on a roll and am feeling sleepy as well, so here it goes for review:
/*
 * Copy `len` bytes starting at `src` into a freshly allocated,
 * NUL-terminated string.  Returns NULL if the allocation fails.
 */
static char *dup_range(const char *src, size_t len)
{
    char *copy = malloc(len + 1);

    if (copy == NULL) {
        return NULL;
    }
    if (len > 0) {
        memcpy(copy, src, len);
    }
    copy[len] = '\0';
    return copy;
}

/*
 * Return a newly allocated copy of at most `nRun` characters of `text`,
 * starting at the 1-based position `nStartingPos`.
 *
 * An empty string is returned when `text` is NULL, when `nStartingPos` is
 * smaller than 1 or lies past the end of `text`, or when `nRun` is not
 * positive.  A run that extends past the end of `text` is clamped to the
 * characters that actually exist, so the buffer is never larger than needed.
 *
 * The caller owns the result and must free() it.  NULL is returned only
 * when memory cannot be allocated.
 */
char* substr(const char* text, int nStartingPos, int nRun)
{
    if (text == NULL || nStartingPos < 1 || nRun <= 0) {
        return dup_range("", 0);
    }

    size_t textLen = strlen(text);
    size_t start = (size_t)nStartingPos - 1;   /* convert to 0-based */

    if (start >= textLen) {
        return dup_range("", 0);
    }

    size_t available = textLen - start;
    size_t len = ((size_t)nRun < available) ? (size_t)nRun : available;

    return dup_range(text + start, len);
}

/*
 * Split `text` at every occurrence of the single character `delim`,
 * keeping empty fields (two adjacent delimiters, or a leading delimiter,
 * produce an empty-string token).
 *
 * On success `*output` receives a newly allocated array of newly allocated
 * strings and the number of tokens is returned; release everything with
 * reclaim2D().  The token count is returned as an int and is assumed to fit.
 *
 * Known limitation: a delimiter in the very last position of `text` does
 * not produce a trailing empty token ("1-2-" yields "1" and "2"), and an
 * empty `text` yields zero tokens (the array is still allocated).
 *
 * Returns:
 *   >= 0  number of tokens stored in `*output`
 *   -1    `output` or `text` is NULL, or `*output` is not NULL on entry
 *         (this function always allocates its own storage)
 *   -2    out of memory; nothing is leaked and `*output` is left NULL
 */
int TokenizeC(const char* text, char delim, char ***output)
{
    if (output == NULL || text == NULL || (*output) != NULL) {
        return -1;
    }

    size_t textLen = strlen(text);

    /*
     * One slot per token.  A delimiter in the final position is not counted
     * because it terminates the last token instead of starting a new one.
     * Starting at 1 also guarantees a non-zero allocation size.
     */
    size_t slots = 1;
    for (size_t i = 0; i + 1 < textLen; ++i) {
        if (text[i] == delim) {
            ++slots;
        }
    }

    char **tokens = calloc(slots, sizeof *tokens);
    if (tokens == NULL) {
        return -2;
    }

    int nCountOfElements = 0;
    size_t pos = 0;

    while (pos < textLen) {
        size_t start = pos;

        /* Advance to the next delimiter or to the end of the text. */
        while (pos < textLen && text[pos] != delim) {
            ++pos;
        }

        char *token = dup_range(text + start, pos - start);
        if (token == NULL) {
            /* Undo everything so the caller is left with no partial result. */
            while (nCountOfElements > 0) {
                free(tokens[--nCountOfElements]);
            }
            free(tokens);
            return -2;
        }

        tokens[nCountOfElements++] = token;
        ++pos;   /* step over the delimiter that ended this token */
    }

    *output = tokens;
    return nCountOfElements;   /* number of tokens if successful */
}

/*
 * Free the first `itemCount` strings of the array `*store`, then the array
 * itself, and set `*store` to NULL.  Does nothing when `store` or `*store`
 * is NULL, so it is safe to call after a failed TokenizeC().
 */
void reclaim2D(char ***store, unsigned int itemCount)
{
    if (store == NULL || (*store) == NULL) {
        return;
    }

    for (unsigned int x = 0; x < itemCount; ++x) {
        free((*store)[x]);   /* free(NULL) is a no-op */
        (*store)[x] = NULL;
    }

    free(*store);
    (*store) = NULL;
}
Here's the driver:
/*
 * Driver: tokenizes a few sample strings on '-' and prints the token count
 * followed by every token, one per line, wrapped in single quotes.
 */
int main(void)
{
    const char delim = '-';

    /*
     * Trailing '-' scenarios are not giving the correct count of elements
     * (off by 1 for the last element, which should come back as empty).
     */
    const char *const samples[] = {
        /* 10 elements.  Expected: 1-2-3-6-7-8-9-10-11-<empty> */
        "1-2-3-6-7-8-9-10-11-",
        /* 8 elements.  Expected: 12-3-6-7-8-9-10-11 */
        "12-3-6-7-8-9-10-11",
        /* 17 elements.  Expected: <empty>-<empty>-<empty>-<empty>-<empty>
           -2-<empty>-4-<empty>-6-7100000000-8-9-10-11-100000000-<empty> */
        "-----2--4--6-7100000000-8-9-10-11-100000000-",
        /* 16 elements.  Expected: <empty>-<empty>-<empty>-<empty>-<empty>
           -2-<empty>-4-<empty>-6-7100000000-8-9-10-11-100000000 */
        "-----2--4--6-7100000000-8-9-10-11-100000000",
    };
    const size_t sampleCount = sizeof samples / sizeof samples[0];

    char **output = NULL;

    for (size_t i = 0; i < sampleCount; ++i) {
        int c = TokenizeC(samples[i], delim, &output);

        printf("\n\n[*]%d", c);
        for (int x = 0; x < c; ++x) {
            printf("\n[main]'%s'", output[x]);
        }

        /*
         * A negative c is an error code, not a count; passing it on would
         * turn it into a huge unsigned item count.  Reclaim with zero items
         * instead so any array left behind is still released and `output`
         * is NULL again for the next call.
         */
        reclaim2D(&output, (c < 0) ? 0u : (unsigned int)c);
    }

    return 0;
}
Yes, you noticed correctly: it currently works only with a single delimiter character and, of course, we still have this off-by-one bug to attend to.
Outputs:
[*]9 [main]'1' [main]'2' [main]'3' [main]'6' [main]'7' [main]'8' [main]'9' [main]'10' [main]'11' [*]8 [main]'12' [main]'3' [main]'6' [main]'7' [main]'8' [main]'9' [main]'10' [main]'11' [*]16 [main]'' [main]'' [main]'' [main]'' [main]'' [main]'2' [main]'' [main]'4' [main]'' [main]'6' [main]'7100000000' [main]'8' [main]'9' [main]'10' [main]'11' [main]'100000000' [*]16 [main]'' [main]'' [main]'' [main]'' [main]'' [main]'2' [main]'' [main]'4' [main]'' [main]'6' [main]'7100000000' [main]'8' [main]'9' [main]'10' [main]'11' [main]'100000000'
I am also making this a wiki because I saw many similar requests on the net.
解决方案 On some systems there is a function called strsep(). You can find its source code with Google; for example, http://www.google.com/codesearch/p?hl=zh-TW#XAzRy8oK4zA/libc/string/strsep.c&q=strsep
这篇关于 C 分词器(字段缺失时也会返回空字符串,耶!)的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!