如何在 C 语言中操作文件时提高性能 [英] How to improve the performance while operating with files in C

查看:127
本文介绍了在 C 语言中操作文件时如何提高性能的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我已经在一个包含 41 万行的大型数据集上实现了朴素贝叶斯算法。现在所有记录都能被正确分类,但问题是程序把记录写入相应文件几乎要花一个小时。提高这段 C 代码性能的最佳方法是什么?代码如下,这段代码负责把 41 万条记录写入相应的文件。谢谢。

 计划生育=的fopen(sales_ok_fraud.txt,R);
        而(与fgets(线,80,FP)!= NULL)//读取每一行从文件计算文件的大小。
        {
                令牌= strtok的(行,);
                令牌= strtok的(NULL,);
                令牌= strtok的(NULL,);
                令牌= strtok的(NULL,);
                令牌= strtok的(NULL,);
                令牌= strtok的(NULL,);
                TOKEN1 = strtok的(道理,\\ n);
                的memcpy(myStr中,&放大器; TOKEN1 [0],函数strlen(TOKEN1)-1);
                myStr的[strlen的(TOKEN1)-1] ='\\ 0';        如果(STRCMP(myStr中,OK)== 0)
            COUNTER_OK ++;
        其他
        counter_fraud ++;
    }
    的printf(以下简称没有用OK标签记录%F \\ n,COUNTER_OK);
    的printf(以下简称无欺诈标签记录%F \\ n,counter_fraud);    prblty_ok = COUNTER_OK /(COUNTER_OK + counter_fraud);
    prblty_fraud = counter_fraud /(COUNTER_OK + counter_fraud);
    的printf(OK的记录的概率为%F \\ N,prblty_ok);
    的printf(欺诈记录的概率为%F \\ N,prblty_fraud);
    FCLOSE(FP);    FP = FOPEN(sales_unknwn.txt,R);
    FP2 =的fopen(sales_unknown_ok_classified.txt,一个);
    FP3 =的fopen(sales_unknown_fraud_classified.txt,一个);
    而(与fgets(line1,80,FP)!= NULL)//读取每一行从文件计算文件的大小。
        {
                unknwn_attr1 = strtok的(行1,,);
                unknwn_attr2 = strtok的(NULL,,);
                unknwn_attr3 = strtok的(NULL,,);
                unknwn_attr4 = strtok的(NULL,,);
                unknwn_attr5 = strtok的(NULL,,);        //的printf(%S-%S-%S-%S-%S \\ n,unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);        FP1 = FOPEN(sales_ok_fraud.txt,R);
        而(与fgets(线,80 FP1)!= NULL)//读取每一行从文件计算文件的大小。
            {
            ok_fraud_attr1 = strtok的(行,,);
                    ok_fraud_attr2 = strtok的(NULL,,);
                    ok_fraud_attr3 = strtok的(NULL,,);
                    ok_fraud_attr4 = strtok的(NULL,,);
                    ok_fraud_attr5 = strtok的(NULL,,);
            ok_fraud_attr6 = strtok的(NULL,,);
                    的memcpy(ok_fraud_attr6_str,&放大器; ok_fraud_attr6 [0],函数strlen(ok_fraud_attr6)-2);
                    ok_fraud_attr6_str [strlen的(ok_fraud_attr6)-2] ='\\ 0';
            // ok_fraud_attr6 [strlen的(ok_fraud_attr6)-2] ='\\ 0';
            //的printf(测试ok_fraud_attr6 - %S-%d个\\ N,ok_fraud_attr6_str,strlen的(ok_fraud_attr6_str));
            如果(STRCMP(ok_fraud_attr6_str,OK)== 0)
            {
                如果(STRCMP(unknwn_attr2,ok_fraud_attr2)== 0)
                counter_ok_attr2 ++;                如果(STRCMP(unknwn_attr3,ok_fraud_attr3)== 0)
                counter_ok_attr3 ++;                如果(STRCMP(unknwn_attr4,ok_fraud_attr4)== 0)
                counter_ok_attr4 ++;                如果(STRCMP(unknwn_attr5,ok_fraud_attr5)== 0)
                counter_ok_attr5 ++;
            }                    如果(STRCMP(ok_fraud_attr6_str,欺诈)== 0)
                        {
                如果(STRCMP(unknwn_attr2,ok_fraud_attr2)== 0)
                counter_fraud_attr2 ++;                如果(STRCMP(unknwn_attr3,ok_fraud_attr3)== 0)
                counter_fraud_attr3 ++;                如果(STRCMP(unknwn_attr4,ok_fraud_attr4)== 0)
                counter_fraud_attr4 ++;                如果(STRCMP(unknwn_attr5,ok_fraud_attr5)== 0)
                counter_fraud_attr5 ++;
            }
        }
        FCLOSE(FP1);
        如果(counter_ok_attr2 == 0)
        prblty_attr2_given_ok =(counter_ok_attr2 + arbitrary_value * prblty_ok)/(COUNTER_OK + arbitrary_value);
        其他
        prblty_attr2_given_ok =(counter_ok_attr2)/(COUNTER_OK);        如果(counter_ok_attr3 == 0)
        prblty_attr3_given_ok =(counter_ok_attr3 + arbitrary_value * prblty_ok)/(COUNTER_OK + arbitrary_value);
        其他
                prblty_attr3_given_ok =(counter_ok_attr3)/(COUNTER_OK);        如果(counter_ok_attr4 == 0)
        prblty_attr4_given_ok =(counter_ok_attr4 + arbitrary_value * prblty_ok)/(COUNTER_OK + arbitrary_value);
        其他
                prblty_attr4_given_ok =(counter_ok_attr4)/(COUNTER_OK);        如果(counter_ok_attr5 == 0)
        prblty_attr5_given_ok =(counter_ok_attr5 + arbitrary_value * prblty_ok)/(COUNTER_OK + arbitrary_value);
        其他
                prblty_attr5_given_ok =(counter_ok_attr5)/(COUNTER_OK);        如果(counter_fraud_attr2 == 0)
        prblty_attr2_given_fraud =(counter_fraud_attr2 + arbitrary_value * prblty_fraud)/(counter_fraud + arbitrary_value);
        其他
                prblty_attr2_given_fraud =(counter_fraud_attr2)/(counter_fraud);        如果(counter_fraud_attr3 == 0)
        prblty_attr3_given_fraud =(counter_fraud_attr3 + arbitrary_value * prblty_fraud)/(counter_fraud + arbitrary_value);
        其他
                prblty_attr3_given_fraud =(counter_fraud_attr3)/(counter_fraud);        如果(counter_fraud_attr4 == 0)
        prblty_attr4_given_fraud =(counter_fraud_attr4 + arbitrary_value * prblty_fraud)/(counter_fraud + arbitrary_value);
        其他
                prblty_attr4_given_fraud =(counter_fraud_attr4)/(counter_fraud);        如果(counter_fraud_attr5 == 0)
        prblty_attr5_given_fraud =(counter_fraud_attr5 + arbitrary_value * prblty_fraud)/(counter_fraud + arbitrary_value);
        其他
                prblty_attr5_given_fraud =(counter_fraud_attr5)/(counter_fraud);        total_prblty_ok = prblty_ok * prblty_attr2_given_ok * prblty_attr3_given_ok * prblty_attr4_given_ok * prblty_attr5_given_ok;
        total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;
//的printf(测试为计数OK - %F - %F - %F - %F \\ N,counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
//的printf(测试计数诈骗 - %F - %F - %F - %F \\ N,counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
//的printf(测试属性概率OK - %F - %F - %F - %F \\ N,prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
//的printf(测试属性为FRAUD-%F概率 - %F - %F - %f\
\",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
//输出(最后的概率为%F - %F \\ N,total_prblty_ok,total_prblty_fraud);        如果(total_prblty_ok&GT; total_prblty_fraud)
        {
            fprintf中(FP2,%S%S%S,%S,%S,OK \\ N,unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
        }
        其他
        {
            fprintf(fp3,\"%s,%s,%s,%s,%s,fraud\
\",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
        }
        counter_ok_attr2 = counter_ok_attr3 = counter_ok_attr4 = counter_ok_attr5 = 0;
        counter_fraud_attr2 = counter_fraud_attr3 = counter_fraud_attr4 = counter_fraud_attr5 = 0;
    }    FCLOSE(FP);
        FCLOSE(FP2);
        FCLOSE(FP3);


解决方案

有几件事我一眼就能看出你可以做,下面按我会尝试的先后顺序列出:


  1. 不要再对输出文件反复执行“打开-写入-关闭”。它们的文件名是固定的,数量也有限。在处理开始时把它们全部以合适的模式打开,处理结束后再刷新并关闭。

  2. 有几处逻辑结构可以大幅简化。

  3. 需要大幅减少对 strlen() 的滥用。大多数像样的优化编译器都能检测到源字符串没有变化,并把针对同一个未改动的 char 指针的后续调用优化掉,所以我会把这一项放在最后做(但说实话我仍然会做,因为对同一份数据重复调用 strlen() 是个坏习惯)。

  4. 与提问者交流后补充:你在一遍又一遍地重新解析同一个数据文件(sales_ok_fraud.txt),sales_unknwn.txt 中的每一行数据都会触发一次完整解析。如果 sales_ok_fraud.txt 能放进内存,那么“12GB / 平均行长度”这么多行的解析就是大量不必要的重复工作。只加载一次数据、只计算一次基础统计量,然后在余下的数据处理中复用这些数据和统计量。


逻辑精简

你尤其可以在一个地方省去大量工作,把这段代码:

 如果(STRCMP(unknwn_attr2,ok_fraud_attr2)== 0安培;&安培; STRCMP(ok_fraud_attr6_str,OK)== 0)
        counter_ok_attr2 ++;    如果(STRCMP(unknwn_attr3,ok_fraud_attr3)== 0安培;&安培; STRCMP(ok_fraud_attr6_str,OK)== 0)
        counter_ok_attr3 ++;    如果(STRCMP(unknwn_attr4,ok_fraud_attr4)== 0安培;&安培; STRCMP(ok_fraud_attr6_str,OK)== 0)
        counter_ok_attr4 ++;    如果(STRCMP(unknwn_attr5,ok_fraud_attr5)== 0安培;&安培; STRCMP(ok_fraud_attr6_str,OK)== 0)
        counter_ok_attr5 ++;    如果(STRCMP(unknwn_attr2,ok_fraud_attr2)== 0安培;&安培; STRCMP(ok_fraud_attr6_str,欺诈)== 0)
        counter_fraud_attr2 ++;    如果(STRCMP(unknwn_attr3,ok_fraud_attr3)== 0安培;&安培; STRCMP(ok_fraud_attr6_str,欺诈)== 0)
        counter_fraud_attr3 ++;    如果(STRCMP(unknwn_attr4,ok_fraud_attr4)== 0安培;&安培; STRCMP(ok_fraud_attr6_str,欺诈)== 0)
        counter_fraud_attr4 ++;    如果(STRCMP(unknwn_attr5,ok_fraud_attr5)== 0安培;&安培; STRCMP(ok_fraud_attr6_str,欺诈)== 0)
        counter_fraud_attr5 ++;

改成这样:

 如果(STRCMP(ok_fraud_attr6_str,OK)== 0)
    {
        如果(STRCMP(unknwn_attr2,ok_fraud_attr2)== 0)
            counter_ok_attr2 ++;        如果(STRCMP(unknwn_attr3,ok_fraud_attr3)== 0)
            counter_ok_attr3 ++;        如果(STRCMP(unknwn_attr4,ok_fraud_attr4)== 0)
            counter_ok_attr4 ++;        如果(STRCMP(unknwn_attr5,ok_fraud_attr5)== 0)
            counter_ok_attr5 ++;
    }
    否则,如果(STRCMP(ok_fraud_attr6_str,欺诈)== 0)
    {
        如果(STRCMP(unknwn_attr2,ok_fraud_attr2)== 0)
            counter_fraud_attr2 ++;        如果(STRCMP(unknwn_attr3,ok_fraud_attr3)== 0)
            counter_fraud_attr3 ++;        如果(STRCMP(unknwn_attr4,ok_fraud_attr4)== 0)
            counter_fraud_attr4 ++;        如果(STRCMP(unknwn_attr5,ok_fraud_attr5)== 0)
            counter_fraud_attr5 ++;
    }


预先加载 sales_ok_fraud.txt

下面的代码依赖于 sales_ok_fraud.txt 统计文件的数据格式完好无损,同时在校验该格式时尽可能严格。它分配一块足以容纳整个文件外加一个字符的内存,以便把整个文件内容当作一个以空字符结尾的字符串。然后用与你之前大致相同的算法把该缓冲区切分开。结果是一张表,表中每一项都是固定长度的字符指针数组;在你目前(反复地)打开、解析、使用然后丢弃这些内容的地方,就可以改为遍历这张表。

  //声明六弦指针数组
的typedef的char * OFAttribs [6];//加载由以下格式的表:
//
// STR1,STR2,STR3,STR4,STR5,str6 \\ n
// STR1,STR2,STR3,STR4,STR5,str6 \\ n
// ...
// STR1,STR2,STR3,STR4,STR5,str6
//
//从上面的任何偏差都会导致循环的premature终止
//但将返回任何能够被解析到故障点。
//调用者应该始终`免费()`生成的表和数据
//指针。为size_t init_ok_fraud_data(为const char * FNAME,OFAttribs ** ppTable,焦炭** ppTableData)
{
    如果(!FNAME ||!* FNAME)
        返回0;    //检查文件打开竖起大拇指
    FILE * FP = FOPEN(FNAME,RB);
    如果(!FP)
        返回0;    //分配足够的内存来保存整个文件,再加上一个终止符
    fseek的(FP,0,SEEK_END);
    长LEN = FTELL(FP);
    fseek的(FP,0,SEEK_SET);    //分配足够的RAM整个文件加上终结者
    OFAttribs * PTABLE = NULL;
    为size_t nTableLen = 0;
    字符* pTableData =的malloc((LEN + 1)* sizeof的(炭));
    如果(NULL!= pTableData)
    {
        FREAD(pTableData,LEN,1,FP);
        pTableData [长度] = 0;
    }    //不再需要的文件
    FCLOSE(FP);    //引发第一令牌
    字符*令牌= strtok的(pTableData,);
    而(令牌)
    {
        //读取令牌下一行
        OFAttribs attribs = {NULL};
        对(INT I = 0; I&4;放大器;&放大器;令牌++ⅰ)
        {
            attribs [I] =记号。
            令牌= strtok的(NULL,);
        }        //充满0..3,设置纬度令牌和移动
        如果(attribs [3]&放大器;&放大器;令牌)
        {
            //下一个到最后一个条目集
            attribs [4] =令牌;            //行中输入只能通过换行终止
            令牌= strtok的(NULL,\\ n);
            如果(标记)
            {
                //正确的格式。 6 PARMS,5逗号,一是新线。
                attribs [5] =令牌;
                为size_t SLEN = strlen的(标记);
                如果(SLEN大于0)
                {
                    而(isspace为(令牌[ - SLEN]))
                        令牌[SLEN] = 0;
                }                //使主列表另一个空间。
                OFAttribs * TMP = realloc的(PTABLE,sizeof的(* TMP)*(nTableLen + 1));
                如果(NULL!= TMP)
                {
                    PTABLE = tmp目录;
                    的memcpy(PTABLE + nTableLen ++,attribs,sizeof的(attribs));
                }
                其他
                {//分配失败。
                    的printf(错误扩大OKFraud数据集分配内存);
                    出口(EXIT_FAILURE);
                }
            }
            其他
            { // 不好。
                的printf(无效行格式检测预期OK /欺诈\\\\ñ。);
                打破;
            }            //新行的下一个标记
            令牌= strtok的(NULL,);
        }
    }    //设置输出变量
    * ppTable = PTABLE;
    * ppTableData = pTableData;
    返回nTableLen;
}


整合在一起

把上述所有改动整合进你的代码后,效果如下:

  //加载ok_fraud表一次。
OFAttribs * okfr = NULL;
字符* okfr_data = NULL;
为size_t okfr_len = init_ok_fraud_data(sales_ok_fraud.txt,&放大器; okfr,&放大器; okfr_data);//步行表确定OK和欺诈状态的概率。
//注意:这的确应该做的装载机的一部分。
用于(为size_t我= 0; I&LT; okfr_len ++ I)
{
    如果(0 ==的strcmp(OK,okfr [I] [5]))
        ++ COUNTER_OK;
    其他
        ++ counter_fraud;
}的printf(以下简称没有用OK标签记录%F \\ n,COUNTER_OK);
的printf(以下简称无欺诈标签记录%F \\ n,counter_fraud);//计算probabilites用于确定和欺诈状态
prblty_ok =(浮点)COUNTER_OK /(浮点)(okfr_len);
prblty_fraud =(浮点)counter_fraud /(浮点)(okfr_len);
的printf(OK的记录的概率为%F \\ N,prblty_ok);
的printf(欺诈记录的概率为%F \\ N,prblty_fraud);FP = FOPEN(sales_unknwn.txt,R);
FP2 =的fopen(sales_unknown_ok_classified.txt,W);
FP3 =的fopen(sales_unknown_fraud_classified.txt,W);
而(与fgets(行1,sizeof的(一号线),FP)!= NULL)//从文件中读取每一行来计算文件的大小。
{
    字符* unknwn_attr1 = strtok的(一号线,);
    字符* unknwn_attr2 =的strtok(NULL,);
    字符* unknwn_attr3 =的strtok(NULL,);
    字符* unknwn_attr4 =的strtok(NULL,);
    字符* unknwn_attr5 =的strtok(NULL,);    //的printf(%S-%S-%S-%S-%S \\ n,unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);    用于(为size_t我= 0; I&LT; okfr_len ++ I)
    {        如果(的strcmp(okfr [I] [5],OK)== 0)
        {
            // OK情况
            如果(的strcmp(unknwn_attr2,okfr [Ⅰ] [1])== 0)
                counter_ok_attr2 ++;            如果(的strcmp(unknwn_attr3,okfr [I] [2])== 0)
                counter_ok_attr3 ++;            如果(的strcmp(unknwn_attr4,okfr [Ⅰ] [3])== 0)
                counter_ok_attr4 ++;            如果(的strcmp(unknwn_attr5,okfr [I] [4])== 0)
                counter_ok_attr5 ++;
        }        否则//诈骗案
        {
            如果(的strcmp(unknwn_attr2,okfr [Ⅰ] [1])== 0)
                counter_fraud_attr2 ++;            如果(的strcmp(unknwn_attr3,okfr [I] [2])== 0)
                counter_fraud_attr3 ++;            如果(的strcmp(unknwn_attr4,okfr [Ⅰ] [3])== 0)
                counter_fraud_attr4 ++;            如果(的strcmp(unknwn_attr5,okfr [I] [4])== 0)
                counter_fraud_attr5 ++;
        }
    }    如果(counter_ok_attr2 == 0)
        prblty_attr2_given_ok =(counter_ok_attr2 + arbitrary_value * prblty_ok)/(COUNTER_OK + arbitrary_value);
    其他
        prblty_attr2_given_ok =(counter_ok_attr2)/(COUNTER_OK);    如果(counter_ok_attr3 == 0)
        prblty_attr3_given_ok =(counter_ok_attr3 + arbitrary_value * prblty_ok)/(COUNTER_OK + arbitrary_value);
    其他
        prblty_attr3_given_ok =(counter_ok_attr3)/(COUNTER_OK);    如果(counter_ok_attr4 == 0)
        prblty_attr4_given_ok =(counter_ok_attr4 + arbitrary_value * prblty_ok)/(COUNTER_OK + arbitrary_value);
    其他
        prblty_attr4_given_ok =(counter_ok_attr4)/(COUNTER_OK);    如果(counter_ok_attr5 == 0)
        prblty_attr5_given_ok =(counter_ok_attr5 + arbitrary_value * prblty_ok)/(COUNTER_OK + arbitrary_value);
    其他
        prblty_attr5_given_ok =(counter_ok_attr5)/(COUNTER_OK);    如果(counter_fraud_attr2 == 0)
        prblty_attr2_given_fraud =(counter_fraud_attr2 + arbitrary_value * prblty_fraud)/(counter_fraud + arbitrary_value);
    其他
        prblty_attr2_given_fraud =(counter_fraud_attr2)/(counter_fraud);    如果(counter_fraud_attr3 == 0)
        prblty_attr3_given_fraud =(counter_fraud_attr3 + arbitrary_value * prblty_fraud)/(counter_fraud + arbitrary_value);
    其他
        prblty_attr3_given_fraud =(counter_fraud_attr3)/(counter_fraud);    如果(counter_fraud_attr4 == 0)
        prblty_attr4_given_fraud =(counter_fraud_attr4 + arbitrary_value * prblty_fraud)/(counter_fraud + arbitrary_value);
    其他
        prblty_attr4_given_fraud =(counter_fraud_attr4)/(counter_fraud);    如果(counter_fraud_attr5 == 0)
        prblty_attr5_given_fraud =(counter_fraud_attr5 + arbitrary_value * prblty_fraud)/(counter_fraud + arbitrary_value);
    其他
        prblty_attr5_given_fraud =(counter_fraud_attr5)/(counter_fraud);    total_prblty_ok = prblty_ok * prblty_attr2_given_ok * prblty_attr3_given_ok * prblty_attr4_given_ok * prblty_attr5_given_ok;
    total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;
    //的printf(测试为计数OK - %F - %F - %F - %F \\ N,counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
    //的printf(测试计数诈骗 - %F - %F - %F - %F \\ N,counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
    //的printf(测试属性概率OK - %F - %F - %F - %F \\ N,prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
    //的printf(测试属性为FRAUD-%F概率 - %F - %F - %f\
\",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
    //输出(最后的概率为%F - %F \\ N,total_prblty_ok,total_prblty_fraud);    如果(total_prblty_ok&GT; total_prblty_fraud)
    {
        fprintf中(FP2,%S%S%S,%S,%S,OK \\ N,unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
    }
    其他
    {
        fprintf(fp3,\"%s,%s,%s,%s,%s,fraud\
\",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
    }
    counter_ok_attr2 = counter_ok_attr3 = counter_ok_attr4 = counter_ok_attr5 = 0;
    counter_fraud_attr2 = counter_fraud_attr3 = counter_fraud_attr4 = counter_fraud_attr5 = 0;
}//释放表数据和动态指针数组
免费(okfr);
免费(okfr_data);FCLOSE(FP);
FCLOSE(FP2);
FCLOSE(FP3);
返回0;


以上只是一些想法。其中还有更多可以改进的地方,但这些应该能极大地改善连续输出的效率;在这种对文件做单次正向扫描的处理方式下,这差不多已经是你能得到的最高效率了。毫无疑问,三大措施的组合——输出文件只打开和关闭一次、精简逻辑、只解析一次并缓存 sales_ok_fraud.txt 文件——会带来巨大的性能提升,尤其是其中的第一项和最后一项。

修改:已协助提问者更新该处理程序,预先加载 sales_ok_fraud.txt 文件的内容,从而消除了对 15000 多行文本的重复加载、解析和随即丢弃(此前主输入文件每读一行就要重复一遍)。上面的回答已做相应更新。

I have implemented the Naive Bayes algorithm on a large data set of 410k rows. Now all my records are getting classified correctly, but the program is taking almost an hour to write the records into the corresponding files. What is the best way to improve the performance of my code? The code is below; this piece of code writes the 410k records into the corresponding files. Thank you.

    // First pass over the labelled file: count how many records carry the
    // "ok" label versus anything else, then derive the two class priors.
    // NOTE(review): the fopen() result is used without a NULL check.
    fp=fopen("sales_ok_fraud.txt","r");
        while(fgets(line,80,fp)!=NULL) // one record per iteration; fgets stores at most 79 characters of it
        {
                // Walk to the sixth comma-separated field; the first five
                // tokens are discarded by overwriting `token`.
                token = strtok(line,",");
                token = strtok(NULL,",");
                token = strtok(NULL,",");
                token = strtok(NULL,",");
                token = strtok(NULL,",");
                token = strtok(NULL,",");
                // Cut the sixth field at the newline, then copy it into mystr
                // without its last character.
                // NOTE(review): presumably that last character is a '\r' from
                // CRLF line endings -- confirm against the data file.
                token1 = strtok(token,"\n");
                memcpy(mystr,&token1[0],strlen(token1)-1);
                mystr[strlen(token1)-1] = '\0';

        // Every label that is not exactly "ok" is counted as fraud.
        if( strcmp(mystr,"ok") == 0 )
            counter_ok++;
        else 
        counter_fraud++;       
    }
    // The counters are printed with %f, so they are presumably floating-point
    // variables (declarations are not shown here).
    printf("The no. of records with OK label are %f\n",counter_ok);
    printf("The no. of records with FRAUD label are %f\n",counter_fraud);

    // Class priors: share of each label among all counted records.
    prblty_ok = counter_ok/(counter_ok+counter_fraud);
    prblty_fraud = counter_fraud/(counter_ok+counter_fraud);
    printf("The probability of OK records is %f\n",prblty_ok);
    printf("The probability of FRAUD records is %f\n",prblty_fraud);
    fclose(fp);

    // Second pass: classify every record of sales_unknwn.txt as "ok" or
    // "fraud" and append it to the matching output file.
    // The two output files are opened once, in append mode, before the loop.
    // NOTE(review): none of the fopen() results below is checked for NULL.
    fp=fopen("sales_unknwn.txt","r");
    fp2=fopen("sales_unknown_ok_classified.txt","a");
    fp3=fopen("sales_unknown_fraud_classified.txt","a");
    while(fgets(line1,80,fp)!=NULL) // one unknown record per iteration; fgets stores at most 79 characters of it
        {
                // Split the unknown record into its first five comma-separated fields.
                // NOTE(review): if a record has exactly five fields, unknwn_attr5
                // still ends with the line terminator kept by fgets -- confirm
                // the layout of sales_unknwn.txt.
                unknwn_attr1 = strtok(line1,",");
                unknwn_attr2 = strtok(NULL,",");
                unknwn_attr3 = strtok(NULL,",");
                unknwn_attr4 = strtok(NULL,",");
                unknwn_attr5 = strtok(NULL,",");

        //printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);

        // The labelled file is reopened and re-read from the start for EVERY
        // unknown record; all of its lines are tokenized again each time.
        fp1=fopen("sales_ok_fraud.txt","r");
        while(fgets(line,80,fp1)!=NULL) // one labelled record per iteration
            {
            ok_fraud_attr1 = strtok(line,",");
                    ok_fraud_attr2 = strtok(NULL,",");
                    ok_fraud_attr3 = strtok(NULL,",");
                    ok_fraud_attr4 = strtok(NULL,",");
                    ok_fraud_attr5 = strtok(NULL,",");
            ok_fraud_attr6 = strtok(NULL,",");
                    // Copy the label without its last two characters.
                    // NOTE(review): presumably those are "\r\n" -- confirm
                    // against the data file.
                    memcpy(ok_fraud_attr6_str,&ok_fraud_attr6[0],strlen(ok_fraud_attr6)-2);
                    ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0';
            //ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0';      
            //printf("Testing ok_fraud_attr6 - %s-%d\n",ok_fraud_attr6_str,strlen(ok_fraud_attr6_str)); 
            // For an "ok" record, count each of attributes 2..5 that equals
            // the corresponding attribute of the unknown record.
            if( strcmp(ok_fraud_attr6_str,"ok") == 0 )
            {
                if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
                counter_ok_attr2++;

                if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
                counter_ok_attr3++;

                if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
                counter_ok_attr4++;

                if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
                counter_ok_attr5++;
            }

                    // Same counting for a "fraud" record. This is a separate
                    // `if`, not an `else`, so the label is compared again even
                    // after it already matched "ok".
                    if( strcmp(ok_fraud_attr6_str,"fraud") == 0 )
                        {
                if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
                counter_fraud_attr2++;

                if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
                counter_fraud_attr3++;

                if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
                counter_fraud_attr4++;

                if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
                counter_fraud_attr5++;
            }
        }
        fclose(fp1);
        // Conditional probability of each attribute value given the class.
        // When no labelled record matched (count == 0), a fallback formula is
        // used that mixes in arbitrary_value*prior instead of the plain ratio,
        // so the result is not forced to zero by the count alone.
        if(counter_ok_attr2 == 0)
        prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
        else
        prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);

        if(counter_ok_attr3 == 0)
        prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
        else
                prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);

        if(counter_ok_attr4 == 0)
        prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
        else
                prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);

        if(counter_ok_attr5 == 0)
        prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
        else
                prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);

        if(counter_fraud_attr2 == 0)
        prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
        else
                prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);

        if(counter_fraud_attr3 == 0)
        prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
        else
                prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);

        if(counter_fraud_attr4 == 0)
        prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
        else
                prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);

        if(counter_fraud_attr5 == 0)
        prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
        else
                prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);

        // Score of each class: its prior times the four conditional probabilities.
        total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
        total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;


//      printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
//      printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
//      printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
//      printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
//      printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);

        // Write the record to the file of the higher-scoring class; a tie
        // goes to the fraud file because the comparison is a strict '>'.
        if(total_prblty_ok > total_prblty_fraud)
        {
            fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
        }   
        else
        {
            fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
        }
        // Reset the per-record match counters before the next unknown record.
        counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
        counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
    }

    fclose(fp);
        fclose(fp2);    
        fclose(fp3);    

解决方案

There are several things I can see right away that you can do, in the order I would try them:

  1. Stop with the repeated open-write-close, open-write-close ideology on your output files. Their names are fixed and limited. Open them all appropriately at the start of this thing, then flush-and-close when you're done.
  2. There are several logic-constructs that can be significantly simplified.
  3. Your strlen() rampage needs to be significantly reduced. Most decent optimizing compilers will detect the unchanged source and optimize out the subsequent calls on a known-unchanged char-ptr, so I would do this last (but honestly I'd still do it, as it's bad practice to invoke strlen() repeatedly on the same data).
  4. ADDED AFTER CONVERSING WITH OP: You re-parse the same data file (sales_ok_fraud.txt) over and over, once for each line of data in sales_unknwn.txt. 12 GB / avg-line-length is a LOT of unneeded repetitious parsing if sales_ok_fraud.txt can fit in memory. Load that data once, calculate its base stats once, and use the data and stats therein for the rest of your data crunch.


Logic Reductions

You can cut out a ton of work in one place in particular, changing this:

    // "Before" form: eight independent tests. Each one pairs an attribute
    // comparison with its own comparison of the record label, so the label
    // may be compared against "ok" / "fraud" once per matching attribute.
    if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
        counter_ok_attr2++;

    if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
        counter_ok_attr3++;

    if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
        counter_ok_attr4++;

    if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
        counter_ok_attr5++;

    if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
        counter_fraud_attr2++;

    if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
        counter_fraud_attr3++;

    if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
        counter_fraud_attr4++;

    if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
        counter_fraud_attr5++;

To this:

    // "After" form: the record label is tested first, at most twice per
    // record, and only the four attribute comparisons of the matching class
    // are performed. A label that is neither "ok" nor "fraud" updates nothing.
    if (strcmp(ok_fraud_attr6_str, "ok") == 0)
    {
        if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0)
            counter_ok_attr2++;

        if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
            counter_ok_attr3++;

        if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0)
            counter_ok_attr4++;

        if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0)
            counter_ok_attr5++;
    }
    else if (strcmp(ok_fraud_attr6_str,"fraud") == 0)
    {
        if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0)
            counter_fraud_attr2++;

        if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0)
            counter_fraud_attr3++;

        if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0)
            counter_fraud_attr4++;

        if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0)
            counter_fraud_attr5++;
    }


Front-Loading sales_ok_fraud.txt

The following relies on the sanctity of the data format of your sales_ok_fraud.txt stats file, while trying to be as pedantic as possible in validating said format. It allocates a chunk of memory large enough to hold the entire file plus-one-char to treat the entire body as a single null-term-string. That buffer is then pieced-up via the same general algorithm you had prior. The result will be a table of pointers to fixed-length char-pointer arrays that can then be used iteratively in the same place you currently (and repeatedly) open, parse, use, and throw away all that content.

// declare an array of six string pointers
typedef char *OFAttribs[6];

// Loads a table from a text file of the following format:
//
// str1,str2,str3,str4,str5,str6\n
// str1,str2,str3,str4,str5,str6\n
// ...
// str1,str2,str3,str4,str5,str6
//
// The whole file is read into a single heap buffer which is then tokenized
// in place with strtok(). Every table row is six pointers INTO that buffer,
// so the table is only usable while the buffer is alive. Trailing whitespace
// (for example the '\r' of a CRLF line ending) is removed from the sixth
// field of every row.
//
// Parameters:
//   fname        path of the file to load; NULL or "" loads nothing.
//   ppTable      receives the row table, or NULL when no row was stored.
//   ppTableData  receives the buffer the rows point into, or NULL when the
//                file could not be opened, sized or buffered.
//
// Returns the number of rows stored in *ppTable.
//
// A row that is missing its final field stops the parse; everything parsed
// up to that point is still returned. Both output pointers are assigned on
// every return path, so the caller can (and should) always `free()` them.
// Running out of memory while growing the table terminates the process via
// exit(EXIT_FAILURE).

size_t init_ok_fraud_data(const char *fname, OFAttribs **ppTable, char **ppTableData)
{
    OFAttribs *pTable = NULL;
    size_t nTableLen = 0;   // rows in use
    size_t nTableCap = 0;   // rows allocated

    // give the outputs a defined value before any early return
    *ppTable = NULL;
    *ppTableData = NULL;

    if (!fname || !*fname)
        return 0;

    // binary mode, so the size reported by ftell() matches what fread() delivers
    FILE *fp = fopen(fname, "rb");
    if (!fp)
        return 0;

    // determine the file size; give up if the stream cannot be measured
    long len = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
        len = ftell(fp);
    if (len < 0 || fseek(fp, 0, SEEK_SET) != 0)
    {
        fclose(fp);
        return 0;
    }

    // allocate enough ram for the entire file plus terminator
    char *pTableData = malloc((size_t)len + 1);
    if (NULL == pTableData)
    {
        fclose(fp);
        return 0;
    }

    // terminate after what was actually read, which also covers a short read
    size_t nRead = fread(pTableData, 1, (size_t)len, fp);
    pTableData[nRead] = 0;

    // no longer need the file
    fclose(fp);

    // prime first token
    char *token = strtok(pTableData, ",");
    while (token)
    {
        // first four fields of the row; `token` ends up on the fifth
        OFAttribs attribs = { NULL };
        for (int i=0;i<4 && token; ++i)
        {
            attribs[i] = token;
            token = strtok(NULL, ",");
        }

        // filled 0..3 and a fifth field exists: set the last two and move on.
        // when this test fails `token` is NULL, which also ends the outer loop.
        if (attribs[3] && token)
        {
            // next-to-last entry set
            attribs[4] = token;

            // the last field of a line is only terminated by newline
            token = strtok(NULL, "\n");
            if (!token)
            {   // not good.
                printf("Invalid line format detected. Expected ok/fraud\n");
                break;
            }

            // proper format. 6 parms, 5 commas, one new-line.
            attribs[5] = token;

            // strip trailing whitespace without ever indexing before the
            // token; <ctype.h> functions need an unsigned char value
            size_t slen = strlen(token);
            while (slen > 0 && isspace((unsigned char)token[slen-1]))
                token[--slen] = 0;

            // grow the master list geometrically instead of once per row
            if (nTableLen == nTableCap)
            {
                size_t nNewCap = nTableCap ? nTableCap * 2 : 64;
                OFAttribs *tmp = realloc(pTable, sizeof(*tmp) * nNewCap);
                if (NULL == tmp)
                {   // allocation failure.
                    printf("Error allocating memory for expanding OKFraud data set");
                    exit(EXIT_FAILURE);
                }
                pTable = tmp;
                nTableCap = nNewCap;
            }
            memcpy(pTable[nTableLen++], attribs, sizeof(attribs));

            // next token of new line
            token = strtok(NULL, ",");
        }
    }

    // set output variables
    *ppTable = pTable;
    *ppTableData = pTableData;
    return nTableLen;
}


Putting It Together

Incorporating everything above has the following effect on your code base:

// load the ok_fraud table ONCE.
OFAttribs *okfr = NULL;
char *okfr_data = NULL;
size_t okfr_len = init_ok_fraud_data("sales_ok_fraud.txt", &okfr, &okfr_data);

// walk table to determine probabilities of ok and fraud states.
//  note: this really should be done as part of the loader.
for (size_t i=0;i<okfr_len; ++i)
{
    if (0 == strcmp("ok", okfr[i][5]))
        ++counter_ok;
    else
        ++counter_fraud;
}

printf("The no. of records with OK label are %f\n",counter_ok);
printf("The no. of records with FRAUD label are %f\n",counter_fraud);

// compute probabilites for ok and fraud states
prblty_ok = (float)counter_ok/(float)(okfr_len);
prblty_fraud = (float)counter_fraud/(float)(okfr_len);
printf("The probability of OK records is %f\n",prblty_ok);
printf("The probability of FRAUD records is %f\n",prblty_fraud);

fp=fopen("sales_unknwn.txt","r");
fp2=fopen("sales_unknown_ok_classified.txt","w");
fp3=fopen("sales_unknown_fraud_classified.txt","w");
while(fgets(line1,sizeof(line1),fp)!=NULL) //Reading each line from file to calculate the file size.
{
    char *unknwn_attr1 = strtok(line1,",");
    char *unknwn_attr2 = strtok(NULL,",");
    char *unknwn_attr3 = strtok(NULL,",");
    char *unknwn_attr4 = strtok(NULL,",");
    char *unknwn_attr5 = strtok(NULL,",");

    //printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);

    for (size_t i=0;i<okfr_len; ++i)
    {

        if( strcmp(okfr[i][5], "ok") == 0 )
        {
            // ok case
            if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )
                counter_ok_attr2++;

            if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )
                counter_ok_attr3++;

            if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )
                counter_ok_attr4++;

            if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )
                counter_ok_attr5++;
        }

        else // fraud case
        {
            if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )
                counter_fraud_attr2++;

            if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )
                counter_fraud_attr3++;

            if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )
                counter_fraud_attr4++;

            if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )
                counter_fraud_attr5++;
        }
    }

    if(counter_ok_attr2 == 0)
        prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
    else
        prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);

    if(counter_ok_attr3 == 0)
        prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
    else
        prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);

    if(counter_ok_attr4 == 0)
        prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
    else
        prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);

    if (counter_ok_attr5 == 0)
        prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
    else
        prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);

    if(counter_fraud_attr2 == 0)
        prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
    else
        prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);

    if(counter_fraud_attr3 == 0)
        prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
    else
        prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);

    if(counter_fraud_attr4 == 0)
        prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
    else
        prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);

    if(counter_fraud_attr5 == 0)
        prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
    else
        prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);

    total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
    total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;


    //      printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
    //      printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
    //      printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
    //      printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
    //      printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);

    if(total_prblty_ok > total_prblty_fraud)
    {
        fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
    }
    else
    {
        fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
    }
    counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
    counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
}

// free the table data and dynamic pointer array
free(okfr);
free(okfr_data);

fclose(fp);
fclose(fp2);
fclose(fp3);
return 0;


These are just a few ideas. There is certainly more that could be done, but these should help enormously in processing your file as a single forward scan with continuous output, which is about as efficient as you're going to get under these circumstances. Without question, the combination of the big three (a single file open and close, logic reductions, and parsing the sales_ok_fraud.txt file once and caching the result) will give a huge improvement in performance, especially the first and last of these.

EDIT: Assisted the OP in updating this processor to front-load the sales_ok_fraud.txt file content, thereby eliminating the repeated loading, parsing, and discarding of some 15000+ lines of text (once per main-source input line). Answer above updated accordingly.

这篇关于如何提高性能,同时具有文件下操作的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆