在C语言中更快地读取文件 [英] Reading a file faster in C

查看:131
本文介绍了读文件用C更快的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

嗯,我想知道有没有比使用 fscanf() 更快的读取文件的方式。

例如假设我有这样的文字

4
55 k
52 o
24 l
523 i

首先,我想读取第一个数字,它给出了后面的行数。

把这个数字称为 N。

读完 N 之后,我想读取 N 行,每行包含一个整数和一个字符。
使用 fscanf 的话会是这样:

fscanf(fin,"%d %c",&a,&c);


解决方案

你几乎没有做任何处理,所以瓶颈很可能是文件系统的吞吐量。不过你应该先测量一下是否确实如此。如果不想使用性能分析器,可以直接测量应用程序的运行时间。用输入文件的大小除以运行时间,就可以检查是否已经达到文件系统的吞吐量上限。

然后,如果距离上述上限还很远,你可能需要优化读取文件的方式。更好的做法可能是使用 fread() 以较大的块读取文件,然后用 sscanf() 处理存储在内存中的缓冲区。

你也可以自己解析缓冲区,这会比 *scanf() 更快。

特别是对于Drakosha:

$ time ./main1
Good entries: 10000000

real    0m3.732s
user    0m3.531s
sys 0m0.109s
$ time ./main2
Good entries: 10000000

real    0m0.605s
user    0m0.496s
sys 0m0.094s

所以优化后的版本达到了约 127MB/s,这可能是我的文件系统的瓶颈,也可能是操作系统把文件缓存在了内存中。原始版本约为 20MB/s。

使用一个 80MB 的文件进行测试:

10000000
1234 a
1234 a
...

main1.c

#include <stdio.h>

int ok = 0;
void processEntry(int a, char c) {
    if (a == 1234 && c == 'a') {
        ++ok;
    }
}

int main(int argc, char **argv) {
    FILE *f = fopen("data.txt", "r");
    int total = 0;
    int a;
    char c;
    int i = 0;

    fscanf(f, "%d", &total);
    for (i = 0; i < total; ++i) {
        if (2 != fscanf(f, "%d %c", &a, &c)) {
            fclose(f);
            return 1;
        }
        processEntry(a, c);
    }
    fclose(f);
    printf("Good entries: %d\n", ok);
    return (ok == total) ? 0 : 1;
}

main2.c

#include <stdio.h>
#include <stdlib.h>

int ok = 0;
void processEntry(int a, char c) {
    if (a == 1234 && c == 'a') {
        ++ok;
    }
}

int main(int argc, char **argv) {
    FILE *f = fopen("data.txt", "r");
    int total = 0;
    int a;
    char c;
    int i = 0;
    char *numberPtr = NULL;
    char buf[2048];
    size_t toProcess = sizeof(buf);
    int state = 0;
    int fileLength, lengthLeft;

    fseek(f, 0, SEEK_END);
    fileLength = ftell(f);
    fseek(f, 0, SEEK_SET);

    fscanf(f, "%d", &total);  // read the first line

    lengthLeft = fileLength - ftell(f);

    // read other lines using FSM
    do {
        if (lengthLeft < sizeof(buf)) {
            fread(buf, lengthLeft, 1, f);
            toProcess = lengthLeft;
        } else {
            fread(buf, sizeof(buf), 1, f);
            toProcess = sizeof(buf);
        }
        lengthLeft -= toProcess;
        for (i = 0; i < toProcess; ++i) {
            switch (state) {
                case 0:
                    if (isdigit(buf[i])) {
                        state = 1;
                        a = buf[i] - '0';
                    }
                    break;
                case 1:
                    if (isdigit(buf[i])) {
                        a = a * 10 + buf[i] - '0';
                    } else {
                        state = 2;
                    }
                    break;
                case 2:
                    if (isalpha(buf[i])) {
                        state = 0;
                        c = buf[i];
                        processEntry(a, c);
                    }
                    break;
            }
        }
    } while (toProcess == sizeof(buf));

    fclose(f);
    printf("Good entries: %d\n", ok);
    return (ok == total) ? 0 : 1;
}

Hmm, I wonder whether there is a way to read a FILE faster than using fscanf().

For example, suppose that I have this text:

4

55 k

52 o

24 l

523 i

First I want to read the first number, which gives us the number of lines that follow.

Let this number be called N.

After N, I want to read N lines which have an integer and a character. With fscanf it would be like this

fscanf(fin,"%d %c",&a,&c);

解决方案

You do almost no processing so probably the bottleneck is the file system throughput. However you should measure first if it really is. If you don't want to use a profiler, you can just measure the running time of your application. The size of input file divided by the running time can be used to check if you've reached the file system throughput limit.

Then, if you are far away from the aforementioned limit, you probably need to optimize the way you read the file. It may be better to read it in larger chunks using fread() and then process the buffer stored in memory with sscanf().

You also can parse the buffer yourself which would be faster than *scanf().

[edit]

Especially for Drakosha:

$ time ./main1
Good entries: 10000000

real    0m3.732s
user    0m3.531s
sys 0m0.109s
$ time ./main2
Good entries: 10000000

real    0m0.605s
user    0m0.496s
sys 0m0.094s

So the optimized version reaches ~127MB/s, which may be my file system's bottleneck, or maybe the OS caches the file in RAM. The original version reaches ~20MB/s.

Tested with an 80MB file:

10000000

1234 a

1234 a
...

main1.c

#include <stdio.h>

/* Running count of entries that matched the expected (1234, 'a') pair. */
int ok = 0;

/*
 * Examine one parsed entry and bump the global `ok` counter when it is
 * exactly the pair (1234, 'a'). Any other entry is ignored.
 */
void processEntry(int a, char c) {
    if (a != 1234 || c != 'a') {
        return;
    }
    ok += 1;
}

/*
 * Baseline reader: parses data.txt entirely with fscanf.
 *
 * The first integer in the file is the number of entries; it is followed by
 * that many "<int> <char>" pairs, each of which is handed to processEntry().
 *
 * Returns 0 when every entry was counted as good (ok == total) and 1
 * otherwise, including when the file cannot be opened or is malformed.
 */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    FILE *f = fopen("data.txt", "r");
    if (f == NULL) {
        /* Without this check the fscanf below would dereference NULL. */
        perror("data.txt");
        return 1;
    }

    int total = 0;
    if (fscanf(f, "%d", &total) != 1) {
        /* No entry count: treat as malformed rather than "0 entries, OK". */
        fprintf(stderr, "data.txt: missing entry count\n");
        fclose(f);
        return 1;
    }

    for (int i = 0; i < total; ++i) {
        int a;
        char c;
        if (2 != fscanf(f, "%d %c", &a, &c)) {
            /* Fewer entries than announced, or a malformed line. */
            fclose(f);
            return 1;
        }
        processEntry(a, c);
    }

    fclose(f);
    printf("Good entries: %d\n", ok);
    return (ok == total) ? 0 : 1;
}

main2.c

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

/* Number of entries processed so far that equal the pair (1234, 'a'). */
int ok = 0;

/*
 * Record one parsed entry: the global `ok` counter grows by one when the
 * entry is exactly (1234, 'a') and is left untouched for anything else.
 */
void processEntry(int a, char c) {
    const int matches = (a == 1234) && (c == 'a');
    ok += matches;
}

/*
 * Optimized reader: reads data.txt in fixed-size chunks with fread and parses
 * the "<int> <char>" entries with a small state machine instead of calling
 * fscanf once per entry.
 *
 * The first integer in the file (the announced entry count) is still read
 * with fscanf. Every parsed entry is handed to processEntry().
 *
 * Returns 0 when the number of good entries equals the announced count
 * (ok == total) and 1 otherwise, including when the file cannot be opened,
 * has no entry count, or a read error occurs.
 */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    FILE *f = fopen("data.txt", "r");
    if (f == NULL) {
        /* Without this check every stdio call below would receive NULL. */
        perror("data.txt");
        return 1;
    }

    int total = 0;
    if (fscanf(f, "%d", &total) != 1) {  // read the first line
        fprintf(stderr, "data.txt: missing entry count\n");
        fclose(f);
        return 1;
    }

    char buf[2048];
    size_t got;
    int state = 0;  /* 0: looking for a number, 1: inside a number, 2: looking for the letter */
    int a = 0;

    /*
     * Read other lines using the FSM. Looping on the byte count returned by
     * fread avoids computing the remaining length with fseek/ftell, which is
     * not a reliable byte count on text-mode streams. `state` and `a` live
     * outside the loop so an entry split across two chunks is still parsed.
     */
    while ((got = fread(buf, 1, sizeof buf, f)) > 0) {
        for (size_t i = 0; i < got; ++i) {
            /* <ctype.h> functions require a value representable as unsigned char. */
            const unsigned char ch = (unsigned char)buf[i];

            switch (state) {
                case 0:
                    if (isdigit(ch)) {
                        state = 1;
                        a = ch - '0';
                    }
                    break;
                case 1:
                    if (isdigit(ch)) {
                        a = a * 10 + (ch - '0');
                        break;
                    }
                    state = 2;
                    /*
                     * The character that ended the number may itself be the
                     * letter (e.g. "1234a"), so examine it in state 2 too.
                     */
                    /* fallthrough */
                case 2:
                    if (isalpha(ch)) {
                        state = 0;
                        processEntry(a, (char)ch);
                    }
                    break;
                default:
                    break;
            }
        }
    }

    /* fread returns 0 on both EOF and error; tell the two apart. */
    const int read_failed = ferror(f);
    fclose(f);
    if (read_failed) {
        fprintf(stderr, "data.txt: read error\n");
        return 1;
    }

    printf("Good entries: %d\n", ok);
    return (ok == total) ? 0 : 1;
}

这篇关于读文件用C更快的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆