计算文本C中的单词频率 [英] Count word frequency in text C
问题描述
我找到了一段用于统计文本文件中单词频率的C代码,但它只适用于少于1000个单词的文件,而我需要将它用于超过40000个单词的文件。
如何修复它以处理大文件?
代码:
I've found some C code to count word frequency in a text file, but it only works with fewer than 1000 words, and I need to use it with files containing more than 40000 words.
How can I fix it to work with big files?
Code:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
 * Word-frequency counter (the listing the question is about).
 *
 * Reads the file named by argv[1] into memory, splits it into tokens at every
 * space, comma or period, and prints each distinct token followed by the
 * number of times it occurs.
 *
 * Limits visible in this listing:
 *  - p and ptr1 hold at most 1000 tokens of at most 511 characters each, and
 *    nothing checks those bounds; together they occupy about 1 MB of
 *    automatic storage.
 *  - The results of fopen, malloc and fread are used without being checked.
 *  - str1 and ptr are declared but never used.
 *  - str is never freed.
 *
 * Always returns 0.
 */
int main(int argc, char* argv[])
{
if (argc == 1) {
printf("The input file name has not been provided\n");
}
/* Any argc other than 1 or 2 falls through and does nothing. */
else if (argc == 2) {
/* Find the file size by seeking to the end, then read the file in one go. */
FILE *f = fopen(argv[1], "rb");
fseek(f, 0, SEEK_END);
long fsize = ftell(f);
fseek(f, 0, SEEK_SET);
/* One extra byte for the terminating NUL written below. */
char *str = malloc(fsize + 1);
fread(str, fsize, 1, f);
fclose(f);
str[fsize] = 0;
int count = 0, c = 0, i, j = 0, k, space = 0;
/* p: every token in file order; ptr1: the distinct tokens. */
char p[1000][512], str1[512], ptr1[1000][512];
char *ptr;
/* Pass 1: count separators; the later loops treat space + 1 as the token count. */
/* strlen(str) is re-evaluated on every iteration of this loop and the next. */
for (i = 0;i<strlen(str);i++)
{
if ((str[i] == ' ')||(str[i] == ',')||(str[i] == '.'))
{
space++;
}
}
/* Pass 2: copy the characters of token i into p[i]; 44 and 46 are the ASCII codes of ',' and '.'. */
for (i = 0, j = 0, k = 0;j < strlen(str);j++)
{
if ((str[j] == ' ')||(str[j] == 44)||(str[j] == 46))
{
/* A separator ends the current token; two adjacent separators yield an empty token. */
p[i][k] = '\0';
i++;
k = 0;
}
else
p[i][k++] = str[j];
}
/* NOTE(review): the last token is only terminated when the text ends with a
 * separator, and in that case p[space] itself is never written. */
k = 0;
/* Pass 3: collect distinct tokens into ptr1. Token i is compared with
 * ptr1[0..i-1]; reaching j == i without a match appends it. */
/* NOTE(review): only ptr1[0..k-1] have been written at this point, so the
 * comparison reads unwritten rows whenever k < i. */
for (i = 0;i <= space;i++)
{
for (j = 0;j <= space;j++)
{
if (i == j)
{
strcpy(ptr1[k], p[i]);
k++;
count++;
break;
}
else
{
if (strcmp(ptr1[j], p[i]) != 0)
continue;
else
break;
}
}
}
/* Pass 4: for each distinct token, count its occurrences among all tokens and print the pair. */
for (i = 0;i < count;i++)
{
for (j = 0;j <= space;j++)
{
if (strcmp(ptr1[i], p[j]) == 0)
c++;
}
printf("%s %d \n", ptr1[i], c);
c = 0;
}
}
return 0;
}
我的尝试:
我认为这个问题与:p [1000] [512],str1 [512],ptr1 [1000] [512]
What I have tried:
I think the problem is something related to: p[1000][512], str1[512], ptr1[1000][512]
推荐答案
学会正确缩进代码,显示其结构,有助于阅读和理解。它还有助于发现结构错误。
Learn to indent your code properly: it shows the code's structure and makes it easier to read and understand. It also helps you spot structural mistakes.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/* One distinct word together with the number of times it occurs. */
struct word_count {
    const char *word; /* points into the file buffer; not owned */
    size_t count;
};

/* Return non-zero when c ends a word: space, comma, period or line/tab whitespace. */
static int is_separator(char c)
{
    return c == ' ' || c == ',' || c == '.' ||
           c == '\n' || c == '\r' || c == '\t';
}

/*
 * Read the whole file at path into a newly allocated, NUL-terminated buffer.
 *
 * Returns the buffer (the caller frees it), or NULL after printing a
 * diagnostic to stderr when the file cannot be opened, sized or read, or
 * when memory runs out.
 */
static char *read_file(const char *path)
{
    FILE *f = fopen(path, "rb");
    if (f == NULL) {
        perror(path);
        return NULL;
    }

    /* Determine the size by seeking to the end, then rewind. */
    long fsize = -1;
    if (fseek(f, 0, SEEK_END) == 0) {
        fsize = ftell(f);
    }
    if (fsize < 0 || fseek(f, 0, SEEK_SET) != 0) {
        perror(path);
        fclose(f);
        return NULL;
    }

    char *buf = malloc((size_t)fsize + 1); /* +1 for the terminating NUL */
    if (buf == NULL) {
        fprintf(stderr, "%s: out of memory\n", path);
        fclose(f);
        return NULL;
    }

    size_t got = fread(buf, 1, (size_t)fsize, f);
    fclose(f);
    if (got != (size_t)fsize) {
        fprintf(stderr, "%s: could not read the whole file\n", path);
        free(buf);
        return NULL;
    }
    buf[got] = '\0';
    return buf;
}

/*
 * Split text into words in place and tally each distinct word.
 *
 * Separators inside text are overwritten with NUL so that every word becomes
 * a C string inside the buffer; no word is copied, so neither the number of
 * words nor their length is limited by a fixed-size array.
 *
 * On success returns 0 and stores a malloc'ed array (the caller frees it) in
 * *out and its length in *out_n; entries are in order of first appearance.
 * Returns -1 if memory runs out.
 */
static int count_words(char *text, struct word_count **out, size_t *out_n)
{
    struct word_count *words = NULL;
    size_t n = 0;
    size_t cap = 0;
    const size_t len = strlen(text); /* computed once, not per iteration */
    size_t pos = 0;

    while (pos < len) {
        /* Skip runs of separators so that empty tokens are never counted. */
        while (pos < len && is_separator(text[pos])) {
            pos++;
        }
        if (pos == len) {
            break;
        }

        char *word = text + pos;
        while (pos < len && !is_separator(text[pos])) {
            pos++;
        }
        if (pos < len) {
            text[pos++] = '\0'; /* terminate the word in place */
        }
        /* When pos == len the word already ends at the buffer's final NUL. */

        size_t i = 0;
        while (i < n && strcmp(words[i].word, word) != 0) {
            i++;
        }
        if (i < n) {
            words[i].count++;
            continue;
        }

        if (n == cap) {
            size_t new_cap = cap ? cap * 2 : 256;
            struct word_count *tmp = realloc(words, new_cap * sizeof *tmp);
            if (tmp == NULL) {
                free(words);
                return -1;
            }
            words = tmp;
            cap = new_cap;
        }
        words[n].word = word;
        words[n].count = 1;
        n++;
    }

    *out = words;
    *out_n = n;
    return 0;
}

/*
 * Print every distinct word of the file named by argv[1] followed by its
 * number of occurrences, one "word count" pair per line, in order of first
 * appearance.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the argument is missing, the
 * file cannot be read, or memory runs out.
 */
int main(int argc, char *argv[])
{
    if (argc == 1) {
        printf("The input file name has not been provided\n");
        return EXIT_FAILURE;
    }
    if (argc != 2) {
        fprintf(stderr, "usage: %s FILE\n", argv[0]);
        return EXIT_FAILURE;
    }

    char *text = read_file(argv[1]);
    if (text == NULL) {
        return EXIT_FAILURE;
    }

    struct word_count *words = NULL;
    size_t n = 0;
    if (count_words(text, &words, &n) != 0) {
        fprintf(stderr, "%s: out of memory\n", argv[1]);
        free(text);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < n; i++) {
        printf("%s %zu \n", words[i].word, words[i].count);
    }

    free(words);
    free(text); /* the words point into text, so it is released last */
    return EXIT_SUCCESS;
}
专业程序员的编辑器具有此功能,其他功能包括括号匹配和语法高亮。
<a href="https://notepad-plus-plus.org/">Notepad++ Home</a> [^]
ultraedit [ ^ ]
代码中的注释也是一个好主意。
Professional programmers' editors have this feature, along with others such as parenthesis matching and syntax highlighting.
Notepad++ Home[^]
ultraedit[^]
Comments in code are also a good idea.
我认为这个问题与以下内容有关:p[1000][512],str1[512],ptr1[1000][512]
I think the problem is something related to: p[1000][512], str1[512], ptr1[1000][512]
有一种简单的方法可以知道,尝试,你会看到。
据我对这段代码的理解,它的效率非常低:无论在运行时间还是内存占用上,都是暴力做法。
There is an easy way to know, try and you will see.
As far as I understand this code, it is highly inefficient. It is brute force, in both runtime and memory.
这篇关于计算文本C中的单词频率的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!