使用C语言检测文件中的重复行 [英] Detecting duplicate lines on file using c

查看：129 发布时间：2016/8/21 22:06:43 c search file-io csv

本文介绍了检测文件上使用C重复行的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我有一个大约 15000-25000 行（每行大小固定）的 CSV 文件，我想知道如何用 C 语言检测其中重复的行。

输出的一个例子如下：

  0123456789;CUST098WZAX;35

我没有内存或时间方面的限制，所以我想要最简单的解决方案。

感谢您的帮助。

解决方案

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct somehash {
        struct somehash *next;
        unsigned hash;
        char *mem;
        };

#define THE_SIZE 100000
struct somehash *table[THE_SIZE] = { NULL,};

struct somehash **some_find(char *str, unsigned len);
static unsigned some_hash(char *str, unsigned len);

int main (void)
{
char buffer[100];
struct somehash **pp;
size_t len;

while (fgets(buffer, sizeof buffer, stdin)) {
        len = strlen(buffer);
        pp = some_find(buffer, len);
        if (*pp) { /* found */
                fprintf(stderr, "Duplicate:%s\n", buffer);
                }
        else    {       /* not found: create one */
                fprintf(stdout, "%s", buffer);
                *pp = malloc(sizeof **pp);
                (*pp)->next = NULL;
                (*pp)->hash = some_hash(buffer,len);
                (*pp)->mem = malloc(1+len);
                memcpy((*pp)->mem , buffer,  1+len);
                }
        }
return 0;
}
struct somehash **some_find(char *str, unsigned len)
{
unsigned hash;
unsigned slot;
struct somehash **hnd;

hash = some_hash(str,len);
slot = hash % THE_SIZE;
for (hnd = &table[slot]; *hnd ; hnd = &(*hnd)->next ) {
        if ( (*hnd)->hash != hash) continue;
        if ( strcmp((*hnd)->mem , str) ) continue;
        break;
        }
return hnd;
}

static unsigned some_hash(char *str, unsigned len)
{
unsigned val;
unsigned idx;

if (!len) len = strlen(str);

val = 0;
for(idx=0; idx < len; idx++ )   {
        val ^= (val >> 2) ^ (val << 5) ^ (val << 13) ^ str[idx] ^ 0x80001801;
        }
return val;
}

I have a CSV file with about 15000-25000 lines (of fixed size), and I want to know how I can detect duplicated lines using the C language.

An example of the output is like this :

0123456789;CUST098WZAX;35

I have no memory or time constraint, so i want the simplest solution.

Thanks for your help.

解决方案

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One entry of a hash chain; each entry records one line that has been read. */
struct somehash {
        /* Next entry in the same chain; NULL marks the end of the chain. */
        struct somehash *next;
        /* Value of some_hash() for the stored string, kept to skip most strcmp() calls. */
        unsigned hash;
        /* Heap-allocated, NUL-terminated copy of the line. */
        char *mem;
        };

/* Number of slots in the hash table; a hash value is reduced modulo this. */
#define THE_SIZE 100000
/* The hash table itself: each slot holds the head of a chain (NULL when empty). */
struct somehash *table[THE_SIZE] = { NULL,};

/*
 * Look up str in the table. Returns the address of the pointer that refers to
 * the matching entry, or of the NULL pointer terminating the chain if there is
 * no match (which is where a new entry can be stored).
 */
struct somehash **some_find(char *str, unsigned len);
/* Hash the first len characters of str; a len of 0 means "use strlen(str)". */
static unsigned some_hash(char *str, unsigned len);

/*
 * Read lines from stdin and separate first occurrences from repeats.
 *
 * Each line (as returned by fgets, i.e. including its trailing newline when
 * present) is looked up in the hash table:
 *   - already present: reported on stderr as "Duplicate:<line>";
 *   - not present:     echoed to stdout and stored in the table.
 *
 * Lines longer than the buffer are delivered by fgets in several pieces and
 * each piece is treated as a separate line.
 *
 * Returns 0 on success, EXIT_FAILURE if memory for a new entry cannot be
 * allocated.
 */
int main (void)
{
    char buffer[100];

    while (fgets(buffer, sizeof buffer, stdin)) {
        size_t len = strlen(buffer);
        struct somehash **pp = some_find(buffer, len);
        struct somehash *node;
        char *copy;

        if (*pp) { /* found */
            fprintf(stderr, "Duplicate:%s\n", buffer);
            continue;
        }

        /* not found: echo the line and remember it */
        fprintf(stdout, "%s", buffer);

        /*
         * Build the entry completely before linking it in, so the table never
         * contains a half-initialised node and a failed allocation is never
         * dereferenced.
         */
        node = malloc(sizeof *node);
        copy = malloc(1 + len);
        if (node == NULL || copy == NULL) {
            perror("malloc");
            free(copy);
            free(node);
            return EXIT_FAILURE;
        }
        memcpy(copy, buffer, 1 + len); /* 1 + len: include the terminating NUL */

        node->next = NULL;
        node->hash = some_hash(buffer, len);
        node->mem = copy;
        *pp = node; /* pp addresses the chain's terminating NULL link */
    }
    return 0;
}
/*
 * Locate str in the hash table.
 *
 * str: NUL-terminated string to search for.
 * len: length handed to some_hash() (0 lets some_hash() call strlen itself).
 *
 * Returns a pointer to the link that refers to the matching entry. When no
 * entry matches, the returned link is the NULL pointer at the end of the
 * chain, so the caller can test *result and, if it is NULL, store a new
 * entry through it.
 */
struct somehash **some_find(char *str, unsigned len)
{
    unsigned wanted = some_hash(str, len);
    struct somehash **link = &table[wanted % THE_SIZE];

    while (*link != NULL) {
        struct somehash *node = *link;

        /* Compare the cached hashes first; only equal hashes need strcmp. */
        if (node->hash == wanted && strcmp(node->mem, str) == 0) {
            return link;
        }
        link = &node->next;
    }
    return link;
}

/*
 * Compute the hash of a string.
 *
 * str: characters to hash.
 * len: number of characters to consume; when 0, strlen(str) is used instead.
 *
 * Starting from 0, each character is folded into the running value by
 * XOR-ing in three shifted copies of that value, the character itself and
 * the constant 0x80001801. Returns the final running value.
 */
static unsigned some_hash(char *str, unsigned len)
{
    unsigned acc = 0;
    unsigned remaining = len ? len : (unsigned)strlen(str);
    char *cursor = str;

    while (remaining-- > 0) {
        unsigned mix = (acc >> 2) ^ (acc << 5) ^ (acc << 13);

        mix ^= *cursor++;
        mix ^= 0x80001801;
        acc ^= mix;
    }
    return acc;
}

这篇关于检测文件上使用C重复行的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

检测文件上使用C重复行 [英] Detecting duplicate lines on file using c

问题描述

相关文章

C/C++最新文章

热门教程

热门工具

登录关闭

检测文件上使用C重复行 [英] Detecting duplicate lines on file using c

问题描述

相关文章

C/C++最新文章

热门教程

热门工具

登录 关闭

登录关闭