特里语中的区分词 [英] Differentiating words in trie

查看:76
本文介绍了特里语中的区分词的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我的特里里有一个单词 all和一个单词 alter,但 alt不是一个单词。但是当我检查 alt时,它仍然返回true,因为is_word为true,因为 all是一个单词。

I have a word "all" in my trie and a word "alter" but "alt" is not a word in the trie. But when I check for "alt" it still returns true because is_word is true as "all" was a word. How am is supposed to work this error.

//Here's the code
typedef struct node{
  bool is_word;    
  struct node *children[27];
} node;

unsigned int wsize=0;
node * root;

bool check(const char* word)
{
    // TODO
    node *chrawler=root;   
    for(int i=0;i<strlen(word)-1;i++)
    {
        int t;
        if(word[i]>=65&&word[i]<=90)
        {        
            t=word[i]-'A';
        }
        else if(isalpha(word[i]))
            t=word[i]-'a';
        else
            t=26;

        if(chrawler->children[t]==NULL)
            return false;
        else
            chrawler=chrawler->children[t];
    }

    if(chrawler->is_word)
        return true;
    return false;    

}

// Load function
bool load(const char* dictionary)
{
    // TODO

    FILE *inptr=fopen(dictionary,"r");
    if(inptr==NULL)
    {
        return false;
    }

    node *new_node=malloc(sizeof(node));
    root=new_node;

    char * word=malloc((LENGTH+1)*sizeof(char));
    int index=0;
    for(int c=fgetc(inptr);c!=EOF;c=fgetc(inptr))
    {
        char ch=c;
        if(ch=='\n')
        { 
            word[index]='\0';
            index=0;
            node *chrawler=root;
            for(int i=1;i<strlen(word);i++)
            {
                    int t;
                    if(isalpha(word[i-1]))
                        t=word[i-1]-'a';
                    else
                        t=26;
                    if(chrawler->children[t]==NULL)
                    {
                        node *new_node=malloc(sizeof(node));
                        chrawler->children[t]=new_node;

                        chrawler=chrawler->children[t];
                    }
                    else
                        chrawler=chrawler->children[t];
            }
            chrawler->is_word=1;
            wsize++;

        }
        else
        {
            word[index]=ch;
            index++;
        }

    }

    return true;
}


推荐答案

您需要确保所有新节点中的指针为空,并将 is_word 的值设置为 false 。也许最容易的方法是使用 calloc()分配空间。创建一个用于分配和错误检查节点分配的函数将使其变得更加容易。同样,您有两个代码块将字符映射到特里索引。

You need to ensure that all the pointers in a new node are null, as well as setting the is_word value to false. This is, perhaps, most easily done by using calloc() to allocate the space. Creating a function to allocate and error check the allocation of a node makes it easier. Similarly, you have two blocks of code mapping characters to trie indexes. You should use functions — even small ones — more generously.

对于一行数据来说,逐个字符输入也不是必需的。最好使用 fgets()读取行。

The character-by-character input for a line of data is not really necessary, either; it would be better to use fgets() to read lines.

添加这些内容并进行其他更改(例如,本地数组 word 而不是动态分配的数组(未释放的数组;完成后关闭文件;等等)给出MCVE(最小,完整,可验证的示例),如下所示:

Adding these and sundry other changes (for example, local array word instead of dynamically allocated array — which wasn't freed; closing the file when finished; etc.) gives an MCVE (Minimal, Complete, Verifiable Example) like this:

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum { LENGTH = 256 };

// Here's the code
typedef struct node
{
    bool is_word;
    struct node *children[27];
} node;

unsigned int wsize = 0;
node *root;

static inline int map_char(unsigned char c)
{
    int t;
    if (isalpha(c))
        t = tolower(c) - 'a';
    else
        t = 26;
    return t;
}

static inline node *alloc_node(void)
{
    node *new_node = calloc(1, sizeof(node));
    if (new_node == 0)
    {
        fprintf(stderr, "Memory allocation failed in %s\n", __func__);
        exit(1);
    }
    return new_node;
}

static bool check(const char *word)
{
    node *chrawler = root;
    int len = strlen(word);
    for (int i = 0; i < len; i++)
    {
        int t = map_char(word[i]);
        if (chrawler->children[t] == NULL)
            return false;
        else
            chrawler = chrawler->children[t];
    }

    return chrawler->is_word;
}

// Load function
static bool load(const char *dictionary)
{
    FILE *inptr = fopen(dictionary, "r");
    if (inptr == NULL)
    {
        fprintf(stderr, "Failed to open file '%s' for reading\n", dictionary);
        return false;
    }

    root = alloc_node();

    char word[LENGTH];
    while (fgets(word, sizeof(word), inptr) != 0)
    {
        word[strcspn(word, "\n")] = '\0';
        printf("[%s]\n", word);
        node *chrawler = root;
        int len = strlen(word);
        for (int i = 0; i < len; i++)
        {
            int t = map_char(word[i]);
            //printf("t = %d (%c)\n", t, word[i]);
            if (chrawler->children[t] == NULL)
                chrawler->children[t] = alloc_node();
            chrawler = chrawler->children[t];
        }
        chrawler->is_word = 1;
        wsize++;
    }
    printf("%d words read from %s\n", wsize, dictionary);
    fclose(inptr);

    return true;
}

int main(void)
{
    const char *wordfile = "words.txt";
    if (load(wordfile))
    {
        char line[4096];
        while (fgets(line, sizeof(line), stdin) != 0)
        {
            line[strcspn(line, "\n")] = '\0';
            if (check(line))
                printf("[%s] is a word\n", line);
            else
                printf("[%s] is unknown\n", line);
        }
    }
    return 0;
}

还应该进行其他更改。例如,应将
wsize 变量设为非全局变量;在 load()函数之外并没有真正使用它。根节点也不应该是全局的,这很容易争论。 load()函数应返回根节点,而 check()函数应被传递根节点。通常,应尽可能避免使用全局变量,并且通常是可能的。

There are other changes that should be made. For example, the wsize variable should be made non-global; it isn't really used outside the load() function. It's easily arguable that the root node should not be global either; the load() function should return the root node, and the check() function should be passed the root node. In general, global variables should be avoided when possible, and it is usually possible.

给出文件 words.txt 包含:

abelone
abyssinia
archimedes
brachiosaurus
triceratops
all
alter
asparagus
watchamacallit
a
abracadabra
abyss
ant

该程序运行的输出为:

[abelone]
[abyssinia]
[archimedes]
[brachiosaurus]
[triceratops]
[all]
[alter]
[asparagus]
[watchamacallit]
[a]
[abracadabra]
[abyss]
[ant]
13 words read from words.txt
a
[a] is a word
ab
[ab] is unknown
al
[al] is unknown
all
[all] is a word
alt
[alt] is unknown
alte
[alte] is unknown
alter
[alter] is a word
triceratops
[triceratops] is a word
brachiosaurus
[brachiosaurus] is a word
abys
[abys] is unknown
abbey
[abbey] is unknown
abyss
[abyss] is a word
ant
[ant] is a word
a
[a] is a word
archimedes
[archimedes] is a word

这篇关于特里语中的区分词的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆