递归下降解析器 [英] Recursive Descent Parser

查看:112
本文介绍了递归下降解析器的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

这本书现代编译器设计是关于编译器的好书。但在它的源代码的东西,令人讨厌的是AST或抽象语法Tree.Suppose我们想写一个括号表达式解析器,它解析类似:((2 + 3)* 4)* 2!这本书说我们有一个AST喜欢:

 ((2 + 3)* 4)* 2 
/ | \
(2 + 3)* 4 * 2
/ | \
(2 + 3)* 4
/ | \
2 + 3

因此,我应该在内存中保存树还是使用递归调用;注意如果我不将它存储在内存中,如何将它转换为机器码?
解析器代码:

  int parse(Expression& expr)
{
if token.class =='D')
{
expr.type ='D';
expr.value = token.val-'0';
get_next_token();
return 1;
}
if(token.class =='('){
expr.type ='P';
get_next_token();
parse(& expr - > left);
parse_operator(& expr-> op);
parse(& expr-> right);
if(token.class! )错误(missing));
get_next_token();
return 1;
}
return 0;
}

语法是:

  expr  - > expr | (expr op expr)
digit - > 0 | 1 | 2 .... | 9
op - > + | *


解决方案

可以直接生成所需的输出代码。存储中间形式通常是为了能够在生成输出之前对更高级别的代码进行一些处理。



在你的情况下,例如,发现您的表达式不包含变量,因此结果是一个固定数字。一次只查看一个节点,然而这是不可能的。更明确的是,如果在查看2 *之后生成用于计算双精度值的机器代码,当其他部分为例如3时,该代码是浪费的,因为您的程序将计算3,然后计算每次 的两倍,而加载6将是等效的,但是更短和更快。



如果要生成机器代码,你需要首先知道什么样的机器代码将被生成...最简单的模型是基于堆栈的方法。在这种情况下,您不需要寄存器分配逻辑,并且很容易直接编译为没有中间表示的机器代码。考虑这个小例子只处理整数,四个操作,一元否定和变量...你会注意到,没有使用数据结构:读取源代码字符和机器指令写入输出...

  #include< stdio.h> 
#include< stdlib.h>

void error(const char * what)
{
fprintf(stderr,ERROR:%s\\\
,what)
exit(1);
}

void compileLiteral(const char *& s)
{
int v = 0;
while(* s> ='0'&& * s< ='9')
{
v = v * 10 + * s ++ - '0'
}
printf(mov eax,%i \\\
,v);
}

void compileSymbol(const char *& s)
{
printf(mov eax,dword ptr);
while((* s> ='a'&& * s" ='z')||
(* s> ='A'&& ; ='Z')||
(* s> ='0'&& * s'='9')||

{
putchar(* s ++);
}
printf(\\\
);
}

void compileExpression(const char *&);

void compileTerm(const char *& s)
{
if(* s> ='0'&& * s< ='9') {
// Number
compileLiteral(s);
} else if((* s> ='a'&& * s< ='z')||
(* s> ='A'&& s <='Z')||
(* s =='_')){
// Variable
compileSymbol(s)
} else if(* s ==' - '){
//一元否定
s ++;
compileTerm(s);
printf(neg eax\\\
);
} else if(* s =='('){
//带括号的子表达式
s ++;
compileExpression(s);
if !=')')
error(')'expected);
s ++;
} else {
错误(语法错误);
}
}

void compileMulDiv(const char *& s)
{
compileTerm(s);
for(;;)
{
if(* s =='*'){
s ++;
printf(push eax\\\
);
compileTerm(s);
printf(mov ebx,eax\\\
);
printf(pop eax\\\
);
printf(imul ebx\\\
);
} else if(* s =='/'){
s ++;
printf(push eax\\\
);
compileTerm(s);
printf(mov ebx,eax\\\
);
printf(pop eax\\\
);
printf(idiv ebx\\\
);
} else break;
}
}

void compileAddSub(const char *& s)
{
compileMulDiv(s);
for(;;)
{
if(* s =='+'){
s ++;
printf(push eax\\\
);
compileMulDiv(s);
printf(mov ebx,eax\\\
);
printf(pop eax\\\
);
printf(add ebx\\\
);
} else if(* s ==' - '){
s ++;
printf(push eax \\\
);
compileMulDiv(s);
printf(mov ebx,eax\\\
);
printf(pop eax\\\
);
printf(sub ebx\\\
);
} else break;
}
}

void compileExpression(const char *& s)
{
compileAddSub
}

int main(int argc,const char * argv [])
{
if(argc!= 2) < expr> \\\
);
compileExpression(argv [1]);
return 0;
}



例如运行编译器 1 + y * (-3 + x)作为输出。

  mov eax,1 
push eax
mov eax,dword ptr y
push eax
mov eax,3
neg eax
push eax
mov eax,dword ptr x
mov ebx,eax
pop eax
add ebx
mov ebx,eax
pop eax
imul ebx
mov ebx,eax $ b $然而,编写编译器的这种方法不能很好地扩展到编译器,因为编译器可以编译为一个优化编译器。



虽然可以通过在输出阶段添加一个窥视孔优化器来获得一些优化,但许多有用的优化只能查看更高



即使是裸机代码生成也可以通过查看更多代码来获益,例如决定哪个寄存器分配给什么或决定哪个可能的汇编器



例如,同一个表达式可以由优化编译器编译为

  mov eax,dword ptr x 
sub eax,3
imul dword ptr y
inc eax


The book 'Modern Compiler Design' Is the nice book about compilers. but in it's source code something that is annoying me is AST or Abstract Syntax Tree.Suppose we want to write a parenthesized expression parser which parses something like : ((2+3)*4) * 2 ! The book says that we have a AST Like :

        ((2+3)*4) * 2
        /   |   \
    (2+3)*4     *     2
        / |  \
   (2+3)  *   4
   / | \
  2  + 3

So Should I save a tree in memory or just use recursive calls;Note If I don't store it in memory , How Can I convert it to machine code ? Parser code :

int parse(Expression &expr)
{
 if(token.class=='D')
  { 
    expr.type='D';
    expr.value=token.val-'0';
    get_next_token();
    return 1;
  }
 if(token.class=='('){
   expr.type='P';
   get_next_token();
   parse(&expr->left);
   parse_operator(&expr->op);
   parse(&expr->right);
 if(token.class!=')')Error("missing )");
 get_next_token();
 return 1;
 }
 return 0;
}

Grammar Is :

expr -> expr | (expr op expr)
digit   -> 0|1|2....|9
op  -> +|*

解决方案

You can store the tree in memory or you can directly produce the required output code. Storing the intermediate form is normally done to be able to do some processing on the code at an higher level before generating output.

In your case for example it would be simple to discover that your expression contains no variables and therefore the result is a fixed number. Looking only at one node at a time this however is not possible. To be more explicit if after looking at "2*" you generate machine code for computing the double of something this code is sort of wasted when the other part is for example "3" because your program will compute "3" and then compute the double of that every time while just loading "6" would be equivalent but shorter and faster.

If you want to generate the machine code then you need first to know for what kind of machine the code is going to be generated... the simplest model is a stack-based approach. In this case you need no register allocation logic and it's easy to compile directly to machine code without the intermediate representation. Consider this small example that handles just integers, four operations, unary negation and variables... you will notice that no data structure is used at all: source code characters are read and machine instructions are written to output...

#include <stdio.h>
#include <stdlib.h>

void error(const char *what)
{
    fprintf(stderr, "ERROR: %s\n", what);
    exit(1);
}

void compileLiteral(const char *& s)
{
    int v = 0;
    while (*s >= '0' && *s <= '9')
    {
        v = v*10 + *s++ - '0';
    }
    printf("    mov  eax, %i\n", v);
}

void compileSymbol(const char *& s)
{
    printf("    mov  eax, dword ptr ");
    while ((*s >= 'a' && *s <= 'z') ||
           (*s >= 'A' && *s <= 'Z') ||
           (*s >= '0' && *s <= '9') ||
           (*s == '_'))
    {
        putchar(*s++);
    }
    printf("\n");
}

void compileExpression(const char *&);

void compileTerm(const char *& s)
{
    if (*s >= '0' && *s <= '9') {
        // Number
        compileLiteral(s);
    } else if ((*s >= 'a' && *s <= 'z') ||
               (*s >= 'A' && *s <= 'Z') ||
               (*s == '_')) {
        // Variable
        compileSymbol(s);
    } else if (*s == '-') {
        // Unary negation
        s++;
        compileTerm(s);
        printf("    neg  eax\n");
    } else if (*s == '(') {
        // Parenthesized sub-expression
        s++;
        compileExpression(s);
        if (*s != ')')
            error("')' expected");
        s++;
    } else {
        error("Syntax error");
    }
}

void compileMulDiv(const char *& s)
{
    compileTerm(s);
    for (;;)
    {
        if (*s == '*') {
            s++;
            printf("    push eax\n");
            compileTerm(s);
            printf("    mov  ebx, eax\n");
            printf("    pop  eax\n");
            printf("    imul ebx\n");
        } else if (*s == '/') {
            s++;
            printf("    push eax\n");
            compileTerm(s);
            printf("    mov  ebx, eax\n");
            printf("    pop  eax\n");
            printf("    idiv ebx\n");
        } else break;
    }
}

void compileAddSub(const char *& s)
{
    compileMulDiv(s);
    for (;;)
    {
        if (*s == '+') {
            s++;
            printf("    push eax\n");
            compileMulDiv(s);
            printf("    mov  ebx, eax\n");
            printf("    pop  eax\n");
            printf("    add  ebx\n");
        } else if (*s == '-') {
            s++;
            printf("    push eax\n");
            compileMulDiv(s);
            printf("    mov  ebx, eax\n");
            printf("    pop  eax\n");
            printf("    sub  ebx\n");
        } else break;
    }
}

void compileExpression(const char *& s)
{
    compileAddSub(s);
}

int main(int argc, const char *argv[])
{
    if (argc != 2) error("Syntax: simple-compiler <expr>\n");
    compileExpression(argv[1]);
    return 0;
}

For example running the compiler with 1+y*(-3+x) as input you get as output

mov  eax, 1
push eax
mov  eax, dword ptr y
push eax
mov  eax, 3
neg  eax
push eax
mov  eax, dword ptr x
mov  ebx, eax
pop  eax
add  ebx
mov  ebx, eax
pop  eax
imul ebx
mov  ebx, eax
pop  eax
add  ebx

However this approach of writing compilers doesn't scale well to an optimizing compiler.

While it's possible to get some optimization by adding a "peephole" optimizer in the output stage, many useful optimizations are possible only looking at code from an higher point of view.

Also even the bare machine code generation could benefit by seeing more code, for example to decide which register assign to what or to decide which of the possible assembler implementations would be convenient for a specific code pattern.

For example the same expression could be compiled by an optimizing compiler to

mov  eax, dword ptr x
sub  eax, 3
imul dword ptr y
inc  eax

这篇关于递归下降解析器的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆