虚拟机/解释器的性能改进策略？ [英] Performance improvement strategies for VM / interpreter?

查看：111 发布时间：2016/8/21 20:43:34 c++ c performance interpreter vm-implementation

本文介绍了“虚拟机/解释器的性能改进策略？”的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我用 C 语言编写了一个简单的虚拟机，指令分派只用了一个简单的 switch，完全没有任何指令译码，但性能非常糟糕。

对于简单的算术运算，该虚拟机比执行相同运算的本地 C 代码慢约 4000 倍。我用一组长度为 1000 万的数组进行了测试：第一个数组存放程序指令（随机的 + - * / 运算），两个数组存放随机整数，还有一个数组用于存放运算结果。

我原本预计算术性能会下降 3～4 倍，所以约 4000 倍的差距真的把我吓到了。即使是最慢的解释型语言似乎也能提供更高的性能。那么，我的方法到底哪里出了问题？在不借助 JIT 编译成机器码的前提下，我该如何提高性能？

实现……基本上是我能想到的最简单的方式：

begin:
    {
        switch (*(op+(c++)))
        {
        case 0:
            add(in1+c, in2+c, out+c); goto begin;

        case 1:
            sub(in1+c, in2+c, out+c); goto begin;

        case 2:
            mul(in1+c, in2+c, out+c); goto begin;

        case 3:
            div(in1+c, in2+c, out+c); goto begin;

        case 4:
            cout << "end of program" << endl;
            goto end;

        default:
            cout << "ERROR!!!" << endl;

        }
    }

end:

更新：
我在调整程序长度时，注意到我用来做性能分析的 QElapsedTimer 实际上是有问题的。现在我改用 <ctime> 中的 clock() 函数，根据它的测量结果，computed goto 版本实际上与本地代码的运行速度相当，也许稍慢一点。这个结果可信吗？下面是完整的源代码（我知道它很丑，毕竟只是用于测试）：

#include <QtGlobal>
#include <iostream>
#include <stdio.h>
#include <ctime>

using namespace std;

#define LENGTH 70000000

void add(int & a, int & b, int & r) {r = a * b;}
void sub(int & a, int & b, int & r) {r = a - b;}
void mul(int & a, int & b, int & r) {r = a * b;}
void div(int & a, int & b, int & r) {r = a / b;}

int main()
{
    char * op = new char[LENGTH];
    int * in1 = new int[LENGTH];
    int * in2 = new int[LENGTH];
    int * out = new int[LENGTH];

    for (int i = 0; i < LENGTH; ++i)
    {
        *(op+i) = i % 4;
        *(in1+i) = qrand();
        *(in2+i) = qrand()+1;
    }

    *(op+LENGTH-1) = 4; // end of program

    long long  sClock, fClock;

    unsigned int c = 0;
    sClock = clock();

    cout << "Program begins" << endl;

    static void* table[] = {
        &&do_add,
        &&do_sub,
        &&do_mul,
        &&do_div,
        &&do_end,
        &&do_err,
        &&do_fin};

#define jump() goto *table[op[c++]]

    jump();
do_add:
    add(in1[c], in2[c], out[c]); jump();
do_sub:
    sub(in1[c], in2[c], out[c]); jump();
do_mul:
    mul(in1[c], in2[c], out[c]); jump();
do_div:
    div(in1[c], in2[c], out[c]); jump();
do_end:
    cout << "end of program" << endl; goto *table[6];
do_err:
    cout << "ERROR!!!" << endl; goto *table[6];
do_fin:

    fClock = clock();
    cout << fClock - sClock << endl;

    delete [] op;
    delete [] in1;
    delete [] in2;
    delete [] out;

    in1 = new int[LENGTH];
    in2 = new int[LENGTH];
    out = new int[LENGTH];

    for (int i = 0; i < LENGTH; ++i)
    {
        *(in1+i) = qrand();
        *(in2+i) = qrand()+1;
    }

    cout << "Native begins" << endl;

    sClock = clock();

    for (int i = 0; i < LENGTH; i += 4)
    {

        *(out+i) = *(in1+i) + *(in2+i);
        *(out+i+1) = *(in1+i+1) - *(in2+i+1);
        *(out+i+2) = *(in1+i+2) * *(in2+i+2);
        *(out+i+3) = *(in1+i+3) / *(in2+i+3);
    }

    fClock = clock();
    cout << fClock - sClock << endl;

    delete [] in1;
    delete [] in2;
    delete [] out;

    return 0;
}

解决方案

Darek Mihocka 写过一篇关于用可移植的 C 语言创建快速解释器的深入好文：http://www.emulators.com/docs/nx25_nostradamus.htm

I have written a simple VM in C, using a simple switch of instructions, without any instruction decoding whatsoever, but performance is terrible.

For simple arithmetic operations the VM is about 4000 times slower than native C code for the same operations. I tested with a group of arrays of length 10 million, the first consisting of the program instructions, random + - * / operations, 2 arrays holding random integers and the third array being the operation target storage.

I was expecting to see a 3-4 times drop in arithmetic performance, so that ~4000x really blew me away. Even the slowest interpreted languages seem to offer higher performance. So where am I going wrong with my approach, and how can I improve performance without resorting to JIT compilation to machine code?

The implementation is... basically the simplest I could come up with:

// Interpreter dispatch loop: every arithmetic handler jumps back to `begin`
// to fetch the next opcode.
begin:
    {
        // Read the opcode at index c, then advance c. Because of the
        // post-increment, the handlers below see the already-incremented c.
        switch (*(op+(c++)))
        {
        case 0:
            // Handlers receive the addresses of the operand/result slots at index c.
            add(in1+c, in2+c, out+c); goto begin;

        case 1:
            sub(in1+c, in2+c, out+c); goto begin;

        case 2:
            mul(in1+c, in2+c, out+c); goto begin;

        case 3:
            div(in1+c, in2+c, out+c); goto begin;

        case 4:
            // Opcode 4 ends the program: announce it and leave the loop.
            cout << "end of program" << endl;
            goto end;

        default:
            // Any other opcode is reported as an error; control then falls
            // out of the switch and reaches `end` without looping again.
            cout << "ERROR!!!" << endl;

        }
    }

end:

UPDATE: I was toying with the length of the program when I noticed the QElapsedTimer I was using to profile was actually broken. Now I am using the clock() function from <ctime>, and according to it the computed goto is actually running on par with the native code, maybe a tad lower. Is that result legit??? Here is the full source (it is ugly I know, it's just for testing after all):

#include <QtGlobal>
#include <iostream>
#include <stdio.h>
#include <ctime>

using namespace std;

#define LENGTH 70000000

/*
 * VM "add" handler: stores the sum of a and b in r.
 *
 * a, b  operands (read only)
 * r     receives a + b
 *
 * The previous body multiplied the operands, making this an accidental
 * duplicate of mul().
 */
void add(int & a, int & b, int & r) {r = a + b;}
/* VM "sub" handler: writes the difference a - b into r. */
void sub(int & a, int & b, int & r)
{
    const int difference = a - b;
    r = difference;
}
/* VM "mul" handler: writes the product a * b into r. */
void mul(int & a, int & b, int & r)
{
    const int product = a * b;
    r = product;
}
/*
 * VM "div" handler: writes the integer quotient a / b into r.
 * The divisor is not checked here; callers must pass a non-zero b.
 */
void div(int & a, int & b, int & r)
{
    const int quotient = a / b;
    r = quotient;
}

/*
 * Benchmark driver.
 *
 * 1. Builds a program of LENGTH opcodes (cycling add, sub, mul, div, with an
 *    "end" opcode in the last slot) plus two operand arrays, runs it through
 *    a computed-goto interpreter and prints the elapsed clock() ticks.
 * 2. Performs the same add/sub/mul/div mix in a plain loop over fresh
 *    operand arrays and prints the elapsed clock() ticks.
 *
 * Returns 0.
 *
 * NOTE(review): `out` is never read after either run, so an optimizing
 * compiler may be free to drop some of the measured work; confirm by
 * inspecting the generated code before trusting the numbers.
 */
int main()
{
    char * op = new char[LENGTH];
    int * in1 = new int[LENGTH];
    int * in2 = new int[LENGTH];
    int * out = new int[LENGTH];

    // Opcodes cycle through 0..3. The +1 on in2 is presumably there to keep
    // the divisor non-zero when qrand() yields 0.
    for (int i = 0; i < LENGTH; ++i)
    {
        op[i] = i % 4;
        in1[i] = qrand();
        in2[i] = qrand() + 1;
    }

    op[LENGTH - 1] = 4; // end of program

    long long sClock, fClock;

    unsigned int c = 0;

    cout << "Program begins" << endl;

    // Start the timer only after the banner is written, exactly as the native
    // section below does, so neither measurement includes the banner output.
    sClock = clock();

    // Handler addresses indexed by opcode (GNU "labels as values" extension).
    static void* table[] = {
        &&do_add,
        &&do_sub,
        &&do_mul,
        &&do_div,
        &&do_end,
        &&do_err,
        &&do_fin};

// Dispatch to the handler for the opcode at the current index c. The index is
// advanced by each handler AFTER it has used c, so opcode i operates on
// operands i (previously the post-increment here made it use operands i+1 and
// left out[0] unwritten).
#define jump() goto *table[op[c]]

    jump();
do_add:
    add(in1[c], in2[c], out[c]); ++c; jump();
do_sub:
    sub(in1[c], in2[c], out[c]); ++c; jump();
do_mul:
    mul(in1[c], in2[c], out[c]); ++c; jump();
do_div:
    div(in1[c], in2[c], out[c]); ++c; jump();
do_end:
    cout << "end of program" << endl; goto do_fin;
do_err:
    cout << "ERROR!!!" << endl; goto do_fin;
do_fin:

    fClock = clock();
    cout << fClock - sClock << endl;

    delete [] op;
    delete [] in1;
    delete [] in2;
    delete [] out;

    // Fresh operand and result arrays for the native run.
    in1 = new int[LENGTH];
    in2 = new int[LENGTH];
    out = new int[LENGTH];

    for (int i = 0; i < LENGTH; ++i)
    {
        in1[i] = qrand();
        in2[i] = qrand() + 1;
    }

    cout << "Native begins" << endl;

    sClock = clock();

    // Same operation mix as the interpreted program, four elements per pass.
    for (int i = 0; i < LENGTH; i += 4)
    {
        out[i]     = in1[i]     + in2[i];
        out[i + 1] = in1[i + 1] - in2[i + 1];
        out[i + 2] = in1[i + 2] * in2[i + 2];
        out[i + 3] = in1[i + 3] / in2[i + 3];
    }

    fClock = clock();
    cout << fClock - sClock << endl;

    delete [] in1;
    delete [] in2;
    delete [] out;

    return 0;
}

解决方案

Darek Mihocka has a good and in-depth writeup on creating fast interpreters in portable C: http://www.emulators.com/docs/nx25_nostradamus.htm

这篇关于“虚拟机/解释器的性能改进策略？”的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

虚拟机/解释器的性能改进策略？ [英] Performance improvement strategies for VM / interpreter?

问题描述

相关文章

C/C++开发最新文章

热门教程

热门工具

登录关闭

虚拟机/解释器的性能改进策略？ [英] Performance improvement strategies for VM / interpreter?

问题描述

相关文章

C/C++开发最新文章

热门教程

热门工具

登录 关闭

登录关闭