Python和C之间不寻常的速度差异++ [英] Unusual Speed Difference between Python and C++

查看:115
本文介绍了Python和C之间不寻常的速度差异++的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我最近写了一个简短的算法来计算快乐号码的Python编写的。该计划允许你选择一个上限,这将决定所有快乐的数字下面。对于速度的比较,我决定把该算法我知道,从蟒蛇到C ++的最直接的翻译。

出人意料的是,C ++版本的运行速度比Python版本显著慢。执行时间之间的精确的速度测试发现前10,000个快乐的数字表示Python程序平均运行0.59秒,C ++版本,平均8.5秒运行。

我会认为这速度差的事实,我不得不写辅助函数的计算部分(例如确定如果一个元素是在一个列表/阵列/向量)在C ++版本,它已经内置于Python语言。

首先,这是真正的原因,这种荒谬的速度差异,其次,我怎样才能改变C ++版本执行比更快的Python版本(这应该是我的意见的方式)。

两件code,具有速度测试在这里: Python版本,的 C ++版本。感谢您的帮助。

 的#include<的iostream>
#包括<载体>
#包括<字符串>
#包括<的ctime>
#包括< WINDOWS.H>

使用名字空间std;

布尔inVector(INT inQuestion,矢量< INT>知道);
INT总和(矢量< INT>给出);
INT POW(INT给出,内部功率);
无效calcMain(INT上界);

诠释的main()
{
    而(真)
    {
    INT上界;
    COUT<< 选择一个上限:;
    CIN>>上界;
    长开始,结束;
    开始= GetTickCount的();
    calcMain(上界);
    结束=的GetTickCount();
    双秒=(双)(完启动)/ 1000.0;
    COUT<<秒-1;&其中; 秒。 << ENDL<< ENDL;
    }
    返回0;
}

无效calcMain(INT上界)
{
    矢量< int的>已知的;
    的for(int i = 0; I< =上界;我++)
    {
    布尔下一= FALSE;
    INT电流= I;
    矢量< int的>历史;
    而(!旁边)
    {
    字符*缓冲区=新的字符[10];
    itoa(电流,缓冲液,10);
    串位=缓冲区;
    删除缓存;
    矢量< int的>广场;
    对于(INT J = 0; J< digits.size(); J ++)
    {
    焦炭charDigit =位数[J]。
    INT位=的atoi(安培; charDigit);
    INT平方= POW(数字,2);
    squares.push_back(广场);
    }
    INT squaresum =总和(正方形);
    电流= squaresum;
    如果(inVector(当前,历史))
    {
    接下来= TRUE;
    如果(当前== 1)
    {
    known.push_back(ⅰ);
    // cout的<< I<< \ t的;
    }
    }
    history.push_back(电流);
    }
    }
    // cout的<< \ñ\ N的;
}

布尔inVector(INT inQuestion,矢量< INT>知道)
{
    对于(矢量< INT>:迭代它= known.begin(!);它= known.end();它++)
    如果(*它== inQuestion)
    返回true;
    返回false;
}

INT总和(矢量< INT>给出)
{
    INT总和= 0;
    对于(矢量< INT>:迭代它= given.begin(!);它= given.end();它++)
    总和+ = *它;
    返回总和;
}

INT POW(INT给出,INT功率)
{
    INT原=给定;
    INT电流=给定;
    的for(int i = 0; I<电力-1;我++)
    目前* =原件;
    返回电流;
}
 


 #!的/ usr /斌/包膜蟒蛇

进口timeit

上界= 0

高清calcMain():
    已知= []
    对于i在范围(0,上界+ 1):
        接下来=假
        电流= I
        历史= []
        而不是下一个:
            位数= STR(电流)
            正方形= [POW(INT(位),2),用于在数字数字]
            squaresum =总和(正方形)
            电流= squaresum
            如果目前的历史:
                接下来= TRUE
                如果目前的== 1:
                    known.append㈠
                    ##打印我,\ t的,
            history.append(电流)
    ##打印\ NEND

而真正的:
    上界=输入(选择一个上限:)
    结果= timeit.Timer(calcMain).timeit(1)
    打印结果,秒。\ N
 

解决方案

有关100000元,Python的code了6.9秒而C ++最初花了超过37秒。

我做了一些基本的优化您的code和设法让C ++ code以上比Python实现快100倍。现在确实在0.06秒100000元。这比原来的C ++ code快617倍。

最重要的是要编译发布时,所有的优化。这code是字面上的数量级慢在调试模式下。

接下来,我将解释我做了优化。

  • 感动循环之外的所有矢量声明;由一个清晰的()操作,这是比调用构造快得多取而代之。
  • 替换调用POW(价值2)通过乘法:值*值
  • 而不是有一个广场载体,并呼吁金额上,我总结只用一个整数就地值。
  • 避免的所有字符串操作,相比于整数运算,这是非常缓慢的。例如,有可能通过由10反复分割和提取所得到的值的模数10,代替的值的每个字符转换为字符串,然后回到INT来计算每个数字的平方。
  • 通过更换通过完全省去了辅助功能按值传递与引用传递,并最终避免的所有向量副本,第一。
  • 在淘汰了一些临时变量。
  • 而可能很多小细节我忘了。比较你的code和矿方并排看正是我所做的。

它可能会更受使用pre-分配的数组,而不是载体,优化了code,但是这将是一个多一点工作,我会离开它作为一个练习的读者。 :P

下面是优化code:

 的#include<的iostream>
#包括<载体>
#包括<字符串>
#包括<的ctime>
#包括<算法>
#包括< WINDOWS.H>

使用名字空间std;

无效calcMain(中​​间体上界,矢量< INT>&安培;已知的);

诠释的main()
{
    而(真)
    {
        矢量< int的>结果;
        INT上界;
        COUT<< 选择一个上限:;
        CIN>>上界;
        长开始,结束;
        开始= GetTickCount的();
        calcMain(上界,结果);
        结束=的GetTickCount();
        用于(为size_t I = 0; I< results.size(); ++ I){
            COUT<<结果[1]  - ;&其中; ,;
        }
        COUT<< ENDL;
        双秒=(双)(完启动)/ 1000.0;
        COUT<<秒-1;&其中; 秒。 << ENDL<< ENDL;
    }
    返回0;
}

无效calcMain(中​​间体上界,矢量< INT>&安培;已知的)
{
    矢量< int的>历史;
    的for(int i = 0; I< =上界;我++)
    {
        INT电流= I;
        history.clear();
        而(真)
        {
                INT TEMP =电流;
                INT总和= 0;
                而(温度大于0){
                    总和+ =(TEMP%10)*(TEMP%10);
                    温度/ = 10;
                }
                电流=总和;
                如果(找到(history.begin(),history.end(),电流)!= history.end())
                {
                        如果(当前== 1)
                        {
                                known.push_back(ⅰ);
                        }
                        打破;
                }
                history.push_back(电流);
        }
    }
}
 

I recently wrote a short algorithm to calculate happy numbers in python. The program allows you to pick an upper bound and it will determine all the happy numbers below it. For a speed comparison I decided to make the most direct translation of the algorithm I knew of from python to c++.

Surprisingly, the c++ version runs significantly slower than the python version. Accurate speed tests between the execution times for discovering the first 10,000 happy numbers indicate the python program runs on average in 0.59 seconds and the c++ version runs on average in 8.5 seconds.

I would attribute this speed difference to the fact that I had to write helper functions for parts of the calculations (for example determining if an element is in a list/array/vector) in the c++ version which were already built in to the python language.

Firstly, is this the true reason for such an absurd speed difference, and secondly, how can I change the c++ version to execute more quickly than the python version (the way it should be in my opinion).

The two pieces of code, with speed testing are here: Python Version, C++ Version. Thanks for the help.

#include <iostream>
#include <vector>
#include <string>
#include <ctime>
#include <windows.h>

using namespace std;

bool inVector(int inQuestion, vector<int> known);
int sum(vector<int> given);
int pow(int given, int power);
void calcMain(int upperBound);

int main()
{
    while(true)
    {
    	int upperBound;
    	cout << "Pick an upper bound: ";
    	cin >> upperBound;
    	long start, end;
    	start = GetTickCount();
    	calcMain(upperBound);
    	end = GetTickCount();
    	double seconds = (double)(end-start) / 1000.0;
    	cout << seconds << " seconds." << endl << endl;
    }
    return 0;
}

void calcMain(int upperBound)
{
    vector<int> known;
    for(int i = 0; i <= upperBound; i++)
    {
    	bool next = false;
    	int current = i;
    	vector<int> history;
    	while(!next)
    	{
    		char* buffer = new char[10];
    		itoa(current, buffer, 10);
    		string digits = buffer;
    		delete buffer;
    		vector<int> squares;
    		for(int j = 0; j < digits.size(); j++)
    		{
    			char charDigit = digits[j];
    			int digit = atoi(&charDigit);
    			int square = pow(digit, 2);
    			squares.push_back(square);
    		}
    		int squaresum = sum(squares);
    		current = squaresum;
    		if(inVector(current, history))
    		{
    			next = true;
    			if(current == 1)
    			{
    				known.push_back(i);
    				//cout << i << "\t";
    			}
    		}
    		history.push_back(current);
    	}
    }
    //cout << "\n\n";
}

bool inVector(int inQuestion, vector<int> known)
{
    for(vector<int>::iterator it = known.begin(); it != known.end(); it++)
    	if(*it == inQuestion)
    		return true;
    return false;
}

int sum(vector<int> given)
{
    int sum = 0;
    for(vector<int>::iterator it = given.begin(); it != given.end(); it++)
    	sum += *it;
    return sum;
}

int pow(int given, int power)
{
    int original = given;
    int current = given;
    for(int i = 0; i < power-1; i++)
    	current *= original;
    return current;
}


#!/usr/bin/env python

import timeit

upperBound = 0

def calcMain():
    known = []
    for i in range(0,upperBound+1):
        next = False
        current = i
        history = []
        while not next:
            digits = str(current)
            squares = [pow(int(digit), 2) for digit in digits]
            squaresum = sum(squares)
            current = squaresum
            if current in history:
                next = True
                if current == 1:
                    known.append(i)
                    ##print i, "\t",
            history.append(current)
    ##print "\nend"

while True:    
    upperBound = input("Pick an upper bound: ")
    result = timeit.Timer(calcMain).timeit(1)
    print result, "seconds.\n"

解决方案

For 100000 elements, the Python code took 6.9 seconds while the C++ originally took above 37 seconds.

I did some basic optimizations on your code and managed to get the C++ code above 100 times faster than the Python implementation. It now does 100000 elements in 0.06 seconds. That is 617 times faster than the original C++ code.

The most important thing is to compile in Release mode, with all optimizations. This code is literally orders of magnitude slower in Debug mode.

Next, I will explain the optimizations I did.

  • Moved all vector declarations outside of the loop; replaced them by a clear() operation, which is much faster than calling the constructor.
  • Replaced the call to pow(value, 2) by a multiplication : value * value.
  • Instead of having a squares vector and calling sum on it, I sum the values in-place using just an integer.
  • Avoided all string operations, which are very slow compared to integer operations. For instance, it is possible to compute the squares of each digit by repeatedly dividing by 10 and fetching the modulus 10 of the resulting value, instead of converting the value to a string and then each character back to int.
  • Avoided all vector copies, first by replacing passing by value with passing by reference, and finally by eliminating the helper functions completely.
  • Eliminated a few temporary variables.
  • And probably many small details I forgot. Compare your code and mine side-by-side to see exactly what I did.

It may be possible to optimize the code even more by using pre-allocated arrays instead of vectors, but this would be a bit more work and I'll leave it as an exercise to the reader. :P

Here's the optimized code :

#include <iostream>
#include <vector>
#include <string>
#include <ctime>
#include <algorithm>
#include <windows.h>

using namespace std;

void calcMain(int upperBound, vector<int>& known);

int main()
{
    while(true)
    {
        vector<int> results;
        int upperBound;
        cout << "Pick an upper bound: ";
        cin >> upperBound;
        long start, end;
        start = GetTickCount();
        calcMain(upperBound, results);
        end = GetTickCount();
        for (size_t i = 0; i < results.size(); ++i) {
            cout << results[i] << ", ";
        }
        cout << endl;
        double seconds = (double)(end-start) / 1000.0;
        cout << seconds << " seconds." << endl << endl;
    }
    return 0;
}

void calcMain(int upperBound, vector<int>& known)
{
    vector<int> history;
    for(int i = 0; i <= upperBound; i++)
    {
        int current = i;
        history.clear();
        while(true)
        {
                int temp = current;
                int sum = 0;
                while (temp > 0) {
                    sum += (temp % 10) * (temp % 10);
                    temp /= 10;
                }
                current = sum;
                if(find(history.begin(), history.end(), current) != history.end())
                {
                        if(current == 1)
                        {
                                known.push_back(i);
                        }
                        break;
                }
                history.push_back(current);
        }
    }
}

这篇关于Python和C之间不寻常的速度差异++的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆