Python和C ++之间的exception速度差异

我最近写了一个简短的algorithm来计算python中的快乐数字 。 该程序允许你select一个上限,它将确定下面所有的幸福数字。 对于速度比较,我决定将我从Python知道的algorithm的最直接的翻译成C ++。

令人惊讶的是,c ++版本的运行速度比Python版本慢得多。 在发现前10000个快乐数字的执行时间之间进行精确的速度testing表明,python程序平均在0.59秒内运行,而c ++版本平均在8.5秒内运行。

我将这种速度差异归结为这样一个事实,即我必须在已经内置到python语言的c ++版本中为部分计算编写助手函数(例如,确定元素是否在列表/数组/向量中) 。

首先,这是这样一个荒谬的速度差异的真正原因,其次,我怎么能改变c + +版本执行比python版本更快(应该在我看来)。

这两个代码,速度testing在这里: Python版本 , C ++版本 。 谢谢您的帮助。

#include <iostream> #include <vector> #include <string> #include <ctime> #include <windows.h> using namespace std; bool inVector(int inQuestion, vector<int> known); int sum(vector<int> given); int pow(int given, int power); void calcMain(int upperBound); int main() { while(true) { int upperBound; cout << "Pick an upper bound: "; cin >> upperBound; long start, end; start = GetTickCount(); calcMain(upperBound); end = GetTickCount(); double seconds = (double)(end-start) / 1000.0; cout << seconds << " seconds." << endl << endl; } return 0; } void calcMain(int upperBound) { vector<int> known; for(int i = 0; i <= upperBound; i++) { bool next = false; int current = i; vector<int> history; while(!next) { char* buffer = new char[10]; itoa(current, buffer, 10); string digits = buffer; delete buffer; vector<int> squares; for(int j = 0; j < digits.size(); j++) { char charDigit = digits[j]; int digit = atoi(&charDigit); int square = pow(digit, 2); squares.push_back(square); } int squaresum = sum(squares); current = squaresum; if(inVector(current, history)) { next = true; if(current == 1) { known.push_back(i); //cout << i << "\t"; } } history.push_back(current); } } //cout << "\n\n"; } bool inVector(int inQuestion, vector<int> known) { for(vector<int>::iterator it = known.begin(); it != known.end(); it++) if(*it == inQuestion) return true; return false; } int sum(vector<int> given) { int sum = 0; for(vector<int>::iterator it = given.begin(); it != given.end(); it++) sum += *it; return sum; } int pow(int given, int power) { int original = given; int current = given; for(int i = 0; i < power-1; i++) current *= original; return current; } 

 #!/usr/bin/env python import timeit upperBound = 0 def calcMain(): known = [] for i in range(0,upperBound+1): next = False current = i history = [] while not next: digits = str(current) squares = [pow(int(digit), 2) for digit in digits] squaresum = sum(squares) current = squaresum if current in history: next = True if current == 1: known.append(i) ##print i, "\t", history.append(current) ##print "\nend" while True: upperBound = input("Pick an upper bound: ") result = timeit.Timer(calcMain).timeit(1) print result, "seconds.\n" 

对于100000个元素,Python代码花了6.9秒,而C ++花了37秒以上。

我对代码做了一些基本的优化,并设法使C ++代码比Python实现快100倍以上。 现在它在0.06秒内完成了100000个元素。 这比原来的C ++代码快617倍。

最重要的是在Release模式下进行编译,并进行所有优化。 这个代码在debugging模式下的字面速度要慢几个数量级。

接下来,我将解释我所做的优化。

  • 将所有的向量声明移到了循环之外; 用clear()操作replace它们,这比调用构造器快得多。
  • 用乘法:value *值replace对pow(value,2)的调用。
  • 而不是有一个正方形向量和调用它的总和,我总结值就地使用一个整数。
  • 避免了所有的string操作,与整数操作相比非常慢。 例如,可以通过重复除以10并获取结果值的模数10来计算每个数字的平方,而不是将该值转换为string,然后将每个字符返回到int。
  • 避免所有的vector副本,首先通过引用传递来replace传值,最后完全消除辅助函数。
  • 消除了一些临时variables。
  • 也许很多细节我忘了。 比较你的代码和我的并排,看看我做了什么。

通过使用预先分配的数组而不是向量来更好地优化代码是可能的,但是这样做可能会多一点,我将把它作为练习给读者。 :P

这里是优化的代码:

 #include <iostream> #include <vector> #include <string> #include <ctime> #include <algorithm> #include <windows.h> using namespace std; void calcMain(int upperBound, vector<int>& known); int main() { while(true) { vector<int> results; int upperBound; cout << "Pick an upper bound: "; cin >> upperBound; long start, end; start = GetTickCount(); calcMain(upperBound, results); end = GetTickCount(); for (size_t i = 0; i < results.size(); ++i) { cout << results[i] << ", "; } cout << endl; double seconds = (double)(end-start) / 1000.0; cout << seconds << " seconds." << endl << endl; } return 0; } void calcMain(int upperBound, vector<int>& known) { vector<int> history; for(int i = 0; i <= upperBound; i++) { int current = i; history.clear(); while(true) { int temp = current; int sum = 0; while (temp > 0) { sum += (temp % 10) * (temp % 10); temp /= 10; } current = sum; if(find(history.begin(), history.end(), current) != history.end()) { if(current == 1) { known.push_back(i); } break; } history.push_back(current); } } } 

有一个新的,更快的版本作为一个单独的答案 ,所以这个答案已被弃用。


我重写了你的algorithm,只要它发现号码快乐或不快乐就把它caching起来。 我也尝试尽可能使pythonic,例如通过创build单独的函数digits()happy() 。 对不起使用Python 3,但我也可以炫耀一些有用的东西。

这个版本要快得多 。 它运行在1.7s ,比原来需要18s的程序快10倍 (好吧,我的MacBook是相当老,慢:))

 #!/usr/bin/env python3 from timeit import Timer from itertools import count print_numbers = False upperBound = 10**5 # Default value, can be overidden by user. def digits(x:'nonnegative number') -> "yields number's digits": if not (x >= 0): raise ValueError('Number should be nonnegative') while x: yield x % 10 x //= 10 def happy(number, known = {1}, happies = {1}) -> 'True/None': '''This function tells if the number is happy or not, caching results. It uses two static variables, parameters known and happies; the first one contains known happy and unhappy numbers; the second contains only happy ones. If you want, you can pass your own known and happies arguments. If you do, you should keep the assumption commented out on the 1 line. ''' # assert 1 in known and happies <= known # <= is expensive if number in known: return number in happies history = set() while True: history.add(number) number = sum(x**2 for x in digits(number)) if number in known or number in history: break known.update(history) if number in happies: happies.update(history) return True def calcMain(): happies = {x for x in range(upperBound) if happy(x) } if print_numbers: print(happies) if __name__ == '__main__': upperBound = eval( input("Pick an upper bound [default {0}]: " .format(upperBound)).strip() or repr(upperBound)) result = Timer(calcMain).timeit(1) print ('This computation took {0} seconds'.format(result)) 

看起来你正在向其他函数传递值。 这将是一个重大的放缓,因为该程序实际上将它的vector完整副本,然后将其传递到您的函数。 为了解决这个问题,传递一个常量引用而不是副本。 所以,而不是:

 int sum(vector<int> given) 

使用:

 int sum(const vector<int>& given) 

当你这样做时,你将不再能够使用vector :: iterator,因为它不是常量。 你需要用vector :: const_iteratorreplace它。

您也可以传入非常量引用,但在这种情况下,根本不需要修改参数。

我可以看到你有不less堆分配是不必要的

例如:

 while(!next) { char* buffer = new char[10]; 

这看起来不是非常优化。 所以,你可能想要预先分配数组,并在循环中使用它。 这是一个很容易发现和做的基本的优化技术。 它可能会变成一团糟,所以要小心。

你也使用atoi()函数,我不知道它是否真正优化。 也许做一个模数10,并得到数字可能会更好(你必须衡量你,我没有testing这个)。

你有一个线性search(inVector)的事实可能是不好的。 用std :: setreplacevector数据结构可能会加快速度。 hash_set也可以做到这一点。

但是我认为最糟糕的问题是在这个循环内部堆栈中的string和这个东西的分配。 那看起来不太好 我会先在那些地方尝试。

这是我的第二个答案; 它caching的东西像平方和的值<= 10**6

  happy_list[sq_list[x%happy_base] + sq_list[x//happy_base]] 

那是,

  • 该号码被分成3位数字+3位数字
  • 预先计算的表格用于获得这两个部分的平方和
  • 这两个结果被添加
  • 预先计算好的表格可以得到数字的幸福:

我不认为Python版本可以做得比这更快(好吧,如果你扔掉旧版本,那就是try:开销,它快了10%)。

我认为这是一个很好的问题

  • 必须快的东西应该用C写成
  • 然而,通常情况下,你并不需要快速的工作(即使你需要运行一天的程序,也不如程序员优化它的时间)
  • 用Python编写程序更容易,更快捷
  • 但是对于一些问题,尤其是计算问题,像上面那样的C ++解决scheme实际上比尝试优化Python程序更可读,更美观。

好吧,这里(第2版现在…):

 #!/usr/bin/env python3 '''Provides slower and faster versions of a function to compute happy numbers. slow_happy() implements the algorithm as in the definition of happy numbers (but also caches the results). happy() uses the precomputed lists of sums of squares and happy numbers to return result in just 3 list lookups and 3 arithmetic operations for numbers less than 10**6; it falls back to slow_happy() for big numbers. Utilities: digits() generator, my_timeit() context manager. ''' from time import time # For my_timeit. from random import randint # For example with random number. upperBound = 10**5 # Default value, can be overridden by user. class my_timeit: '''Very simple timing context manager.''' def __init__(self, message): self.message = message self.start = time() def __enter__(self): return self def __exit__(self, *data): print(self.message.format(time() - self.start)) def digits(x:'nonnegative number') -> "yields number's digits": if not (x >= 0): raise ValueError('Number should be nonnegative') while x: yield x % 10 x //= 10 def slow_happy(number, known = {1}, happies = {1}) -> 'True/None': '''Tell if the number is happy or not, caching results. It uses two static variables, parameters known and happies; the first one contains known happy and unhappy numbers; the second contains only happy ones. If you want, you can pass your own known and happies arguments. If you do, you should keep the assumption commented out on the 1 line. ''' # This is commented out because <= is expensive. # assert {1} <= happies <= known if number in known: return number in happies history = set() while True: history.add(number) number = sum(x**2 for x in digits(number)) if number in known or number in history: break known.update(history) if number in happies: happies.update(history) return True # This will define new happy() to be much faster ------------------------. with my_timeit('Preparation time was {0} seconds.\n'): LogAbsoluteUpperBound = 6 # The maximum possible number is 10**this. happy_list = [slow_happy(x) for x in range(81*LogAbsoluteUpperBound + 1)] happy_base = 10**((LogAbsoluteUpperBound + 1)//2) sq_list = [sum(d**2 for d in digits(x)) for x in range(happy_base + 1)] def happy(x): '''Tell if the number is happy, optimized for smaller numbers. This function works fast for numbers <= 10**LogAbsoluteUpperBound. ''' try: return happy_list[sq_list[x%happy_base] + sq_list[x//happy_base]] except IndexError: return slow_happy(x) # End of happy()'s redefinition -----------------------------------------. def calcMain(print_numbers, upper_bound): happies = [x for x in range(upper_bound + 1) if happy(x)] if print_numbers: print(happies) if __name__ == '__main__': while True: upperBound = eval(input( "Pick an upper bound [{0} default, 0 ends, negative number prints]: " .format(upperBound)).strip() or repr(upperBound)) if not upperBound: break with my_timeit('This computation took {0} seconds.'): calcMain(upperBound < 0, abs(upperBound)) single = 0 while not happy(single): single = randint(1, 10**12) print('FYI, {0} is {1}.\n'.format(single, 'happy' if happy(single) else 'unhappy')) print('Nice to see you, goodbye!') 

那么,我也给了它一次。 虽然我没有testing甚至编译。

数字程序的一般规则:

  • 切勿将文字处理为数字。 这就是使得Python的语言比Python慢​​的原因,所以如果你用C语言来做,程序将比Python慢​​。

  • 如果可以避免使用数据结构,请不要使用它们。 你正在build立一个数组,只是为了增加数字。 更好地保持运行总量。

  • 保持STL参考的副本打开,以便您可以使用它,而不是编写自己的function。


 void calcMain(int upperBound) { vector<int> known; for(int i = 0; i <= upperBound; i++) { int current = i; vector<int> history; do { squaresum = 0 for ( ; current; current /= 10 ) { int digit = current % 10; squaresum += digit * digit; } current = squaresum; history.push_back(current); } while ( ! count(history.begin(), history.end() - 1, current) ); if(current == 1) { known.push_back(i); //cout << i << "\t"; } } //cout << "\n\n"; } 

为了在这个问题上看到我能够真正find这些数字有多快,我写了一个Dr_Asikalgorithm的multithreadingC ++实现。 有两件事对于这个实现是multithreading的事实来说是非常重要的。

  1. 更多的线程不一定会导致更好的执行时间,对于每种情况,都有一个令人满意的媒介,具体取决于您要计算的数字量。

  2. 如果比较使用一个线程运行的版本与原始版本之间的时间,则可能导致时间差异的唯一因素是启动线程和可变系统性能问题的开销。 否则,algorithm是一样的。

这个实现的代码(algorithm的所有function都去Dr_Asik)在这里 。 此外,我还写了一些速度testing,并对每个testing进行一次双重检查,以帮助备份这3分。

计算前100,000,000个欢乐数字:

原始 – 39.061 / 39.000(Dr_Asik的原始实施)
1线程 – 39.000 / 39.079
2线程 – 19.750 / 19.890
10线程 – 11.872 / 11.888
30个线程 – 10.764 / 10.827
50根线 – 10.624 / 10.561 < –
100线程 – 11.060 / 11.216
500线程 – 13.385 / 12.527

从这些结果看来,我们喜欢的媒体大约有50个线程,正负十个左右。

其他优化 :通过使用数组和直接访问使用循环索引,而不是searchvector,并通过caching先前的总和,下面的代码(由Asik博士的答案启发,但可能没有优化)运行速度比原来的C ++快2445倍代码,比Python代码快400倍。

 #include <iostream> #include <windows.h> #include <vector> void calcMain(int upperBound, std::vector<int>& known) { int tempDigitCounter = upperBound; int numDigits = 0; while (tempDigitCounter > 0) { numDigits++; tempDigitCounter /= 10; } int maxSlots = numDigits * 9 * 9; int* history = new int[maxSlots + 1]; int* cache = new int[upperBound+1]; for (int jj = 0; jj <= upperBound; jj++) { cache[jj] = 0; } int current, sum, temp; for(int i = 0; i <= upperBound; i++) { current = i; while(true) { sum = 0; temp = current; bool inRange = temp <= upperBound; if (inRange) { int cached = cache[temp]; if (cached) { sum = cached; } } if (sum == 0) { while (temp > 0) { int tempMod = temp % 10; sum += tempMod * tempMod; temp /= 10; } if (inRange) { cache[current] = sum; } } current = sum; if(history[current] == i) { if(current == 1) { known.push_back(i); } break; } history[current] = i; } } } int main() { while(true) { int upperBound; std::vector<int> known; std::cout << "Pick an upper bound: "; std::cin >> upperBound; long start, end; start = GetTickCount(); calcMain(upperBound, known); end = GetTickCount(); for (size_t i = 0; i < known.size(); ++i) { std::cout << known[i] << ", "; } double seconds = (double)(end-start) / 1000.0; std::cout << std::endl << seconds << " seconds." << std::endl << std::endl; } return 0; } 

我不是C ++优化方面的专家,但是我相信速度的不同可能是由于Python列表在开始时预先分配了更多的空间,而您的C ++向量必须重新分配并且每次增长时都可能复制。

至于GMan对发现的评论,我相信Python“in”运算符也是一个线性search,速度也是一样的。

编辑

另外我只是注意到你推出了自己的pow函数。 没有必要这样做,stdlib可能会更快。

这是依靠记住所有已经探索的数字的另一种方法。 我得到了一个因子x4-5,这对于DrAsik的代码1000和1000000是奇怪的稳定的,我期望caching更有效,我们正在探索的数量越多。 否则,同样的经典优化已经被应用。 顺便说一句,如果编译器接受NRVO (/ RNVO?我不记得确切的术语)或右值引用,我们不需要将向量作为输出parameter passing。

注意:微观优化仍然是可能的恕我直言,而且caching是天真的,因为它分配比真正需要更多的内存。

 enum Status { never_seen, being_explored, happy, unhappy }; char const* toString[] = { "never_seen", "being_explored", "happy", "unhappy" }; inline size_t sum_squares(size_t i) { size_t s = 0; while (i) { const size_t digit = i%10; s += digit * digit; i /= 10; } return s ; } struct Cache { Cache(size_t dim) : m_cache(dim, never_seen) {} void set(size_t n, Status status) { if (m_cache.size() <= n) { m_cache.resize(n+1, never_seen); } m_cache[n] = status; // std::cout << "(c[" << n << "]<-"<<toString[status] << ")"; } Status operator[](size_t n) const { if (m_cache.size() <= n) { return never_seen; } else { return m_cache[n]; } } private: std::vector<Status> m_cache; }; void search_happy_lh(size_t upper_bound, std::vector<size_t> & happy_numbers) { happy_numbers.clear(); happy_numbers.reserve(upper_bound); // it doesn't improve much the performances Cache cache(upper_bound+1); std::vector<size_t> current_stack; cache.set(1,happy); happy_numbers.push_back(1); for (size_t i = 2; i<=upper_bound ; ++i) { // std::cout << "\r" << i << std::flush; current_stack.clear(); size_t s= i; while ( s != 1 && cache[s]==never_seen) { current_stack.push_back(s); cache.set(s, being_explored); s = sum_squares(s); // std::cout << " - " << s << std::flush; } const Status update_with = (cache[s]==being_explored ||cache[s]==unhappy) ? unhappy : happy; // std::cout << " => " << s << ":" << toString[update_with] << std::endl; for (size_t j=0; j!=current_stack.size(); ++j) { cache.set(current_stack[j], update_with); } if (cache[i] == happy) { happy_numbers.push_back(i); } } } 

在无聊的时候偶然发现了这个网页,我想我会在js打高尔夫。 该algorithm是我自己的,我没有彻底检查,除了我自己的计算以外(所以这可能是错误的)。 它计算出第一个1e7开心的数字,并将它们存储在h中。 如果你想改变它,改变这两个7s。

 m=1e7,C=7*81,h=[1],t=true,U=[,,,,t],n=w=2; while(n<m){ z=w,s=0;while(z)y=z%10,s+=y*y,z=0|z/10;w=s; if(U[w]){if(n<C)U[n]=t;w=++n;}else if(w<n)h.push(n),w=++n;} 

这将在控制台或浏览器中为您打印前1000个项目:

 o=h.slice(0,m>1e3?1e3:m); (!this.document?print(o):document.load=document.write(o.join('\n'))); 

155个字符的function部分,它看起来像Asick博士提供的Firefox或V8一样快(当运行时间为d8时,系统上的原始python程序的速度是350-400倍happygolf.js或js -a – 在spidermonkey j -p happygolf.js)。
我会对分析技能感到敬畏,任何人都可以找出为什么这个algorithm做得很好,而不需要引用更长的评论的Fortran版本。

我对它的速度感到非常好奇,所以我学习了Fortran来比较一下这个algorithm,如果有新鲜的错误发现,那就好了,这是我的第一个fortran程序。 http://pastebin.com/q9WFaP5C这是静态内存明智的,所以公平对待其他人,它是在一个自编译的shell脚本,如果你没有gcc / bash / etc去掉预处理器和bash的东西在顶部,手动设置macros并将其编译为fortran95。

即使你包括编译时间,也会击败这里的大部分。 如果你不这样做的话,它的速度大约是原始Python版本的3000-3500倍(尽pipe我没有运行任何C ++程序,扩展速度是C ++ *的4万倍)。

令人惊讶的是,我在fortran版本中尝试过的许多优化(包括一些像循环展开,由于小的效果和可读性而离开了粘贴版本)对js版本是不利的。 这个练习表明,现代的跟踪编译器是非常好的(在精心优化的静态内存fortran的7到10倍),如果你走出困境,不要尝试任何棘手的东西。 走出自己的路,并试图做一些棘手的事情最后,这是一个更好,更recursion的js版本。

 // to s, then integer divides x by 10. // Repeats until x is 0. function sumsq(x) { var y,s=0; while(x) { y = x % 10; s += y * y; x = 0| x / 10; } return s; } // A boolean cache for happy(). // The terminating happy number and an unhappy number in // the terminating sequence. var H=[]; H[1] = true; H[4] = false; // Test if a number is happy. // First check the cache, if that's empty // Perform one round of sumsq, then check the cache // For that. If that's empty, recurse. function happy(x) { // If it already exists. if(H[x] !== undefined) { // Return whatever is already in cache. return H[x]; } else { // Else calc sumsq, set and return cached val, or if undefined, recurse. var w = sumsq(x); return (H[x] = H[w] !== undefined? H[w]: happy(w)); } } //Main program loop. var i, hN = []; for(i = 1; i < 1e7; i++) { if(happy(i)) { hN.push(i); } } 

令人惊讶的是,即使它是相当高的水平,它几乎完全和spidermonkey中的命令式algorithm(优化)并且在v8中closures(1.2倍长)一样。

我想这个故事的道德性,如果它很重要的话,花一些时间思考你的algorithm。 另外高级语言已经有很多开销(有时候会有自己的技巧来减less开销),所以有时候做更直接的事情或者利用高级特性的东西同样快。 微型优化并不总是有帮助。

*除非我的python安装非常慢,直接的时间是没有什么意义的,因为这是第一代eee。 时代是:
12分为fortran版本,没有输出,1e8开心号码。
fortran版本为40s,pipe道通过gzip输出到磁盘。
两个js版本的8-12s。 1e7快乐的数字,没有完全优化10-100s的输出与两个js版本1e7较less/没有优化(取决于没有优化的定义,100s与eval())没有输出

我有兴趣在一台真正的电脑上看到这些程序的时间。

这里有一些值得深思的东西:如果给出select在1979年的计算机上运行1979年的algorithm,或者在1979年的计算机上search2009年的algorithm,你会select哪一个?

古代硬件上的新algorithm将是更大的select。 看看你的“帮手”function。

有很多优化可能:

(1)使用const引用

 bool inVector(int inQuestion, const vector<int>& known) { for(vector<int>::const_iterator it = known.begin(); it != known.end(); ++it) if(*it == inQuestion) return true; return false; } int sum(const vector<int>& given) { int sum = 0; for(vector<int>::const_iterator it = given.begin(); it != given.end(); ++it) sum += *it; return sum; } 

(2)使用倒数循环

 int pow(int given, int power) { int current = 1; while(power--) current *= given; return current; } 

或者像其他人所说的那样,使用标准库代码。

(3)不要在不需要的地方分配缓冲区

  vector<int> squares; for (int temp = current; temp != 0; temp /= 10) { squares.push_back(pow(temp % 10, 2)); } 

With similar optimizations as PotatoSwatter I got time for 10000 numbers down from 1.063 seconds to 0.062 seconds (except I replaced itoa with standard sprintf in the original).

With all the memory optimizations (don't pass containers by value – in C++ you have to explicitly decide whether you want a copy or a reference; move operations that allocate memory out of inner loops; if you already have the number in a char buffer, what's the point of copying it to std::string etc) I got it down to 0.532.

The rest of the time came from using %10 to access digits, rather than converting numbers to string.

I suppose there might be another algorithmic level optimization (numbers that you have encountered while finding a happy number are themselves also happy numbers?) but I don't know how much that gains (there is not that many happy numbers in the first place) and this optimization is not in the Python version either.

By the way, by not using string conversion and a list to square digits, I got the Python version from 0.825 seconds down to 0.33 too.

 #!/usr/bin/env python import timeit upperBound = 0 def calcMain(): known = set() for i in xrange(0,upperBound+1): next = False current = i history = set() while not next: squaresum=0 while current > 0: current, digit = divmod(current, 10) squaresum += digit * digit current = squaresum if current in history: next = True if current == 1: known.add(i) history.add(current) while True: upperBound = input("Pick an upper bound: ") result = timeit.Timer(calcMain).timeit(1) print result, "seconds.\n" 

I made a couple of minor changes to your original python code example that make a better than 16x improvement to the performance of the code. The changes I made took the 100,000 case from about 9.64 seconds to about 3.38 seconds.

The major change was to make the mod 10 and accumulator changes to run in a while loop. I made a couple of other changes that improved execution time in only fractions of hundredths of seconds. The first minor change was changing the main for loop from a range list comprehension to an xrange iterator. The second minor change was substituting the set class for the list class for both the known and history variables. I also experimented with iterator comprehensions and precalculating the squares but they both had negative effects on the efficiency. I seem to be running a slower version of python or on a slower processor than some of the other contributers. I would be interest in the results of someone else's timing comparison of my python code against one of the optimized C++ versions of the same algorithm. I also tried using the python -O and -OO optimizations but they had the reverse of the intended effect.

Why is everyone using a vector in the c++ version? Lookup time is O(N).

Even though it's not as efficient as the python set, use std::set. Lookup time is O(log(N)).

Here's a C# version:

 using System; using System.Collections.Generic; using System.Text; namespace CSharp { class Program { static void Main (string [] args) { while (true) { Console.Write ("Pick an upper bound: "); String input = Console.ReadLine (); uint upper_bound; if (uint.TryParse (input, out upper_bound)) { DateTime start = DateTime.Now; CalcHappyNumbers (upper_bound); DateTime end = DateTime.Now; TimeSpan span = end - start; Console.WriteLine ("Time taken = " + span.TotalSeconds + " seconds."); } else { Console.WriteLine ("Error in input, unable to parse '" + input + "'."); } } } enum State { Happy, Sad, Unknown } static void CalcHappyNumbers (uint upper_bound) { SortedDictionary<uint, State> happy = new SortedDictionary<uint, State> (); SortedDictionary<uint, bool> happy_numbers = new SortedDictionary<uint, bool> (); happy [1] = State.Happy; happy_numbers [1] = true; for (uint current = 2 ; current < upper_bound ; ++current) { FindState (ref happy, ref happy_numbers, current); } //foreach (KeyValuePair<uint, bool> pair in happy_numbers) //{ // Console.Write (pair.Key.ToString () + ", "); //} //Console.WriteLine (""); } static State FindState (ref SortedDictionary<uint, State> happy, ref SortedDictionary<uint,bool> happy_numbers, uint value) { State current_state; if (happy.TryGetValue (value, out current_state)) { if (current_state == State.Unknown) { happy [value] = State.Sad; } } else { happy [value] = current_state = State.Unknown; uint new_value = 0; for (uint i = value ; i != 0 ; i /= 10) { uint lsd = i % 10; new_value += lsd * lsd; } if (new_value == 1) { current_state = State.Happy; } else { current_state = FindState (ref happy, ref happy_numbers, new_value); } if (current_state == State.Happy) { happy_numbers [value] = true; } happy [value] = current_state; } return current_state; } } } 

I compared it against Dr_Asik's C++ code. For an upper bound of 100000 the C++ version ran in about 2.9 seconds and the C# version in 0.35 seconds. Both were compiled using Dev Studio 2005 using default release build options and both were executed from a command prompt.