# numpy：数组中唯一值的最有效的频率计数

在 `numpy` / `scipy` 中，是否有一种高效的方法来获取数组中各个唯一值的出现次数（频率计数）？

` `x = array( [1,1,1,2,2,2,5,25,1,1] ) y = freq_count( x ) print y >> [[1, 5], [2,3], [5,1], [25,1]]` `

（对于 R 用户来说：我想要的基本上就是 R 中的 `table()` 函数。）

import numpy as np

# Sample data: small non-negative integers, the case np.bincount handles.
x = np.array([1, 1, 1, 2, 2, 2, 5, 25, 1, 1])

# y[v] is the number of times the value v occurs in x.
y = np.bincount(x)
# Indices of the non-zero counts, i.e. the values that actually occur in x.
ii = np.nonzero(y)[0]

# (value, count) pairs. zip() is lazy on Python 3, so materialize it to get
# the list the original comment describes: (1, 5), (2, 3), (5, 1), (25, 1).
pairs = list(zip(ii, y[ii]))

# The same result as a two-column array: column 0 = value, column 1 = count.
# array([[ 1,  5], [ 2,  3], [ 5,  1], [25,  1]])
table = np.vstack((ii, y[ii])).T

import numpy as np

x = np.array([1, 1, 1, 2, 2, 2, 5, 25, 1, 1])

# return_counts=True makes np.unique also return how often each value occurs.
unique, counts = np.unique(x, return_counts=True)

# Stack values and counts as rows, then transpose to get (value, count) rows.
# print() is a function on Python 3; the statement form is a SyntaxError.
print(np.asarray((unique, counts)).T)

` ` [[ 1 5] [ 2 3] [ 5 1] [25 1]]` `

与 `scipy.stats.itemfreq` 的快速比较：

` `In [4]: x = np.random.random_integers(0,100,1e6) In [5]: %timeit unique, counts = np.unique(x, return_counts=True) 10 loops, best of 3: 31.5 ms per loop In [6]: %timeit scipy.stats.itemfreq(x) 10 loops, best of 3: 170 ms per loop` `

` `>>> from scipy.stats import itemfreq >>> x = [1,1,1,2,2,2,5,25,1,1] >>> itemfreq(x) array([[ 1., 5.], [ 2., 3.], [ 5., 1.], [ 25., 1.]])` `

import numpy as np


def unique_count(a):
    """Return the distinct values of ``a`` together with their counts.

    Args:
        a: Array-like of values to tally.

    Returns:
        A 2-column array with one row per distinct value: column 0 holds the
        value, column 1 the number of times it occurs in ``a``.
    """
    # inverse[i] is the index into `unique` of the i-th element of `a`.
    unique, inverse = np.unique(a, return_inverse=True)
    # The np.int alias was removed from NumPy; the builtin int is equivalent.
    count = np.zeros(len(unique), int)
    # Unbuffered add, so repeated indices in `inverse` each contribute 1.
    np.add.at(count, inverse, 1)
    return np.vstack((unique, count)).T


if __name__ == "__main__":
    # Demo on random data; guarded so importing the module has no side effects.
    print(unique_count(np.random.randint(-10, 10, 100)))

` `>>> import pandas as pd >>> import numpy as np >>> x = np.array([1,1,1,2,2,2,5,25,1,1]) >>> pd.value_counts(pd.Series(x)) 1 5 2 3 25 1 5 1` `

dtype: int64

`numpy.bincount` 可能是最好的选择。如果你的数组中除了较小的密集整数之外还包含其他值，那么像下面这样把它包装一下可能会有用：

def count_unique(keys):
    """Count the occurrences of each distinct value in ``keys``.

    Each key is first mapped to its position among the sorted distinct keys,
    so ``np.bincount`` only ever sees small dense integers.

    Args:
        keys: Array-like of sortable values. Multi-dimensional input is
            flattened before counting.

    Returns:
        A tuple ``(uniq_keys, counts)`` where ``uniq_keys`` is the sorted
        array of distinct values and ``counts[i]`` is the number of times
        ``uniq_keys[i]`` occurs.
    """
    # np.bincount needs 1-D input and searchsorted keeps the shape of its
    # argument, so flatten up front to support multi-dimensional keys.
    flat_keys = np.asarray(keys).ravel()
    uniq_keys = np.unique(flat_keys)
    # Position of every key within the sorted distinct values.
    bins = uniq_keys.searchsorted(flat_keys)
    return uniq_keys, np.bincount(bins)

` `>>> x = array([1,1,1,2,2,2,5,25,1,1]) >>> count_unique(x) (array([ 1, 2, 5, 25]), array([5, 3, 1, 1]))` `

# `a` is the input array and must already be defined by the surrounding code.
# Count how often each integer value occurs in `a`.
y = np.bincount(a)
# Keep only the values that actually occur (non-zero counts).
ii = np.nonzero(y)[0]
# Two-column result: column 0 = value, column 1 = count.
out = np.vstack((ii, y[ii])).T

import numpy as np
import pandas as pd

try:
    # itemfreq was removed from SciPy; keep it as a kernel only when present.
    from scipy.stats import itemfreq
except ImportError:
    itemfreq = None


def bincount(a):
    """Return (value, count) rows for ``a`` using ``np.bincount``."""
    y = np.bincount(a)
    # Indices with non-zero counts are the values that occur in `a`.
    ii = np.nonzero(y)[0]
    return np.vstack((ii, y[ii])).T


def unique(a):
    """Return (value, count) rows for ``a`` using ``np.unique``."""
    values, counts = np.unique(a, return_counts=True)
    return np.asarray((values, counts)).T


def unique_count(a):
    """Return (value, count) rows for ``a`` using ``np.add.at``."""
    values, inverse = np.unique(a, return_inverse=True)
    # The np.int alias was removed from NumPy; the builtin int is equivalent.
    count = np.zeros(len(values), int)
    # Unbuffered add, so repeated indices in `inverse` each contribute 1.
    np.add.at(count, inverse, 1)
    return np.vstack((values, count)).T


def pandas_value_counts(a):
    """Return (value, count) rows for ``a`` using pandas ``value_counts``."""
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    # Sort by value so the rows line up with the other kernels.
    counts = pd.Series(a).value_counts().sort_index()
    return np.stack([counts.index.values, counts.values]).T


if __name__ == "__main__":
    # Imported here so the kernels stay importable without perfplot installed.
    import perfplot

    candidate_kernels = (
        bincount,
        unique,
        itemfreq,
        unique_count,
        pandas_value_counts,
    )
    perfplot.show(
        setup=lambda n: np.random.randint(0, 1000, n),
        kernels=[kernel for kernel in candidate_kernels if kernel is not None],
        n_range=[2**k for k in range(22)],
        logx=True,
        logy=True,
        xlabel='len(a)',
    )

` `>>> from numpy import histogram >>> y = histogram (x, bins=x.max()-1) >>> y (array([5, 3, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]), array([ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.]))` `

def count(a):
    """Count how many times each item occurs in ``a``.

    Args:
        a: Iterable of hashable items.

    Returns:
        A dict mapping each distinct item to its number of occurrences, in
        order of first appearance.
    """
    results = {}
    for x in a:
        # get() supplies 0 for unseen items, replacing the membership branch.
        results[x] = results.get(x, 0) + 1
    return results

` `>>>timeit count([1,1,1,2,2,2,5,25,1,1]) would return:` `

100000 次循环，3 次中的最佳结果：每次循环 2.26 μs

` `>>>timeit count(np.array([1,1,1,2,2,2,5,25,1,1]))` `

100000 次循环，3 次中的最佳结果：每次循环 8.8 μs

` `>>>timeit count(np.array([1,1,1,2,2,2,5,25,1,1]).tolist())` `

100000 次循环，3 次中的最佳结果：每次循环 5.85 μs

import numba
import numpy as np
from zmq import Stopwatch

aZmqSTOPWATCH = Stopwatch()

# 150000 non-negative integers, kept both as an ndarray and as a plain list.
# The np.int alias was removed from NumPy; the builtin int is equivalent.
aDataSETasARRAY = (100 * abs(np.random.randn(150000))).astype(int)
aDataSETasLIST = aDataSETasARRAY.tolist()


@numba.jit
def numba_bincount(anObject):
    """Run ``np.bincount`` on ``anObject`` and discard the result.

    Exists only so the jitted call can be timed; it returns None.
    """
    np.bincount(anObject)
    return


# Each measurement below prints the value returned by Stopwatch.stop().
# The trailing comments record what the original (Python 2) session showed.

aZmqSTOPWATCH.start()
np.bincount(aDataSETasARRAY)
print(aZmqSTOPWATCH.stop())  # 14328L

aZmqSTOPWATCH.start()
numba_bincount(aDataSETasARRAY)
print(aZmqSTOPWATCH.stop())  # 592L

# count() is the pure-Python dict-based counter defined earlier in this file.
aZmqSTOPWATCH.start()
count(aDataSETasLIST)
print(aZmqSTOPWATCH.stop())  # 148609L

import numpy

# Create 100 random integers in the closed range [0, 50]. randint excludes
# its upper bound, so 51 matches the deprecated random_integers(0, 50, 100).
arr = numpy.random.randint(0, 51, 100)

# Create a dictionary with every distinct value starting at a count of zero.
d = {value: 0 for value in numpy.unique(arr)}

for number in arr:
    # Increment when that value is found. The original indexed with an
    # undefined name `j`, which raised NameError on the first iteration.
    d[number] += 1

import numpy as np


def count_unique(datain):
    """Return the unique members of ``datain`` together with their counts.

    Similar to ``numpy.unique``, but also returns how often each unique
    value occurs. The counting loop is compiled inline C run through weave.

    Args:
        datain: Array-like of sortable values.

    Returns:
        A tuple ``(uniq, nums)`` where ``uniq`` holds the unique values and
        ``nums[i]`` is the number of occurrences of ``uniq[i]``.

    Raises:
        ImportError: If ``scipy.weave`` is unavailable and the input is
            non-empty. NOTE(review): weave is no longer shipped with current
            SciPy releases - confirm the target environment.
    """
    data = np.sort(datain)
    uniq = np.unique(data)
    nums = np.zeros(uniq.shape, dtype='int')

    # The C code below always writes nums(j) after its loop. For empty input
    # that would be an out-of-bounds write, so return the empty result here.
    if data.size == 0:
        return uniq, nums

    # Imported lazily so this module can still be imported where weave is
    # missing; the failure then surfaces only when the C path is needed.
    from scipy import weave

    # Walk the sorted data; each time the value increases, store the length
    # of the run that just ended and move on to the next unique slot.
    code = """
    int i,count,j;
    j=0;
    count=0;
    for(i=1; i<Ndata[0]; i++){
        count++;
        if(data(i) > data(i-1)){
            nums(j) = count;
            count = 0;
            j++;
        }
    }
    // Handle last value
    nums(j) = count+1;
    """
    weave.inline(
        code,
        ['data', 'nums'],
        extra_compile_args=['-O2'],
        type_converters=weave.converters.blitz,
    )
    return uniq, nums

` `> %timeit count_unique(data) > 10000 loops, best of 3: 55.1 µs per loop` `

Eelco 的纯 `numpy` 版本：

` `> %timeit unique_count(data) > 1000 loops, best of 3: 284 µs per loop` `

import numpy as np
import pandas as pd

x = np.array([1, 1, 1, 2, 2, 2, 5, 25, 1, 1])

# value_counts() gives a Series indexed by distinct value; dict() turns it
# into a plain {value: count} mapping.
counts = dict(pd.Series(x).value_counts())
print(counts)