是否可以使用scikit-learn K-Means Clustering来指定自己的距离函数?

是否可以使用scikit-learn K-Means Clustering来指定自己的距离函数?

这里有一个使用scipy.spatial.distance中的任何20多个距离的小kmeans,或者一个用户函数。
评论将是受欢迎的(迄今只有一个用户,不够); 特别是你的N,dim,k,metric是什么?

#!/usr/bin/env python # kmeans.py using any of the 20-odd metrics in scipy.spatial.distance # kmeanssample 2 pass, first sample sqrt(N) from __future__ import division import random import numpy as np from scipy.spatial.distance import cdist # $scipy/spatial/distance.py # http://docs.scipy.org/doc/scipy/reference/spatial.html from scipy.sparse import issparse # $scipy/sparse/csr.py __date__ = "2011-11-17 Nov denis" # X sparse, any cdist metric: real app ? # centres get dense rapidly, metrics in high dim hit distance whiteout # vs unsupervised / semi-supervised svm #............................................................................... def kmeans( X, centres, delta=.001, maxiter=10, metric="euclidean", p=2, verbose=1 ): """ centres, Xtocentre, distances = kmeans( X, initial centres ... ) in: XN x dim may be sparse centres kx dim: initial centres, eg random.sample( X, k ) delta: relative error, iterate until the average distance to centres is within delta of the previous average distance maxiter metric: any of the 20-odd in scipy.spatial.distance "chebyshev" = max, "cityblock" = L1, "minkowski" with p= or a function( Xvec, centrevec ), eg Lqmetric below p: for minkowski metric -- local mod cdist for 0 < p < 1 too verbose: 0 silent, 2 prints running distances out: centres, kx dim Xtocentre: each X -> its nearest centre, ints N -> k distances, N see also: kmeanssample below, class Kmeans below. """ if not issparse(X): X = np.asanyarray(X) # ? centres = centres.todense() if issparse(centres) \ else centres.copy() N, dim = X.shape k, cdim = centres.shape if dim != cdim: raise ValueError( "kmeans: X %s and centres %s must have the same number of columns" % ( X.shape, centres.shape )) if verbose: print "kmeans: X %s centres %s delta=%.2g maxiter=%d metric=%s" % ( X.shape, centres.shape, delta, maxiter, metric) allx = np.arange(N) prevdist = 0 for jiter in range( 1, maxiter+1 ): D = cdist_sparse( X, centres, metric=metric, p=p ) # |X| x |centres| xtoc = D.argmin(axis=1) # X -> nearest centre distances = D[allx,xtoc] avdist = distances.mean() # median ? if verbose >= 2: print "kmeans: av |X - nearest centre| = %.4g" % avdist if (1 - delta) * prevdist <= avdist <= prevdist \ or jiter == maxiter: break prevdist = avdist for jc in range(k): # (1 pass in C) c = np.where( xtoc == jc )[0] if len(c) > 0: centres[jc] = X[c].mean( axis=0 ) if verbose: print "kmeans: %d iterations cluster sizes:" % jiter, np.bincount(xtoc) if verbose >= 2: r50 = np.zeros(k) r90 = np.zeros(k) for j in range(k): dist = distances[ xtoc == j ] if len(dist) > 0: r50[j], r90[j] = np.percentile( dist, (50, 90) ) print "kmeans: cluster 50 % radius", r50.astype(int) print "kmeans: cluster 90 % radius", r90.astype(int) # scale L1 / dim, L2 / sqrt(dim) ? return centres, xtoc, distances #............................................................................... def kmeanssample( X, k, nsample=0, **kwargs ): """ 2-pass kmeans, fast for large N: 1) kmeans a random sample of nsample ~ sqrt(N) from X 2) full kmeans, starting from those centres """ # merge w kmeans ? mttiw # v large N: sample N^1/2, N^1/2 of that # seed like sklearn ? N, dim = X.shape if nsample == 0: nsample = max( 2*np.sqrt(N), 10*k ) Xsample = randomsample( X, int(nsample) ) pass1centres = randomsample( X, int(k) ) samplecentres = kmeans( Xsample, pass1centres, **kwargs )[0] return kmeans( X, samplecentres, **kwargs ) def cdist_sparse( X, Y, **kwargs ): """ -> |X| x |Y| cdist array, any cdist metric X or Y may be sparse -- best csr """ # todense row at a time, v slow if both v sparse sxy = 2*issparse(X) + issparse(Y) if sxy == 0: return cdist( X, Y, **kwargs ) d = np.empty( (X.shape[0], Y.shape[0]), np.float64 ) if sxy == 2: for j, x in enumerate(X): d[j] = cdist( x.todense(), Y, **kwargs ) [0] elif sxy == 1: for k, y in enumerate(Y): d[:,k] = cdist( X, y.todense(), **kwargs ) [0] else: for j, x in enumerate(X): for k, y in enumerate(Y): d[j,k] = cdist( x.todense(), y.todense(), **kwargs ) [0] return d def randomsample( X, n ): """ random.sample of the rows of X X may be sparse -- best csr """ sampleix = random.sample( xrange( X.shape[0] ), int(n) ) return X[sampleix] def nearestcentres( X, centres, metric="euclidean", p=2 ): """ each X -> nearest centre, any metric euclidean2 (~ withinss) is more sensitive to outliers, cityblock (manhattan, L1) less sensitive """ D = cdist( X, centres, metric=metric, p=p ) # |X| x |centres| return D.argmin(axis=1) def Lqmetric( x, y=None, q=.5 ): # yes a metric, may increase weight of near matches; see ... return (np.abs(x - y) ** q) .mean() if y is not None \ else (np.abs(x) ** q) .mean() #............................................................................... class Kmeans: """ km = Kmeans( X, k= or centres=, ... ) in: either initial centres= for kmeans or k= [nsample=] for kmeanssample out: km.centres, km.Xtocentre, km.distances iterator: for jcentre, J in km: clustercentre = centres[jcentre] J indexes eg X[J], classes[J] """ def __init__( self, X, k=0, centres=None, nsample=0, **kwargs ): self.X = X if centres is None: self.centres, self.Xtocentre, self.distances = kmeanssample( X, k=k, nsample=nsample, **kwargs ) else: self.centres, self.Xtocentre, self.distances = kmeans( X, centres, **kwargs ) def __iter__(self): for jc in range(len(self.centres)): yield jc, (self.Xtocentre == jc) #............................................................................... if __name__ == "__main__": import random import sys from time import time N = 10000 dim = 10 ncluster = 10 kmsample = 100 # 0: random centres, > 0: kmeanssample kmdelta = .001 kmiter = 10 metric = "cityblock" # "chebyshev" = max, "cityblock" L1, Lqmetric seed = 1 exec( "\n".join( sys.argv[1:] )) # run this.py N= ... np.set_printoptions( 1, threshold=200, edgeitems=5, suppress=True ) np.random.seed(seed) random.seed(seed) print "N %d dim %d ncluster %d kmsample %d metric %s" % ( N, dim, ncluster, kmsample, metric) X = np.random.exponential( size=(N,dim) ) # cf scikits-learn datasets/ t0 = time() if kmsample > 0: centres, xtoc, dist = kmeanssample( X, ncluster, nsample=kmsample, delta=kmdelta, maxiter=kmiter, metric=metric, verbose=2 ) else: randomcentres = randomsample( X, ncluster ) centres, xtoc, dist = kmeans( X, randomcentres, delta=kmdelta, maxiter=kmiter, metric=metric, verbose=2 ) print "%.0f msec" % ((time() - t0) * 1000) # also ~/py/np/kmeans/test-kmeans.py 

一些笔记添加26mar 2012:

1)对于余弦距离,首先将所有数据向量归一化为| X | = 1; 然后

 cosinedistance( X, Y ) = 1 - X . Y = Euclidean distance |X - Y|^2 / 2 

很快。 对于位向量,将向量与向量分开,而不是向外扩展(尽pipe某些程序可能会扩展)。 对于稀疏vector,比如N,X的1%。 Y应该花时间O(2%N),空间O(N); 但我不知道哪个程序可以做到这一点。

2) Scikit学习聚类提供了k-means,mini-batch-k-means …的优秀概述,带有在scipy.sparsematrix上工作的代码。

3)在k-means之后总是检查簇大小。 如果你期待的是大致相等的团簇,但是它们会出来[44 37 9 5 5] % …(头部划伤的声音)。

不幸的是没有:scikit学习目前的k-means实现只使用欧几里德距离。

只需使用nltk,而不是在那里你可以做到这一点,例如

 from nltk.cluster.kmeans import KMeansClusterer NUM_CLUSTERS = <choose a value> data = <sparse matrix that you would normally give to scikit>.toarray() kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25) assigned_clusters = kclusterer.cluster(data, assign_clusters=True) 

是的,您可以使用差异度量函数; 然而,根据定义,k均值聚类algorithm依赖于每个聚类的均值的eucldiean距离。

你可以使用不同的度量,所以即使你仍然在计算平均值,你可以使用像mahalnobis距离的东西。

Python的k-means允许使用L1(曼哈顿)距离。