LogLog和HyperLogLogalgorithm来计算大基数

我在哪里可以findLogLogalgorithm的有效实现? 试图自己实现,但我的草案实施产生了奇怪的结果。

这里是:

function LogLog(max_error, max_count) { function log2(x) { return Math.log(x) / Math.LN2; } var m = 1.30 / max_error; var k = Math.ceil(log2(m * m)); m = Math.pow(2, k); var k_comp = 32 - k; var l = log2(log2(max_count / m)); if (isNaN(l)) l = 1; else l = Math.ceil(l); var l_mask = ((1 << l) - 1) >>> 0; var M = []; for (var i = 0; i < m; ++i) M[i] = 0; function count(hash) { if (hash !== undefined) { var j = hash >>> k_comp; var rank = 0; for (var i = 0; i < k_comp; ++i) { if ((hash >>> i) & 1) { rank = i + 1; break; } } M[j] = Math.max(M[j], rank & l_mask); } else { var c = 0; for (var i = 0; i < m; ++i) c += M[i]; return 0.79402 * m * Math.pow(2, c / m); } } return {count: count}; } function fnv1a(text) { var hash = 2166136261; for (var i = 0; i < text.length; ++i) { hash ^= text.charCodeAt(i); hash += (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); } return hash >>> 0; } var words = ['aardvark', 'abyssinian', ... ,'zoology']; // about 2 300 words var log_log = LogLog(0.01, 100000); for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i])); alert(log_log.count()); 

由于不明原因,实现对max_error参数非常敏感,是决定结果大小的主要因素。 我敢肯定,有一些愚蠢的错误:)

更新:这个问题在新版本的algorithm中解决。 我稍后会发布它的实施。

这里是基于较新论文的algorithm的更新版本:

 var pow_2_32 = 0xFFFFFFFF + 1; function HyperLogLog(std_error) { function log2(x) { return Math.log(x) / Math.LN2; } function rank(hash, max) { var r = 1; while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; } return r; } var m = 1.04 / std_error; var k = Math.ceil(log2(m * m)), k_comp = 32 - k; m = Math.pow(2, k); var alpha_m = m == 16 ? 0.673 : m == 32 ? 0.697 : m == 64 ? 0.709 : 0.7213 / (1 + 1.079 / m); var M = []; for (var i = 0; i < m; ++i) M[i] = 0; function count(hash) { if (hash !== undefined) { var j = hash >>> k_comp; M[j] = Math.max(M[j], rank(hash, k_comp)); } else { var c = 0.0; for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]); var E = alpha_m * m * m / c; // -- make corrections if (E <= 5/2 * m) { var V = 0; for (var i = 0; i < m; ++i) if (M[i] == 0) ++V; if (V > 0) E = m * Math.log(m / V); } else if (E > 1/30 * pow_2_32) E = -pow_2_32 * Math.log(1 - E / pow_2_32); // -- return E; } } return {count: count}; } function fnv1a(text) { var hash = 2166136261; for (var i = 0; i < text.length; ++i) { hash ^= text.charCodeAt(i); hash += (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); } return hash >>> 0; } var words = ['aardvark', 'abyssinian', ..., 'zoology']; // 2336 words var seed = Math.floor(Math.random() * pow_2_32); // make more fun var log_log = HyperLogLog(0.065); for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]) ^ seed); var count = log_log.count(); alert(count + ', error ' + (count - words.length) / (words.length / 100.0) + '%'); 

这是一个稍加修改的版本,增加了合并操作。

合并允许您从HyperLogLog的多个实例中获取计数器,并确定整个独特的计数器。

例如,如果您在星期一,星期二和星期三收集了独特的访问者,那么您可以将这些存储桶合并在一起,并统计三天内的唯一访问者数量:

 var pow_2_32 = 0xFFFFFFFF + 1; function HyperLogLog(std_error) { function log2(x) { return Math.log(x) / Math.LN2; } function rank(hash, max) { var r = 1; while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; } return r; } var m = 1.04 / std_error; var k = Math.ceil(log2(m * m)), k_comp = 32 - k; m = Math.pow(2, k); var alpha_m = m == 16 ? 0.673 : m == 32 ? 0.697 : m == 64 ? 0.709 : 0.7213 / (1 + 1.079 / m); var M = []; for (var i = 0; i < m; ++i) M[i] = 0; function merge(other) { for (var i = 0; i < m; i++) M[i] = Math.max(M[i], other.buckets[i]); } function count(hash) { if (hash !== undefined) { var j = hash >>> k_comp; M[j] = Math.max(M[j], rank(hash, k_comp)); } else { var c = 0.0; for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]); var E = alpha_m * m * m / c; // -- make corrections if (E <= 5/2 * m) { var V = 0; for (var i = 0; i < m; ++i) if (M[i] == 0) ++V; if (V > 0) E = m * Math.log(m / V); } else if (E > 1/30 * pow_2_32) E = -pow_2_32 * Math.log(1 - E / pow_2_32); // -- return E; } } return {count: count, merge: merge, buckets: M}; } function fnv1a(text) { var hash = 2166136261; for (var i = 0; i < text.length; ++i) { hash ^= text.charCodeAt(i); hash += (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); } return hash >>> 0; } 

那么你可以做这样的事情:

 // initialize one counter per day var ll_monday = HyperLogLog(0.01); var ll_tuesday = HyperLogLog(0.01); var ll_wednesday = HyperLogLog(0.01); // add 5000 unique values in each day for(var i=0; i<5000; i++) ll_monday.count(fnv1a('' + Math.random())); for(var i=0; i<5000; i++) ll_tuesday.count(fnv1a('' + Math.random())); for(var i=0; i<5000; i++) ll_wednesday.count(fnv1a('' + Math.random())); // add 5000 values which appear every day for(var i=0; i<5000; i++) {ll_monday.count(fnv1a(''+i)); ll_tuesday.count(fnv1a('' + i)); ll_wednesday.count(fnv1a('' + i));} // merge three days together together = HyperLogLog(0.01); together.merge(ll_monday); together.merge(ll_tuesday); together.merge(ll_wednesday); // report console.log('unique per day: ' + Math.round(ll_monday.count()) + ' ' + Math.round(ll_tuesday.count()) + ' ' + Math.round(ll_wednesday.count())); console.log('unique numbers overall: ' + Math.round(together.count())); 

我们已经开源了一个名为Stream-Lib的项目,它有一个LogLog实现 。 这项工作是基于这个文件 。

使用提供的js版本@actual,我试图在C#中实现它,这似乎足够接近。 刚刚改变了一点fnv1a函数,并重命名为getHashCode。 (感谢Jenkins散列函数, http://en.wikipedia.org/wiki/Jenkins_hash_function )

 public class HyperLogLog { private double mapSize, alpha_m, k; private int kComplement; private Dictionary<int, int> Lookup = new Dictionary<int, int>(); private const double pow_2_32 = 4294967297; public HyperLogLog(double stdError) { mapSize = (double)1.04 / stdError; k = (long)Math.Ceiling(log2(mapSize * mapSize)); kComplement = 32 - (int)k; mapSize = (long)Math.Pow(2, k); alpha_m = mapSize == 16 ? (double)0.673 : mapSize == 32 ? (double)0.697 : mapSize == 64 ? (double)0.709 : (double)0.7213 / (double)(1 + 1.079 / mapSize); for (int i = 0; i < mapSize; i++) Lookup[i] = 0; } private static double log2(double x) { return Math.Log(x) / 0.69314718055994530941723212145818;//Ln2 } private static int getRank(uint hash, int max) { int r = 1; uint one = 1; while ((hash & one) == 0 && r <= max) { ++r; hash >>= 1; } return r; } public static uint getHashCode(string text) { uint hash = 0; for (int i = 0, l = text.Length; i < l; i++) { hash += (uint)text[i]; hash += hash << 10; hash ^= hash >> 6; } hash += hash << 3; hash ^= hash >> 6; hash += hash << 16; return hash; } public int Count() { double c = 0, E; for (var i = 0; i < mapSize; i++) c += 1d / Math.Pow(2, (double)Lookup[i]); E = alpha_m * mapSize * mapSize / c; // Make corrections & smoothen things. if (E <= (5 / 2) * mapSize) { double V = 0; for (var i = 0; i < mapSize; i++) if (Lookup[i] == 0) V++; if (V > 0) E = mapSize * Math.Log(mapSize / V); } else if (E > (1 / 30) * pow_2_32) E = -pow_2_32 * Math.Log(1 - E / pow_2_32); // Made corrections & smoothen things, or not. return (int)E; } public void Add(object val) { uint hashCode = getHashCode(val.ToString()); int j = (int)(hashCode >> kComplement); Lookup[j] = Math.Max(Lookup[j], getRank(hashCode, kComplement)); } } 

我知道这是一个旧的post,但@buryat的实施已经移动,并在任何情况下不完整,有点慢(对不起o_o)。

我已经采取了新的Redis版本所使用的实现,可以在这里find并将其移植到PHP。 回购是在这里https://github.com/joegreen0991/HyperLogLog

 <?php class HyperLogLog { private $HLL_P_MASK; private $HLL_REGISTERS; private $ALPHA; private $registers; public function __construct($HLL_P = 14) { $this->HLL_REGISTERS = (1 << $HLL_P); /* With P=14, 16384 registers. */ $this->HLL_P_MASK = ($this->HLL_REGISTERS - 1); /* Mask to index register. */ $this->ALPHA = 0.7213 / (1 + 1.079 / $this->HLL_REGISTERS); $this->registers = new SplFixedArray($this->HLL_REGISTERS); for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { $this->registers[$i] = 0; } } public function add($v) { $h = crc32(md5($v)); $h |= 1 << 63; /* Make sure the loop terminates. */ $bit = $this->HLL_REGISTERS; /* First bit not used to address the register. */ $count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */ while(($h & $bit) == 0) { $count++; $bit <<= 1; } /* Update the register if this element produced a longer run of zeroes. */ $index = $h & $this->HLL_P_MASK; /* Index a register inside registers. */ if ($this->registers[$index] < $count) { $this->registers[$index] = $count; } } public function export() { $str = ''; for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { $str .= chr($this->registers[$i]); } return $str; } public function import($str) { for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { $this->registers[$i] = isset($str[$i]) ? ord($str[$i]) : 0; } } public function merge($str) { for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { if(isset($str[$i])) { $ord = ord($str[$i]); if ($this->registers[$i] < $ord) { $this->registers[$i] = $ord; } } } } /** * @static * @param $arr * @return int Number of unique items in $arr */ public function count() { $E = 0; $ez = 0; for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { if ($this->registers[$i] !== 0) { $E += (1.0 / pow(2, $this->registers[$i])); } else { $ez++; $E += 1.0; } } $E = (1 / $E) * $this->ALPHA * $this->HLL_REGISTERS * $this->HLL_REGISTERS; /* Use the LINEARCOUNTING algorithm for small cardinalities. * For larger values but up to 72000 HyperLogLog raw approximation is * used since linear counting error starts to increase. However HyperLogLog * shows a strong bias in the range 2.5*16384 - 72000, so we try to * compensate for it. */ if ($E < $this->HLL_REGISTERS * 2.5 && $ez != 0) { $E = $this->HLL_REGISTERS * log($this->HLL_REGISTERS / $ez); } else if ($this->HLL_REGISTERS == 16384 && $E < 72000) { // We did polynomial regression of the bias for this range, this // way we can compute the bias for a given cardinality and correct // according to it. Only apply the correction for P=14 that's what // we use and the value the correction was verified with. $bias = 5.9119 * 1.0e-18 * ($E*$E*$E*$E) -1.4253 * 1.0e-12 * ($E*$E*$E)+ 1.2940 * 1.0e-7 * ($E*$E) -5.2921 * 1.0e-3 * $E+ 83.3216; $E -= $E * ($bias/100); } return floor($E); } } 

我在JS和PHP中实现了loglog和hyperloglog,以及很好的代码https://github.com/buryat/loglog