Python，计算列表差异

在Python中，计算两个列表之间差异的最好方法是什么？

例

A = [1,2,3,4] B = [2,5] A - B = [1,3,4] B - A = [5]

如果您不关心项目顺序或重复，请使用set 。如果你这样做，使用列表parsing ：

 >>> def diff(first, second): second = set(second) return [item for item in first if item not in second] >>> diff(A, B) [1, 3, 4] >>> diff(B, A) [5] >>>

 >>> set([1,2,3,4]) - set([2,5]) set([1, 3, 4]) >>> set([2,5]) - set([1,2,3,4]) set([5])

你可以做一个

 list(set(A)-set(B))

和

 list(set(B)-set(A))

一个class轮：

 diff = lambda l1,l2: [x for x in l1 if x not in l2] diff(A,B) diff(B,A)

要么：

 diff = lambda l1,l2: filter(lambda x: x not in l2, l1) diff(A,B) diff(B,A)

上面的例子简化了计算差异的问题。假设sorting或重复数据删除肯定会更容易计算差异，但如果您的比较不能承受这些假设，那么您将需要一个不重复的差异algorithm实现。请参阅python标准库中的difflib。

 from difflib import SequenceMatcher squeeze=SequenceMatcher( None, A, B ) print "A - B = [%s]"%( reduce( lambda p,q: p+q, map( lambda t: squeeze.a[t[1]:t[2]], filter(lambda x:x[0]!='equal', squeeze.get_opcodes() ) ) ) )

A – B = [[1,3,4]]

Python 2.7.3（默认，2014年2月27日，19:58:35） – IPython 1.1.0 – timeit：（github gist）

 def diff(a, b): b = set(b) return [aa for aa in a if aa not in b] def set_diff(a, b): return list(set(a) - set(b)) diff_lamb_hension = lambda l1,l2: [x for x in l1 if x not in l2] diff_lamb_filter = lambda l1,l2: filter(lambda x: x not in l2, l1) from difflib import SequenceMatcher def squeezer(a, b): squeeze = SequenceMatcher(None, a, b) return reduce(lambda p,q: p+q, map( lambda t: squeeze.a[t[1]:t[2]], filter(lambda x:x[0]!='equal', squeeze.get_opcodes())))

结果：

 # Small a = range(10) b = range(10/2) timeit[diff(a, b)] 100000 loops, best of 3: 1.97 µs per loop timeit[set_diff(a, b)] 100000 loops, best of 3: 2.71 µs per loop timeit[diff_lamb_hension(a, b)] 100000 loops, best of 3: 2.1 µs per loop timeit[diff_lamb_filter(a, b)] 100000 loops, best of 3: 3.58 µs per loop timeit[squeezer(a, b)] 10000 loops, best of 3: 36 µs per loop # Medium a = range(10**4) b = range(10**4/2) timeit[diff(a, b)] 1000 loops, best of 3: 1.17 ms per loop timeit[set_diff(a, b)] 1000 loops, best of 3: 1.27 ms per loop timeit[diff_lamb_hension(a, b)] 1 loops, best of 3: 736 ms per loop timeit[diff_lamb_filter(a, b)] 1 loops, best of 3: 732 ms per loop timeit[squeezer(a, b)] 100 loops, best of 3: 12.8 ms per loop # Big a = xrange(10**7) b = xrange(10**7/2) timeit[diff(a, b)] 1 loops, best of 3: 1.74 s per loop timeit[set_diff(a, b)] 1 loops, best of 3: 2.57 s per loop timeit[diff_lamb_filter(a, b)] # too long to wait for timeit[diff_lamb_filter(a, b)] # too long to wait for timeit[diff_lamb_filter(a, b)] # TypeError: sequence index must be integer, not 'slice'

@ roman-bodnarchuk列表推导函数def diff（a，b）似乎更快。

你会想使用一个set而不是一个list 。

 A = [1,2,3,4] B = [2,5] #A - B x = list(set(A) - set(B)) #B - A y = list(set(B) - set(A)) print x print y

如果你想recursion深入到你的列表项，我已经写了一个python包： https ： //github.com/erasmose/deepdiff

安装

从PyPi安装：

 pip install deepdiff

如果你是Python3，你还需要安装：

 pip install future six

用法示例

 >>> from deepdiff import DeepDiff >>> from pprint import pprint >>> from __future__ import print_function

相同的对象返回空

 >>> t1 = {1:1, 2:2, 3:3} >>> t2 = t1 >>> ddiff = DeepDiff(t1, t2) >>> print (ddiff.changes) {}

项目的types已更改

 >>> t1 = {1:1, 2:2, 3:3} >>> t2 = {1:1, 2:"2", 3:3} >>> ddiff = DeepDiff(t1, t2) >>> print (ddiff.changes) {'type_changes': ["root[2]: 2=<type 'int'> vs. 2=<type 'str'>"]}

一个项目的价值已经改变

 >>> t1 = {1:1, 2:2, 3:3} >>> t2 = {1:1, 2:4, 3:3} >>> ddiff = DeepDiff(t1, t2) >>> print (ddiff.changes) {'values_changed': ['root[2]: 2 ====>> 4']}

项目添加和/或删除

 >>> t1 = {1:1, 2:2, 3:3, 4:4} >>> t2 = {1:1, 2:4, 3:3, 5:5, 6:6} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff.changes) {'dic_item_added': ['root[5, 6]'], 'dic_item_removed': ['root[4]'], 'values_changed': ['root[2]: 2 ====>> 4']}

string差异

 >>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world"}} >>> t2 = {1:1, 2:4, 3:3, 4:{"a":"hello", "b":"world!"}} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff.changes, indent = 2) { 'values_changed': [ 'root[2]: 2 ====>> 4', "root[4]['b']:\n--- \n+++ \n@@ -1 +1 @@\n-world\n+world!"]} >>> >>> print (ddiff.changes['values_changed'][1]) root[4]['b']: --- +++ @@ -1 +1 @@ -world +world!

string差异2

 >>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world!\nGoodbye!\n1\n2\nEnd"}} >>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world\n1\n2\nEnd"}} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff.changes, indent = 2) { 'values_changed': [ "root[4]['b']:\n--- \n+++ \n@@ -1,5 +1,4 @@\n-world!\n-Goodbye!\n+world\n 1\n 2\n End"]} >>> >>> print (ddiff.changes['values_changed'][0]) root[4]['b']: --- +++ @@ -1,5 +1,4 @@ -world! -Goodbye! +world 1 2 End

types更改

 >>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}} >>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world\n\n\nEnd"}} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff.changes, indent = 2) { 'type_changes': [ "root[4]['b']: [1, 2, 3]=<type 'list'> vs. world\n\n\nEnd=<type 'str'>"]}

列表差异

 >>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}} >>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2]}} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff.changes, indent = 2) { 'list_removed': ["root[4]['b']: [3]"]}

清单差异2：请注意，它不考虑订单

 >>> # Note that it DOES NOT take order into account ... t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}} >>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 3, 2]}} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff.changes, indent = 2) { }

包含词典的列表：

 >>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, {1:1, 2:2}]}} >>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, {1:3}]}} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff.changes, indent = 2) { 'dic_item_removed': ["root[4]['b'][2][2]"], 'values_changed': ["root[4]['b'][2][1]: 1 ====>> 3"]}

在词典列表的情况下，完整列表理解解决scheme在set解决scheme提出时起作用

 TypeError: unhashable type: 'dict'

testing用例

 def diff(a, b): return [aa for aa in a if aa not in b] d1 = {"a":1, "b":1} d2 = {"a":2, "b":2} d3 = {"a":3, "b":3} >>> diff([d1, d2, d3], [d2, d3]) [{'a': 1, 'b': 1}] >>> diff([d1, d2, d3], [d1]) [{'a': 2, 'b': 2}, {'a': 3, 'b': 3}]

在查看In-operator的TimeComplexity时，最糟糕的情况是它与O（n）一起工作。即使为集。

所以在比较两个数组的时候，最好的情况是O（n）的时间复杂度，最坏的情况是O（n ^ 2）。

另一个（但不幸的是更复杂的）解决scheme，在最好和最坏的情况下与O（n）一起工作是这样的：

 # Compares the difference of list a and b # uses a callback function to compare items def diff(a, b, callback): a_missing_in_b = [] ai = 0 bi = 0 a = sorted(a, callback) b = sorted(b, callback) while (ai < len(a)) and (bi < len(b)): cmp = callback(a[ai], b[bi]) if cmp < 0: a_missing_in_b.append(a[ai]) ai += 1 elif cmp > 0: # Item b is missing in a bi += 1 else: # a and b intersecting on this item ai += 1 bi += 1 # if a and b are not of same length, we need to add the remaining items for ai in xrange(ai, len(a)): a_missing_in_b.append(a[ai]) return a_missing_in_b

例如

 >>> a=[1,2,3] >>> b=[2,4,6] >>> diff(a, b, cmp) [1, 3]

Python，计算列表差异

安装

用法示例

testing用例

列表到数组的转换

algorithm来生成列表的所有可能的排列？

为什么我们有map，fmap和liftM？

不能修改循环中的列表元素Python

如何切换Python列表中的两个项目的位置？

自定义Python列表sorting

检查列表<t>是否包含任何其他列表

recursion子文件夹search和返回列表python中的文件

将DataSet转换为List

检查平面列表中的重复项