获取两个列表之间的区别

我有两个Python列表,如下所示:

temp1 = ['One', 'Two', 'Three', 'Four'] temp2 = ['One', 'Two'] 

我需要创建第三个列表,第一个列表中的项目不在第二个列表中。 从我必须得到的例子:

 temp3 = ['Three', 'Four'] 

有没有不用循环和逐项检查的快速方法?

 In [5]: list(set(temp1) - set(temp2)) Out[5]: ['Four', 'Three'] 

当心

 In [5]: set([1, 2]) - set([2, 3]) Out[5]: set([1]) 

你可能期望/希望它等于set([1, 3]) 。 如果你想set([1, 3])作为你的答案,你需要使用set([1, 2]).symmetric_difference(set([2, 3]))

现有的解决方案都提供以下两种方法之一:

  • 比O(n * m)的性能更快。
  • 保留输入列表的顺序。

但到目前为止,还没有一个解决方案能同时做到这两点。 如果两者都想要,试试这个:

 s = set(temp2) temp3 = [x for x in temp1 if x not in s] 

性能测试

 import timeit init = 'temp1 = list(range(100)); temp2 = [i * 2 for i in range(50)]' print timeit.timeit('list(set(temp1) - set(temp2))', init, number = 100000) print timeit.timeit('s = set(temp2);[x for x in temp1 if x not in s]', init, number = 100000) print timeit.timeit('[item for item in temp1 if item not in temp2]', init, number = 100000) 

结果:

 4.34620224079 # ars' answer 4.2770634955 # This answer 30.7715615392 # matt b's answer 

我提出的方法除了保留顺序之外,还比集合减法(略)快,因为它不需要构造多余的集合。 如果第一个列表比第二个列表长得多,并且哈希计算的开销较大,性能差异将更加明显。 下面是第二个测试,可以证明这一点:

 init = ''' temp1 = [str(i) for i in range(100000)] temp2 = [str(i * 2) for i in range(50)] ''' 

结果:

 11.3836875916 # ars' answer 3.63890368748 # this answer (3 times faster!) 37.7445402279 # matt b's answer 
 temp3 = [item for item in temp1 if item not in temp2] 

如果你想递归的差异,我已经写了一个python包: https : //github.com/seperman/deepdiff

安装

从PyPi安装:

 pip install deepdiff 

用法示例

输入

 >>> from deepdiff import DeepDiff >>> from pprint import pprint >>> from __future__ import print_function # In case running on Python 2 

相同的对象返回空

 >>> t1 = {1:1, 2:2, 3:3} >>> t2 = t1 >>> print(DeepDiff(t1, t2)) {} 

项目的类型已更改

 >>> t1 = {1:1, 2:2, 3:3} >>> t2 = {1:1, 2:"2", 3:3} >>> pprint(DeepDiff(t1, t2), indent=2) { 'type_changes': { 'root[2]': { 'newtype': <class 'str'>, 'newvalue': '2', 'oldtype': <class 'int'>, 'oldvalue': 2}}} 

一个项目的价值已经改变

 >>> t1 = {1:1, 2:2, 3:3} >>> t2 = {1:1, 2:4, 3:3} >>> pprint(DeepDiff(t1, t2), indent=2) {'values_changed': {'root[2]': {'newvalue': 4, 'oldvalue': 2}}} 

项目添加和/或删除

 >>> t1 = {1:1, 2:2, 3:3, 4:4} >>> t2 = {1:1, 2:4, 3:3, 5:5, 6:6} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff) {'dic_item_added': ['root[5]', 'root[6]'], 'dic_item_removed': ['root[4]'], 'values_changed': {'root[2]': {'newvalue': 4, 'oldvalue': 2}}} 

字符串差异

 >>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world"}} >>> t2 = {1:1, 2:4, 3:3, 4:{"a":"hello", "b":"world!"}} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff, indent = 2) { 'values_changed': { 'root[2]': {'newvalue': 4, 'oldvalue': 2}, "root[4]['b']": { 'newvalue': 'world!', 'oldvalue': 'world'}}} 

字符串差异2

 >>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world!\nGoodbye!\n1\n2\nEnd"}} >>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world\n1\n2\nEnd"}} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff, indent = 2) { 'values_changed': { "root[4]['b']": { 'diff': '--- \n' '+++ \n' '@@ -1,5 +1,4 @@\n' '-world!\n' '-Goodbye!\n' '+world\n' ' 1\n' ' 2\n' ' End', 'newvalue': 'world\n1\n2\nEnd', 'oldvalue': 'world!\n' 'Goodbye!\n' '1\n' '2\n' 'End'}}} >>> >>> print (ddiff['values_changed']["root[4]['b']"]["diff"]) --- +++ @@ -1,5 +1,4 @@ -world! -Goodbye! +world 1 2 End 

类型更改

 >>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}} >>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world\n\n\nEnd"}} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff, indent = 2) { 'type_changes': { "root[4]['b']": { 'newtype': <class 'str'>, 'newvalue': 'world\n\n\nEnd', 'oldtype': <class 'list'>, 'oldvalue': [1, 2, 3]}}} 

列表差异

 >>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3, 4]}} >>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2]}} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff, indent = 2) {'iterable_item_removed': {"root[4]['b'][2]": 3, "root[4]['b'][3]": 4}} 

清单差异2:

 >>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}} >>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 3, 2, 3]}} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff, indent = 2) { 'iterable_item_added': {"root[4]['b'][3]": 3}, 'values_changed': { "root[4]['b'][1]": {'newvalue': 3, 'oldvalue': 2}, "root[4]['b'][2]": {'newvalue': 2, 'oldvalue': 3}}} 

列表差异忽略顺序或重复:(与上面相同的字典)

 >>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}} >>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 3, 2, 3]}} >>> ddiff = DeepDiff(t1, t2, ignore_order=True) >>> print (ddiff) {} 

包含词典的列表:

 >>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, {1:1, 2:2}]}} >>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, {1:3}]}} >>> ddiff = DeepDiff(t1, t2) >>> pprint (ddiff, indent = 2) { 'dic_item_removed': ["root[4]['b'][2][2]"], 'values_changed': {"root[4]['b'][2][1]": {'newvalue': 3, 'oldvalue': 1}}} 

集:

 >>> t1 = {1, 2, 8} >>> t2 = {1, 2, 3, 5} >>> ddiff = DeepDiff(t1, t2) >>> pprint (DeepDiff(t1, t2)) {'set_item_added': ['root[3]', 'root[5]'], 'set_item_removed': ['root[8]']} 

命名的元组:

 >>> from collections import namedtuple >>> Point = namedtuple('Point', ['x', 'y']) >>> t1 = Point(x=11, y=22) >>> t2 = Point(x=11, y=23) >>> pprint (DeepDiff(t1, t2)) {'values_changed': {'root.y': {'newvalue': 23, 'oldvalue': 22}}} 

自定义对象:

 >>> class ClassA(object): ... a = 1 ... def __init__(self, b): ... self.b = b ... >>> t1 = ClassA(1) >>> t2 = ClassA(2) >>> >>> pprint(DeepDiff(t1, t2)) {'values_changed': {'root.b': {'newvalue': 2, 'oldvalue': 1}}} 

添加对象属性:

 >>> t2.c = "new attribute" >>> pprint(DeepDiff(t1, t2)) {'attribute_added': ['root.c'], 'values_changed': {'root.b': {'newvalue': 2, 'oldvalue': 1}}} 

如果你真的在看性能,那就用numpy吧!

这里是完整的笔记本作为Github上的要点,列表,numpy和熊猫之间的比较。

https://gist.github.com/denfromufa/2821ff59b02e9482be15d27f2bbd4451

在这里输入图像描述

两个列表(如list1和list2)之间的区别可以使用下面的简单函数找到。

 def diff(list1, list2):
     """Return the items that appear in exactly one of the two lists.

     The result is the union of both inputs minus their intersection
     (i.e. the symmetric difference), returned as a list. Duplicates are
     removed and the order of the result is not guaranteed, because the
     computation goes through sets. Items must be hashable.
     """
     # Build each set once instead of converting the inputs repeatedly.
     first, second = set(list1), set(list2)
     c = first.union(second)  # or c = first | second
     d = first.intersection(second)  # or d = first & second
     return list(c - d)

要么

 def diff(list1, list2):
     """Return, as a list, the items found in exactly one of the two inputs.

     Both inputs are converted to sets, so duplicates collapse and the
     order of the returned list is unspecified.
     """
     first = set(list1)
     second = set(list2)
     # ``^`` on two sets is the operator form of symmetric_difference().
     return list(first ^ second)

使用上述函数时,可以通过diff(temp2, temp1)或diff(temp1, temp2)来求出差异。 两者都会给出结果['Four', 'Three'] 。 您不必担心列表的顺序或先传入哪个列表。

Python文档参考

我会折腾,因为目前的解决方案没有产生一个元组:

 temp3 = tuple(set(temp1) - set(temp2)) 

或者:

 #edited using @Mark Byers idea. If you accept this one as answer, just accept his instead. temp3 = tuple(x for x in temp1 if x not in set(temp2)) 

像其他非元组在这个方向上产生答案一样,它保留了顺序

可以使用python XOR操作符来完成。

  • 这将删除每个列表中的重复项
  • 这将显示temp1相对于temp2以及temp2相对于temp1的差异。

 set(temp1) ^ set(temp2) 

尝试这个:

 temp3 = set(temp1) - set(temp2) 

这可能比Mark的列表理解还要快:

 filterfalse(set(temp2).__contains__, temp1) 

我想要一些需要两个列表,并可以做bash中的diff 。 由于这个问题首先弹出搜索“python比较两个列表”,而不是非常具体,我会张贴我想出了。

使用difflib的SequenceMatcher可以像diff那样比较两个列表。 其他答案都不会告诉你差异发生的位置,但这个可以。 有些答案只给出单向的差异。 有些会重新排列元素。 有些不能处理重复项。 而这个解决方案给出的是两个列表之间真正的差异:

 from difflib import SequenceMatcher

 # Two word lists to compare, in the spirit of the command-line ``diff`` tool.
 a = 'A quick fox jumps the lazy dog'.split()
 b = 'A quick brown mouse jumps over the dog'.split()

 # Each opcode is (tag, i, j, k, l): the slice a[i:j] relates to the slice
 # b[k:l] in the way named by ``tag`` ('equal', 'delete', 'insert', 'replace').
 for tag, i, j, k, l in SequenceMatcher(None, a, b).get_opcodes():
     if tag == 'equal':
         print('both have', a[i:j])
     # A 'replace' is reported from both sides: what the first list had and
     # what the second list has instead.
     if tag in ('delete', 'replace'):
         print(' 1st has', a[i:j])
     if tag in ('insert', 'replace'):
         print(' 2nd has', b[k:l])

这输出:

 both have ['A', 'quick'] 1st has ['fox'] 2nd has ['brown', 'mouse'] both have ['jumps'] 2nd has ['over'] both have ['the'] 1st has ['lazy'] both have ['dog'] 

当然,如果你的应用程序做出与其他答案相同的假设,你将从中受益最多。 但是,如果你正在寻找一个真正的diff功能,那么这是唯一的出路。

例如,没有其他答案可以处理:

 a = [1,2,3,4,5] b = [5,4,3,2,1] 

但是这个方法可以:

  2nd has [5, 4, 3, 2] both have [1] 1st has [2, 3, 4, 5] 

如果碰到TypeError: unhashable type: 'list'你需要将列表或集合转换为元组,例如:

 set(map(tuple, list_of_lists1)).symmetric_difference(set(map(tuple, list_of_lists2))) 

另请参见如何比较Python中的列表/集列表?

如果对difflist的元素进行排序和设置,则可以使用朴素的方法。

 list1=[1,2,3,4,5] list2=[1,2,3] print list1[len(list2):] 

或使用本机设置的方法:

 subset=set(list1).difference(list2) print subset import timeit init = 'temp1 = list(range(100)); temp2 = [i * 2 for i in range(50)]' print "Naive solution: ", timeit.timeit('temp1[len(temp2):]', init, number = 100000) print "Native set solution: ", timeit.timeit('set(temp1).difference(temp2)', init, number = 100000) 

天真的解决方案:0.0787101593292

本机解决方案:0.998837615564

如果你想要更像一个变更集…可以使用计数器

 from collections import Counter


 def diff(a, b):
     """Return a Counter describing how to turn ``a`` into ``b``.

     Positive counts are items that must be added to ``a``; negative counts
     are items that must be removed from it. Items whose multiplicity is the
     same in both inputs do not appear. More verbose than it needs to be,
     for clarity.
     """
     ca, cb = Counter(a), Counter(b)
     # Counter subtraction keeps only positive counts, so each direction
     # yields just the surplus on that side.
     to_add = cb - ca
     to_remove = ca - cb
     changes = Counter(to_add)
     # subtract() (unlike ``-``) keeps negative counts, which mark removals.
     changes.subtract(to_remove)
     return changes


 lista = ['one', 'three', 'four', 'four', 'one']
 listb = ['one', 'two', 'three']

 # In [127]: diff(lista, listb)
 # Out[127]: Counter({'two': 1, 'one': -1, 'four': -2})
 # in order to go from lista to listb, you need to add a "two", remove a "one",
 # and remove two "four"s

 # In [128]: diff(listb, lista)
 # Out[128]: Counter({'four': 2, 'one': 1, 'two': -1})
 # in order to go from listb to lista, you must add two "four"s, add a "one",
 # and remove a "two"

这是最简单的情况的答案。

这比上面那个做双向差异的答案更短,因为它只做问题所要求的事情:生成一个列表,包含在第一个列表中而不在第二个列表中的项目。

 from collections import Counter

 lst1 = ['One', 'Two', 'Three', 'Four']
 lst2 = ['One', 'Two']

 c1 = Counter(lst1)
 c2 = Counter(lst2)
 # Counter subtraction drops items whose count falls to zero or below;
 # elements() then repeats each remaining item as many times as its count.
 diff = list((c1 - c2).elements())

另外,根据您的可读性偏好,这也是一个不错的选择:

 diff = list((Counter(lst1) - Counter(lst2)).elements()) 

输出:

 ['Three', 'Four'] 

请注意,如果您只是遍历它,您可以删除list(...)调用。

因为这个解决方案使用计数器,所以它能正确处理元素的数量,这一点不同于许多基于集合的答案。 例如在这个输入上:

 lst1 = ['One', 'Two', 'Two', 'Two', 'Three', 'Three', 'Four'] lst2 = ['One', 'Two'] 

输出是:

 ['Two', 'Two', 'Three', 'Three', 'Four'] 

这是另一个解决方案:

 def diff(a, b):
     """Return the items that occur in only one of ``a`` and ``b``.

     Unique items of ``a`` missing from ``b`` come first, followed by unique
     items of ``b`` missing from ``a``. Within each half the order follows
     set iteration and is therefore unspecified. Items must be hashable.
     """
     # Build the sets once so each membership test is a hash lookup rather
     # than a linear scan of the other list.
     set_a, set_b = set(a), set(b)
     xa = [i for i in set_a if i not in set_b]
     xb = [i for i in set_b if i not in set_a]
     return xa + xb

arulmr解决方案的单线版本

 def diff(listA, listB):
     """Return the set of items present in exactly one of the two lists.

     The original expression subtracted ``listB`` from ``listA`` on both
     sides of ``|``, so items found only in ``listB`` were silently dropped.
     The second operand is now ``set(listB) - set(listA)``, which makes the
     result the symmetric difference regardless of argument order.
     """
     only_in_a = set(listA) - set(listB)
     only_in_b = set(listB) - set(listA)
     return only_in_a | only_in_b

我们可以计算两个列表的并集减去它们的交集:

 temp1 = ['One', 'Two', 'Three', 'Four'] temp2 = ['One', 'Two', 'Five'] set(temp1+temp2)-(set(temp1)&set(temp2)) Out: set(['Four', 'Five', 'Three']) 

这可以用一行代码来解决。 问题是:给定两个列表(temp1和temp2),在第三个列表(temp3)中返回它们的差异。

 temp3 = list(set(temp1).difference(set(temp2))) 
 (list(set(a)-set(b))+list(set(b)-set(a))) 

我在游戏中为时太晚,但是你可以把上面提到的一些代码的性能与这个进行比较,其中两个最快的竞争者是,

 list(set(x).symmetric_difference(set(y))) list(set(x) ^ set(y)) 

我对编码的基本层面表示歉意。

 import time
 import random
 from itertools import filterfalse

 # 1 - performance (time taken)
 # 2 - correctness (answer - 1,4,5,6)
 # set performance
 performance = 1
 numberoftests = 7


 def answer(x, y, z):
     """Run list-difference strategy number ``z`` on ``x`` and ``y`` and time it.

     Args:
         x: First list of hashable items.
         y: Second list of hashable items.
         z: Strategy selector. 0-2 and anything above 5 compute the two-way
             (symmetric) difference and return it as a string; 3-5 compute
             the one-way difference ``x`` minus ``y`` and return it as a
             list (3, 5) or a tuple (4).

     Returns:
         A ``(lists, times)`` tuple: the computed difference and a label of
         the form ``"<strategy number> = <elapsed seconds>"``.
     """
     # time.clock() was removed in Python 3.8; perf_counter() is the
     # replacement intended for measuring short durations.
     start = time.perf_counter()
     if z == 0:
         # Fixed: the second operand used to be set(y) - set(y), which is
         # always empty, so items found only in ``y`` were never reported.
         lists = str(list(set(x) - set(y)) + list(set(y) - set(x)))
         label = "1"
     elif z == 1:
         lists = str(list(set(x).symmetric_difference(set(y))))
         label = "2"
     elif z == 2:
         lists = str(list(set(x) ^ set(y)))
         label = "3"
     elif z == 3:
         # filterfalse is lazy; materialise it so the measured time covers
         # the actual filtering rather than just creating the iterator.
         lists = list(filterfalse(set(y).__contains__, x))
         label = "4"
     elif z == 4:
         lists = tuple(set(x) - set(y))
         label = "5"
     elif z == 5:
         lists = [tt for tt in x if tt not in y]
         label = "6"
     else:
         Xarray = [iDa for iDa in x if iDa not in y]
         Yarray = [iDb for iDb in y if iDb not in x]
         lists = str(Xarray + Yarray)
         label = "7"
     times = label + " = " + str(time.perf_counter() - start)
     return (lists, times)


 # Only run the benchmark when executed as a script, so importing this
 # module to reuse answer() does not trigger a long-running loop.
 if __name__ == "__main__":
     n = numberoftests

     if performance == 2:
         a = [1, 2, 3, 4, 5]
         b = [3, 2, 6]
         for c in range(n):
             d = answer(a, b, c)
             print(d[0])

     elif performance == 1:
         for tests in range(10):
             print("Test Number" + str(tests + 1))
             a = random.sample(range(1, 900000), 9999)
             b = random.sample(range(1, 900000), 9999)
             for c in range(n):
                 # To time only a subset of strategies, guard the two lines
                 # below with e.g. ``if c not in (1, 4, 5, 6):``.
                 d = answer(a, b, c)
                 print(d[1])