
我需要在 Python 中获取大文件(数十万行)的行数。在内存和时间上最高效的方式是什么?


def file_len(fname):
    """Return the number of lines in the file at ``fname``.

    The file is iterated line by line rather than read in one piece.  A
    final line without a trailing newline is still counted.

    :param fname: path of the file to read
    :return: number of lines; ``0`` for an empty file
    """
    # Start at 0 so an empty file (the loop body never runs) yields 0
    # instead of raising UnboundLocalError on an unassigned loop variable.
    count = 0
    with open(fname) as f:
        for count, _ in enumerate(f, 1):
            pass
    return count



毕竟,任何解决方案都必须读取整个文件,找出其中有多少个 \n,并返回结果。

不读取整个文件,你还有更好的办法吗?我不确定……最好的解决方案始终会受 I/O 限制,你能做的就是确保不占用不必要的内存,而这一点看起来你已经做到了。


我相信内存映射文件会是最快的解决方案。我测试了四个函数:OP 发布的函数(opcount);对文件各行进行简单迭代(simplecount);对内存映射文件(mmap)调用 readline(mapcount);以及 Mykola Kharechko 提供的缓冲区读取方案(bufcount)。


Windows XP,Python 2.5,2GB内存,2GHz AMD处理器


 mapcount : 0.465599966049 simplecount : 0.756399965286 bufcount : 0.546800041199 opcount : 0.718600034714 

编辑 :Python 2.6的数字:

 mapcount : 0.471799945831 simplecount : 0.634400033951 bufcount : 0.468800067902 opcount : 0.602999973297 

所以缓冲区读取策略似乎是Windows / Python 2.6中最快的


 from __future__ import with_statement
 import time
 import mmap
 import random
 from collections import defaultdict


 def mapcount(filename):
     """Count lines by calling ``readline`` on a memory map of the file.

     The file is opened in ``"r+"`` mode, so it must be writable.
     """
     with open(filename, "r+") as f:
         buf = mmap.mmap(f.fileno(), 0)
         try:
             lines = 0
             readline = buf.readline  # loop optimization
             while readline():
                 lines += 1
             return lines
         finally:
             # Release the mapping even if reading fails.
             buf.close()


 def simplecount(filename):
     """Count lines by iterating over the file object."""
     lines = 0
     with open(filename) as f:
         for line in f:
             lines += 1
     return lines


 def bufcount(filename):
     """Count newline characters by reading the file in 1 MiB pieces.

     Only ``'\\n'`` characters are counted, so a last line that lacks a
     trailing newline is not included.
     """
     with open(filename) as f:
         lines = 0
         buf_size = 1024 * 1024
         read_f = f.read  # loop optimization
         buf = read_f(buf_size)
         while buf:
             lines += buf.count('\n')
             buf = read_f(buf_size)
     return lines


 def opcount(fname):
     """Count lines with ``enumerate`` (the approach from the question)."""
     # Start at 0 so an empty file returns 0 instead of failing on an
     # unassigned loop variable.
     count = 0
     with open(fname) as f:
         for count, _ in enumerate(f, 1):
             pass
     return count


 if __name__ == "__main__":
     # Time each counter five times against the same file and print the
     # average per function.
     counts = defaultdict(list)
     for i in range(5):
         for func in [mapcount, simplecount, bufcount, opcount]:
             start_time = time.time()
             assert func("big_file.txt") == 1209138
             counts[func].append(time.time() - start_time)

     for key, vals in counts.items():
         # Single-argument print(...) gives the same output on Python 2 and 3.
         print("%s : %s" % (key.__name__, sum(vals) / float(len(vals))))

你可以执行一个subprocess并运行wc -l filename

 import subprocess


 def file_len(fname):
     """Return the line count of ``fname`` as reported by ``wc -l``.

     :param fname: path of the file to count
     :return: int, the count printed by ``wc -l``
     :raises IOError: if ``wc`` exits with a non-zero status; the message
         is whatever ``wc`` wrote to stderr
     """
     # "--" ends option parsing, so a path that starts with "-" is treated
     # as a file name rather than as a wc option.
     p = subprocess.Popen(['wc', '-l', '--', fname],
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
     result, err = p.communicate()
     if p.returncode != 0:
         raise IOError(err)
     # The count is the first whitespace-separated field of the output.
     return int(result.strip().split()[0])


所有这些解决方案都忽略了一种能让运行速度显著提升的做法:使用无缓冲的(原始)接口,直接处理字节(bytes),并自行完成缓冲。(这仅适用于 Python 3。在 Python 2 中,默认情况下可能使用也可能不使用原始接口;而在 Python 3 中,默认会使用 Unicode。)


 def rawcount(filename):
     """Count ``b'\\n'`` bytes in a file using the unbuffered raw reader.

     The file is read in 1 MiB pieces through ``f.raw.read``.  Only newline
     bytes are counted, so a last line without a trailing newline is not
     included.

     :param filename: path of the file to read
     :return: number of newline bytes in the file
     """
     # The context manager closes the file even if a read fails.
     with open(filename, 'rb') as f:
         lines = 0
         buf_size = 1024 * 1024
         read_f = f.raw.read
         buf = read_f(buf_size)
         while buf:
             lines += buf.count(b'\n')
             buf = read_f(buf_size)
     return lines


 def _make_gen(reader): b = reader(1024 * 1024) while b: yield b b = reader(1024*1024) def rawgencount(filename): f = open(filename, 'rb') f_gen = _make_gen(f.raw.read) return sum( buf.count(b'\n') for buf in f_gen ) 


 from itertools import (takewhile,repeat)


 def rawincount(filename):
     """Count ``b'\\n'`` bytes in a file using ``takewhile`` over raw reads.

     Only newline bytes are counted, so a last line without a trailing
     newline is not included.

     :param filename: path of the file to read
     :return: number of newline bytes in the file
     """
     # sum() drains the lazy pipeline inside the with block, so the file is
     # open for every read and closed afterwards.
     with open(filename, 'rb') as f:
         # repeat(None) drives an endless stream of 1 MiB reads; takewhile
         # stops at the first empty (falsy) result, i.e. end of file.
         bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024) for _ in repeat(None)))
         return sum(buf.count(b'\n') for buf in bufgen)


 function average, s min, s ratio rawincount 0.0043 0.0041 1.00 rawgencount 0.0044 0.0042 1.01 rawcount 0.0048 0.0045 1.09 bufcount 0.008 0.0068 1.64 wccount 0.01 0.0097 2.35 itercount 0.014 0.014 3.41 opcount 0.02 0.02 4.83 kylecount 0.021 0.021 5.05 simplecount 0.022 0.022 5.25 mapcount 0.037 0.031 7.46 

下面是一个 Python 程序,它使用 multiprocessing 库把行数统计工作分配到多个处理器/内核上。我在一台 8 核的 64 位 Windows 服务器上测试,统计一个 2000 万行的文件所需时间从 26 秒缩短到了 7 秒。注意:不使用内存映射会使速度慢得多。

 import multiprocessing, sys, time, os, mmap
 import logging, logging.handlers

 def init_logger(pid):
     """Attach a stream handler to the root logger, prefixing each record with ``P<pid>``."""
     console_format = 'P{0} %(levelname)s %(message)s'.format(pid)
     logger = logging.getLogger() # no name given, so this is the root logger
     logger.setLevel( logging.INFO )
     logger.handlers.append( logging.StreamHandler() )
     # Only the first handler in the list receives the formatter.
     logger.handlers[0].setFormatter( logging.Formatter( console_format, '%d/%m/%y %H:%M:%S' ) )

 def getFileLineCount( queues, pid, processes, file1 ):
     """Count the lines in this worker's slice of ``file1`` and report via ``queues``.

     queues    -- list of queues, indexed by worker number
     pid       -- this worker's index, 0 .. processes-1 (not an OS pid)
     processes -- total number of workers
     file1     -- path of the file to count

     Workers with pid > 0 put their count on ``queues[0]``; worker 0 adds
     those counts to its own and puts the total on ``queues[0]``.
     """
     init_logger(pid)
     logging.info( 'start' )
     physical_file = open(file1, "r")
     # mmap.mmap(fileno, length[, tagname[, access[, offset]]]
     m1 = mmap.mmap( physical_file.fileno(), 0, access=mmap.ACCESS_READ )
     #work out file size to divide up line counting
     fSize = os.stat(file1).st_size
     # NOTE(review): under Python 3 "/" is true division, so chunk is a
     # float; the int() calls below truncate it.
     chunk = (fSize / processes) + 1
     lines = 0
     #get where I start and stop
     _seedStart = chunk * (pid)
     _seekEnd = chunk * (pid+1)
     seekStart = int(_seedStart)
     seekEnd = int(_seekEnd)
     if seekEnd < int(_seekEnd + 1):
         seekEnd += 1
     if _seedStart < int(seekStart + 1):
         seekStart += 1
     if seekEnd > fSize:
         seekEnd = fSize
     #find where to start
     if pid > 0:
         m1.seek( seekStart )
         #read next line
         # The (possibly partial) line at the raw offset is discarded; this
         # worker starts at the position after it.
         l1 = m1.readline() # need to use readline with memory mapped files
         seekStart = m1.tell()
     #tell previous rank my seek start to make their seek end
     if pid > 0:
         queues[pid-1].put( seekStart )
     if pid < processes-1:
         seekEnd = queues[pid].get()
     # NOTE(review): queues[0] carries both worker 1's seekStart and every
     # worker's line count. If a count arrives before worker 1 posts its
     # seekStart, worker 0 would read that count as its seekEnd -- confirm
     # whether this ordering can occur.
     m1.seek( seekStart )
     l1 = m1.readline()
     # Count the line just read, read the next one, and stop once the read
     # position has passed seekEnd or an empty read signals end of file.
     while len(l1) > 0:
         lines += 1
         l1 = m1.readline()
         if m1.tell() > seekEnd or len(l1) == 0:
             break
     logging.info( 'done' )
     # add up the results
     if pid == 0:
         for p in range(1,processes):
             lines += queues[0].get()
         queues[0].put(lines) # the total lines counted
     else:
         queues[0].put(lines)
     m1.close()
     physical_file.close()

 if __name__ == '__main__':
     # Usage: file-name [processes]; the worker count defaults to cpu_count().
     init_logger( 'main' )
     if len(sys.argv) > 1:
         file_name = sys.argv[1]
     else:
         logging.fatal( 'parameters required: file-name [processes]' )
         exit()
     t = time.time()
     processes = multiprocessing.cpu_count()
     if len(sys.argv) > 2:
         processes = int(sys.argv[2])
     queues=[] # a queue for each process
     for pid in range(processes):
         queues.append( multiprocessing.Queue() )
     jobs=[]
     prev_pipe = 0 # not referenced again in this script
     for pid in range(processes):
         p = multiprocessing.Process( target = getFileLineCount, args=(queues, pid, processes, file_name,) )
         p.start()
         jobs.append(p)
     jobs[0].join() #wait for counting to finish
     lines = queues[0].get()
     logging.info( 'finished {} Lines:{}'.format( time.time() - t, lines ) )

我将使用Python的文件对象方法readlines ,如下所示:

 def file_len(full_path):
     """Count number of lines in a file.

     :param full_path: path of the file to read
     :return: number of lines; ``0`` for an empty file
     """
     # The context manager closes the file even if reading raises, which
     # the explicit open()/close() pair did not guarantee.
     with open(full_path) as f:
         return sum(1 for line in f)


 lines = 0
 buffer = bytearray(2048)
 # Binary mode: readinto() fills the bytearray with raw bytes.
 with open(filename, 'rb') as f:
     nread = f.readinto(buffer)
     while nread > 0:
         # Count only the bytes this read actually wrote; the rest of the
         # buffer still holds data from an earlier read.
         lines += buffer.count(b'\n', 0, nread)
         nread = f.readinto(buffer)


 In [20]: timeit sum(1 for line in open('Charts.ipynb')) 100000 loops, best of 3: 9.79 µs per loop In [21]: timeit len(open('Charts.ipynb').read().splitlines()) 100000 loops, best of 3: 12 µs per loop 

此代码更短,更清晰。 这可能是最好的方法:

 with open(filename) as f: return len(list(f)) 



 import fileinput as fi


 def filecount(fname):
     """Return the number of lines in ``fname`` as counted by ``fileinput``.

     :param fname: path of the file to read
     :return: cumulative line number after the last line; ``0`` if empty
     """
     stream = fi.input(fname)
     try:
         for line in stream:
             pass
         return stream.lineno()
     finally:
         # fi.input() installs module-global state; close it so the file is
         # released and a later fi.input() call starts clean.
         fi.close()


 mapcount : 6.1331050396 simplecount : 4.588793993 opcount : 4.42918205261 filecount : 43.2780818939 bufcount : 0.170812129974 


这是我用纯 Python 找到的最快的方法。你可以通过设置缓冲区大小来使用任意数量的内存,不过 2**16 似乎是我电脑上的最佳选择。

 from functools import partial

 buffer = 2**16
 with open(myfile) as f:
     # iter(callable, sentinel) keeps calling f.read(buffer) until it
     # returns '' at end of file.  Single-argument print(...) gives the
     # same output on Python 2 and 3.
     print(sum(x.count('\n') for x in iter(partial(f.read, buffer), '')))

我在这里找到了答案:《为什么在 C++ 中从 stdin 读取行比在 Python 中慢得多?》,并稍微做了一点调整。要理解如何快速统计行数,它是一篇非常值得阅读的文章,不过 wc -l 仍然比其他任何方法快 75% 左右。


 def CountLines(filename):
     """Return the number of lines in ``filename``.

     The file is read in 1 MiB pieces.  An empty file has 0 lines, and a
     last line without a trailing newline counts as a line.

     :param filename: path of the file to read
     :return: number of lines
     """
     with open(filename) as f:
         lines = 0
         last = ''
         buf_size = 1024 * 1024
         read_f = f.read  # loop optimization
         buf = read_f(buf_size)
         while buf:
             lines += buf.count('\n')
             last = buf
             buf = read_f(buf_size)
     # Add one only for an unterminated final line.  Starting the tally at
     # 1 instead would over-count every file that ends with a newline.
     if last and not last.endswith('\n'):
         lines += 1
     return lines

现在空文件和(不以 \n 结尾的)最后一行也会被计入。


 def file_len(fname):
     """Return the number of lines in ``fname``, tallied with ``itertools.count``.

     :param fname: path of the file to read
     :return: number of lines; ``0`` for an empty file
     """
     import itertools

     counts = itertools.count()
     with open(fname) as f:
         for _ in f:
             # The next() builtin works on Python 2.6+ and 3; the .next()
             # method exists only on Python 2.
             next(counts)
     # The counter starts at 0 and was advanced once per line, so its next
     # value equals the number of lines.
     return next(counts)


 #!/usr/bin/env python


 def main():
     """Print the number of newline characters in the file named ``filename``."""
     # The context manager closes the file even if a read fails.
     with open('filename') as f:
         lines = 0
         buf_size = 1024 * 1024
         read_f = f.read  # loop optimization
         buf = read_f(buf_size)
         while buf:
             lines += buf.count('\n')
             buf = read_f(buf_size)
     # Single-argument print(...) gives the same output on Python 2 and 3.
     print(lines)


 if __name__ == '__main__':
     main()



为什么不读取前 100 行和最后 100 行,估算平均行长度,再用文件总大小除以这个平均值?如果你不需要精确的数值,这种方法是可行的。


 def c():
     """Time a whole-file read-and-split of ``myfile.txt`` and print the elapsed seconds."""
     import time
     s = time.time()
     # The context manager closes the file once it has been read.
     with open('myfile.txt', 'r') as f:
         # split('\n') yields one extra empty element when the text ends
         # with a newline; the value is only computed, not returned.
         file_length = len(f.read().split('\n'))
     # Single-argument print(...) gives the same output on Python 2 and 3.
     print(time.time() - s)
 def line_count(path):
     """Return how many lines the file at ``path`` contains (0 when it is empty)."""
     with open(path) as handle:
         # One tick per line yielded by the file iterator.
         return sum(1 for _ in handle)


 import subprocess


 def count_file_lines(file_path):
     """
     Counts the number of lines in a file using wc utility.

     :param file_path: path to file
     :return: int, no of lines
     :raises subprocess.CalledProcessError: if ``wc`` exits with a non-zero status
     """
     num = subprocess.check_output(['wc', '-l', file_path])
     # split() with no argument works on the bytes that check_output
     # returns under Python 3 and ignores leading padding; split(' ')
     # fails on bytes and yields an empty first field on padded output.
     return int(num.split()[0])

更新:就内存使用而言,这比使用纯 Python 差得多。当 subprocess 执行你的命令时,它会派生(fork)一个与父进程内存占用相同的新进程。


 # Tally the lines of the file at `path` without keeping them in memory.
 lines = 0
 with open(path) as f:
     lines += sum(1 for _ in f)