如何用C#中的迭代器反向读取文本文件

我需要处理一个大文件,大约 40 万行、200 MB。但有时我必须从最后一行往前处理。 在这里我该如何使用迭代器(yield return)? 基本上,我不想把全部内容都加载到内存中。 我知道在 .NET 中使用迭代器会更高效。

反向读取文本文件非常棘手,除非你使用的是固定宽度的编码(例如 ASCII)。 如果是可变宽度编码(比如 UTF-8),那么每次取到一块数据时,都必须检查自己是否正处于某个字符的中间。

.NET 框架没有为此内置任何功能,我猜你必须针对每一种可变宽度编码分别编写处理代码。

编辑:这段代码已经过一些测试 – 但这并不意味着其中不存在某些隐蔽的错误。 它使用了 MiscUtil 中的 StreamUtil,不过我已经把所需的(新)方法附在了代码末尾。 哦,它还需要重构 – 你会看到其中有一个相当庞大的方法:

using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace MiscUtil.IO
{
    /// <summary>
    /// Takes an encoding (defaulting to UTF-8) and a function which produces a seekable stream
    /// (or a filename for convenience) and yields lines from the end of the stream backwards.
    /// Only single byte encodings, and UTF-8 and Unicode, are supported. The stream
    /// returned by the function must be seekable.
    /// </summary>
    public sealed class ReverseLineReader : IEnumerable<string>
    {
        /// <summary>
        /// Buffer size to use by default. Classes with internal access can specify
        /// a different buffer size - this is useful for testing.
        /// </summary>
        private const int DefaultBufferSize = 4096;

        /// <summary>
        /// Maximum number of bytes that may be carried over from the start of one buffer to
        /// the next read. A UTF-8 sequence is at most four bytes long, so at most three
        /// continuation bytes can precede the first character start in a buffer.
        /// </summary>
        private const int MaxLeftOverBytes = 3;

        /// <summary>
        /// Means of creating a Stream to read from.
        /// </summary>
        private readonly Func<Stream> streamSource;

        /// <summary>
        /// Encoding to use when converting bytes to text
        /// </summary>
        private readonly Encoding encoding;

        /// <summary>
        /// Size of buffer (in bytes) to read each time we read from the
        /// stream. This must be at least as big as the maximum number of
        /// bytes for a single character.
        /// </summary>
        private readonly int bufferSize;

        /// <summary>
        /// Function which, when given a position within a file and a byte, states whether
        /// or not the byte represents the start of a character.
        /// </summary>
        private readonly Func<long, byte, bool> characterStartDetector;

        /// <summary>
        /// Creates a LineReader from a stream source. The delegate is only
        /// called when the enumerator is fetched. UTF-8 is used to decode
        /// the stream into text.
        /// </summary>
        /// <param name="streamSource">Data source</param>
        public ReverseLineReader(Func<Stream> streamSource)
            : this(streamSource, Encoding.UTF8)
        {
        }

        /// <summary>
        /// Creates a LineReader from a filename. The file is only opened
        /// (or even checked for existence) when the enumerator is fetched.
        /// UTF8 is used to decode the file into text.
        /// </summary>
        /// <param name="filename">File to read from</param>
        public ReverseLineReader(string filename)
            : this(filename, Encoding.UTF8)
        {
        }

        /// <summary>
        /// Creates a LineReader from a filename. The file is only opened
        /// (or even checked for existence) when the enumerator is fetched.
        /// </summary>
        /// <param name="filename">File to read from</param>
        /// <param name="encoding">Encoding to use to decode the file into text</param>
        public ReverseLineReader(string filename, Encoding encoding)
            : this(() => File.OpenRead(filename), encoding)
        {
        }

        /// <summary>
        /// Creates a LineReader from a stream source. The delegate is only
        /// called when the enumerator is fetched.
        /// </summary>
        /// <param name="streamSource">Data source</param>
        /// <param name="encoding">Encoding to use to decode the stream into text</param>
        public ReverseLineReader(Func<Stream> streamSource, Encoding encoding)
            : this(streamSource, encoding, DefaultBufferSize)
        {
        }

        /// <summary>
        /// Creates a LineReader with an explicit buffer size (exposed internally for tests).
        /// </summary>
        /// <param name="streamSource">Data source</param>
        /// <param name="encoding">Encoding to use to decode the stream into text</param>
        /// <param name="bufferSize">Number of bytes to read from the stream per iteration</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="streamSource"/> or <paramref name="encoding"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// The encoding is not single byte, UTF-8 or UTF-16.
        /// </exception>
        internal ReverseLineReader(Func<Stream> streamSource, Encoding encoding, int bufferSize)
        {
            if (streamSource == null)
            {
                throw new ArgumentNullException("streamSource");
            }
            if (encoding == null)
            {
                throw new ArgumentNullException("encoding");
            }

            this.streamSource = streamSource;
            this.encoding = encoding;
            this.bufferSize = bufferSize;

            if (encoding.IsSingleByte)
            {
                // For a single byte encoding, every byte is the start (and end) of a character
                characterStartDetector = (pos, data) => true;
            }
            else if (encoding is UnicodeEncoding)
            {
                // For UTF-16, even-numbered positions are the start of a character.
                // TODO: This assumes no surrogate pairs. More work required
                // to handle that.
                characterStartDetector = (pos, data) => (pos & 1) == 0;
            }
            else if (encoding is UTF8Encoding)
            {
                // For UTF-8, bytes with the top bit clear or the second bit set are the start of a character
                // See http://www.cl.cam.ac.uk/~mgk25/unicode.html
                characterStartDetector = (pos, data) => (data & 0x80) == 0 || (data & 0x40) != 0;
            }
            else
            {
                throw new ArgumentException("Only single byte, UTF-8 and Unicode encodings are permitted");
            }
        }

        /// <summary>
        /// Returns the enumerator reading strings backwards. If this method discovers that
        /// the returned stream is either unreadable or unseekable, a NotSupportedException is thrown.
        /// </summary>
        public IEnumerator<string> GetEnumerator()
        {
            Stream stream = streamSource();
            if (!stream.CanSeek)
            {
                stream.Dispose();
                throw new NotSupportedException("Unable to seek within stream");
            }
            if (!stream.CanRead)
            {
                stream.Dispose();
                throw new NotSupportedException("Unable to read within stream");
            }
            // The checks above run eagerly; the iterator below only starts on the first MoveNext.
            return GetEnumeratorImpl(stream);
        }

        /// <summary>
        /// Iterator that walks the stream from its end towards its start, one buffer at a
        /// time, yielding each line as soon as its beginning has been found. The stream is
        /// disposed when the iteration finishes or the enumerator is disposed.
        /// </summary>
        private IEnumerator<string> GetEnumeratorImpl(Stream stream)
        {
            try
            {
                long position = stream.Length;

                if (encoding is UnicodeEncoding && (position & 1) != 0)
                {
                    throw new InvalidDataException("UTF-16 encoding provided, but stream has odd length.");
                }

                // Allow room after the regular buffer for bytes from the start of the previous
                // read which didn't quite make it as full characters
                byte[] buffer = new byte[bufferSize + MaxLeftOverBytes];
                char[] charBuffer = new char[encoding.GetMaxCharCount(buffer.Length)];
                int leftOverData = 0;
                string previousEnd = null;
                // TextReader doesn't return an empty string if there's line break at the end
                // of the data. Therefore we don't return an empty string if it's our *first*
                // return.
                bool firstYield = true;

                // A line-feed at the start of the previous buffer means we need to swallow
                // the carriage-return at the end of this buffer - hence this needs declaring
                // way up here!
                bool swallowCarriageReturn = false;

                while (position > 0)
                {
                    int bytesToRead = Math.Min(position > int.MaxValue ? bufferSize : (int)position, bufferSize);

                    position -= bytesToRead;
                    stream.Position = position;
                    StreamUtil.ReadExactly(stream, buffer, bytesToRead);

                    // If we haven't read a full buffer, but we had bytes left
                    // over from before, copy them to the end of the data just read.
                    // Array.Copy is used because source and destination may overlap.
                    if (leftOverData > 0 && bytesToRead != bufferSize)
                    {
                        Array.Copy(buffer, bufferSize, buffer, bytesToRead, leftOverData);
                    }

                    // We've now *effectively* read this much data.
                    bytesToRead += leftOverData;

                    int firstCharPosition = 0;
                    while (!characterStartDetector(position + firstCharPosition, buffer[firstCharPosition]))
                    {
                        firstCharPosition++;
                        // Bad UTF-8 sequences could trigger this. A valid sequence has at most
                        // three continuation bytes, so a fourth means corrupt data; and if this
                        // is the start of the file so we've done a short read, we should have
                        // the character start somewhere in the usable buffer.
                        if (firstCharPosition > MaxLeftOverBytes || firstCharPosition == bytesToRead)
                        {
                            throw new InvalidDataException("Invalid UTF-8 data");
                        }
                    }
                    leftOverData = firstCharPosition;

                    int charsRead = encoding.GetChars(buffer, firstCharPosition, bytesToRead - firstCharPosition, charBuffer, 0);
                    int endExclusive = charsRead;

                    for (int i = charsRead - 1; i >= 0; i--)
                    {
                        char lookingAt = charBuffer[i];
                        if (swallowCarriageReturn)
                        {
                            swallowCarriageReturn = false;
                            if (lookingAt == '\r')
                            {
                                endExclusive--;
                                continue;
                            }
                        }

                        // Anything non-line-breaking, just keep looking backwards
                        if (lookingAt != '\n' && lookingAt != '\r')
                        {
                            continue;
                        }

                        // End of CRLF? Swallow the preceding CR
                        if (lookingAt == '\n')
                        {
                            swallowCarriageReturn = true;
                        }

                        int start = i + 1;
                        string bufferContents = new string(charBuffer, start, endExclusive - start);
                        endExclusive = i;
                        string stringToYield = previousEnd == null ? bufferContents : bufferContents + previousEnd;
                        if (!firstYield || stringToYield.Length != 0)
                        {
                            yield return stringToYield;
                        }
                        firstYield = false;
                        previousEnd = null;
                    }

                    // Whatever precedes the earliest line break belongs to a line that
                    // starts in an earlier part of the stream; keep it for the next round.
                    previousEnd = endExclusive == 0 ? null : (new string(charBuffer, 0, endExclusive) + previousEnd);

                    // If we didn't decode the start of the array, put it at the end for next time
                    if (leftOverData != 0)
                    {
                        Buffer.BlockCopy(buffer, 0, buffer, bufferSize, leftOverData);
                    }
                }

                if (leftOverData != 0)
                {
                    // At the start of the final buffer, we had the end of another character.
                    throw new InvalidDataException("Invalid UTF-8 data at start of stream");
                }

                if (firstYield && string.IsNullOrEmpty(previousEnd))
                {
                    yield break;
                }
                yield return previousEnd ?? "";
            }
            finally
            {
                stream.Dispose();
            }
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }
}

// StreamUtil.cs:
public static class StreamUtil
{
    /// <summary>
    /// Reads exactly <paramref name="bytesToRead"/> bytes from <paramref name="input"/> into the
    /// start of <paramref name="buffer"/>, looping because a single Read call may return
    /// fewer bytes than requested.
    /// </summary>
    /// <param name="input">Stream to read from</param>
    /// <param name="buffer">Destination buffer, filled from index 0</param>
    /// <param name="bytesToRead">Number of bytes that must be read</param>
    /// <exception cref="EndOfStreamException">
    /// The stream ended before the requested number of bytes was read.
    /// </exception>
    public static void ReadExactly(Stream input, byte[] buffer, int bytesToRead)
    {
        int index = 0;
        while (index < bytesToRead)
        {
            int read = input.Read(buffer, index, bytesToRead - index);
            if (read == 0)
            {
                int remaining = bytesToRead - index;
                throw new EndOfStreamException
                    (String.Format("End of stream reached with {0} byte{1} left to read.",
                                   remaining,
                                   remaining == 1 ? "" : "s"));
            }
            index += read;
        }
    }
}

反馈非常欢迎。 这很有趣:)

您可以使用File.ReadLines获取行迭代器

 // Enumerable.Reverse has to consume the whole sequence before it can hand back
 // the last line first, so every line of the file ends up buffered in memory.
 var reversedLines = File.ReadLines(@"C:\temp\ReverseRead.txt").Reverse();
 foreach (var line in reversedLines)
 {
     // Stop as soon as the caller has seen enough lines.
     if (noNeedToReadFurther)
     {
         break;
     }

     // process line here
     Console.WriteLine(line);
 }

编辑:

阅读 applejacks01 的评论后,我做了一些测试,看起来 .Reverse() 实际上会把整个文件加载到内存中。

我使用File.ReadLines()来打印40MB文件的第一行 – 控制台应用程序的内存使用量为5MB 。 然后,使用File.ReadLines().Reverse()打印同一文件的最后一行 – 内存使用量为95MB

结论

无论 Reverse() 内部做了什么,用它来读取大文件的末尾都不是一个好的选择。

我把文件逐行读入一个列表,然后调用 Reverse() 将其反转:

  // Reads every line of the file named by `filename` into an in-memory list, then
  // reverses the list so the last line comes first. The whole file is held in memory.
  StreamReader objReader = new StreamReader(filename);
  string sLine = "";
  ArrayList arrText = new ArrayList();

  // ReadLine returns null once the end of the file is reached, which ends the loop.
  while (sLine != null)
  {
      sLine = objReader.ReadLine();
      if (sLine != null)
          arrText.Add(sLine);
  }
  objReader.Close();

  // In-place reversal: arrText now runs from the last line to the first.
  arrText.Reverse();

  // NOTE(review): the snippet is cut off here in the source - the loop body and its
  // closing brace are missing.
  foreach (string sOutput in arrText) {

要创建一个文件迭代器,你可以这样做:

编辑:

这是我的定宽编码反向文件读取器的修正版本:

 /// <summary>
 /// Lazily yields the lines of a file from the last line to the first by reading the
 /// file backwards one byte at a time.
 /// </summary>
 /// <remarks>
 /// Each byte is turned into one char by a plain cast, so the result is only meaningful
 /// for single-byte text. A line feed (byte 10) starts a new line; the terminator that
 /// follows a line in the file (including any carriage return) is kept at the end of the
 /// yielded string. An empty file yields nothing.
 /// </remarks>
 /// <param name="path">File to read; defaults to the path originally hard-coded here.</param>
 /// <returns>The lines of the file in reverse order.</returns>
 public static IEnumerable<string> readFile(string path = @"c:\test.txt")
 {
     using (FileStream reader = new FileStream(path, FileMode.Open, FileAccess.Read))
     {
         long length = reader.Length;
         StringBuilder lineBuffer = new StringBuilder();

         // A long offset keeps the loop correct for files larger than int.MaxValue bytes.
         for (long offset = 1; offset <= length; offset++)
         {
             reader.Seek(-offset, SeekOrigin.End);
             int byteRead = reader.ReadByte();

             // A line feed closes the line collected so far (stored back to front).
             if (byteRead == 10 && lineBuffer.Length > 0)
             {
                 yield return Reverse(lineBuffer.ToString());
                 lineBuffer.Length = 0;
             }
             lineBuffer.Append((char)byteRead);
         }

         // Whatever remains is the first line of the file; nothing remains for an empty file.
         if (lineBuffer.Length > 0)
         {
             yield return Reverse(lineBuffer.ToString());
         }
     }
 }

 /// <summary>
 /// Returns a copy of <paramref name="str"/> with its chars in reverse order.
 /// </summary>
 /// <param name="str">String to reverse; must not be null.</param>
 public static string Reverse(string str)
 {
     char[] arr = str.ToCharArray();
     Array.Reverse(arr);
     return new string(arr);
 }

您可以从文件末尾开始向前每次读取一个字符,并缓存所有读到的字符,直到遇到回车和/或换行。

然后,将收集到的字符串反转,并把它作为一行返回。

我也想做类似的事情,这是我的代码。 这个类会创建若干临时文件,每个文件保存大文件的一个分块,从而避免内存膨胀。 用户可以指定是否需要将文件反转;如果需要,它会按相反的行序返回内容。

这个类也可以用来在单个文件中写入大数据,而不会使内存膨胀。

请提供反馈。

  using System;
  using System.Collections.Generic;
  using System.Diagnostics;
  using System.IO;
  using System.Linq;
  using System.Text;
  using System.Threading.Tasks;

  namespace BigFileService
  {
      /// <summary>
      /// Spools lines into a series of small chunk files inside a "BIG_FILE_DUMP" directory
      /// and later stitches them into one consolidated file, optionally with the line order
      /// reversed. Only one chunk's worth of lines is read into memory at a time.
      /// </summary>
      public class BigFileDumper
      {
          /// <summary>
          /// Maximum number of lines written to one chunk file before a new one is started.
          /// Values below 1 result in one line per chunk file.
          /// </summary>
          public int CHUNK_SIZE = 1000;

          /// <summary>
          /// When true, <see cref="GetFullFile"/> emits the lines in reverse order.
          /// </summary>
          public bool ReverseIt { get; set; }

          /// <summary>
          /// Total number of lines passed to <see cref="WriteLine"/> so far.
          /// </summary>
          public long TotalLineCount { get { return totalLineCount; } }

          private long totalLineCount;

          /// <summary>
          /// Number of lines written to the chunk file that is currently open.
          /// </summary>
          private int BufferCount = 0;

          /// <summary>
          /// Writer for the chunk file currently being filled; null when no chunk is open.
          /// </summary>
          private StreamWriter Writer;

          /// <summary>
          /// Names of the chunk files, in the order they were created.
          /// </summary>
          private List<string> LstTempFiles;

          private string ParentDirectory;

          /// <summary>
          /// Creates the dumper and (re)creates an empty "BIG_FILE_DUMP" directory below
          /// <paramref name="FolderPathToWrite"/>. An existing directory of that name is
          /// deleted together with its contents.
          /// </summary>
          /// <param name="FolderPathToWrite">Folder in which the dump directory is created.</param>
          /// <exception cref="ArgumentNullException"><paramref name="FolderPathToWrite"/> is null.</exception>
          public BigFileDumper(string FolderPathToWrite)
          {
              if (FolderPathToWrite == null)
              {
                  throw new ArgumentNullException("FolderPathToWrite");
              }

              this.LstTempFiles = new List<string>();
              // Path.Combine copes with a trailing separator and uses the platform's separator.
              this.ParentDirectory = Path.Combine(FolderPathToWrite, "BIG_FILE_DUMP");
              this.totalLineCount = 0;
              this.BufferCount = 0;
              this.Initialize();
          }

          /// <summary>
          /// Replaces any existing dump directory with a new, empty one.
          /// </summary>
          private void Initialize()
          {
              // Delete existing directory.
              if (Directory.Exists(this.ParentDirectory))
              {
                  Directory.Delete(this.ParentDirectory, true);
              }

              // Create a new directory.
              Directory.CreateDirectory(this.ParentDirectory);
          }

          /// <summary>
          /// Appends a line to the current chunk file, starting a new chunk file first when
          /// none is open or the current one already holds <see cref="CHUNK_SIZE"/> lines.
          /// </summary>
          /// <param name="line">Line to store.</param>
          public void WriteLine(string line)
          {
              // Chunk is full, time to move on to a new file.
              if (this.Writer != null && this.BufferCount >= this.CHUNK_SIZE)
              {
                  this.CloseCurrentChunk();
              }

              if (this.Writer == null)
              {
                  string newFile = "DumpFile_" + this.LstTempFiles.Count;
                  this.Writer = new StreamWriter(Path.Combine(this.ParentDirectory, newFile));
                  this.LstTempFiles.Add(newFile);
              }

              this.Writer.WriteLine(line);
              this.totalLineCount++; // main count
              this.BufferCount++; // Chunk count.
          }

          /// <summary>
          /// Flushes and closes the chunk file currently being written, if any.
          /// A later <see cref="WriteLine"/> call starts a new chunk file.
          /// </summary>
          public void Close()
          {
              this.CloseCurrentChunk();
          }

          /// <summary>
          /// Builds (once) the consolidated "FullFile" from the chunk files and returns its path.
          /// With <see cref="ReverseIt"/> set, the chunks are processed last-to-first and the
          /// lines of every chunk are reversed, which reverses the whole file.
          /// </summary>
          /// <returns>
          /// Path of the consolidated file, or an empty string when nothing has been written.
          /// If the consolidated file already exists it is returned unchanged.
          /// </returns>
          public string GetFullFile()
          {
              if (this.LstTempFiles.Count <= 0)
              {
                  Debug.Assert(false, "There are no files created.");
                  return "";
              }

              string returnFilename = Path.Combine(this.ParentDirectory, "FullFile");
              if (File.Exists(returnFilename))
              {
                  return returnFilename;
              }

              // The last chunk may still be open with unflushed lines; close it so that it can
              // be read back completely.
              this.CloseCurrentChunk();

              // Enumerate a reversed view instead of reversing the list itself, so the
              // creation order of the chunk files stays intact.
              IEnumerable<string> chunkFiles = this.ReverseIt
                  ? Enumerable.Reverse(this.LstTempFiles)
                  : this.LstTempFiles;

              foreach (string fileName in chunkFiles)
              {
                  // Memory use is bounded by the number of lines in one chunk file.
                  string[] fileLines = File.ReadAllLines(Path.Combine(this.ParentDirectory, fileName));

                  if (this.ReverseIt)
                  {
                      Array.Reverse(fileLines);
                  }

                  File.AppendAllLines(returnFilename, fileLines);
              }

              return returnFilename;
          }

          /// <summary>
          /// Closes the open chunk writer (if any) and resets the per-chunk line counter.
          /// </summary>
          private void CloseCurrentChunk()
          {
              if (this.Writer != null)
              {
                  this.Writer.Close();
                  this.Writer = null;
              }
              this.BufferCount = 0;
          }
      }
  }

这个类可以按如下方式使用 –

 /// <summary>
 /// Copies the lines of a big input file into a BigFileDumper and asks it for the
 /// consolidated, reversed output file, whose path is printed to the console.
 /// </summary>
 /// <param name="BIG_FILE">Path of the input file to read.</param>
 /// <param name="FOLDER_PATH_FOR_CHUNK_FILES">Folder handed to the dumper for its chunk files.</param>
 void TestBigFileDump_File(string BIG_FILE, string FOLDER_PATH_FOR_CHUNK_FILES)
 {
     BigFileDumper bigFileDumper;

     // Start processing the input Big file. The using block releases the reader even if
     // reading or dumping throws.
     using (StreamReader reader = new StreamReader(BIG_FILE))
     {
         // Create a dump file class object to handle efficient memory management.
         bigFileDumper = new BigFileDumper(FOLDER_PATH_FOR_CHUNK_FILES);

         // Set to reverse the output file.
         bigFileDumper.ReverseIt = true;
         bigFileDumper.CHUNK_SIZE = 100; // Number of lines per chunk file.

         try
         {
             string line;
             // ReadLine returns null at the end of the input.
             while ((line = reader.ReadLine()) != null)
             {
                 bigFileDumper.WriteLine(line);
             }
         }
         finally
         {
             // Always close the chunk file that is still open.
             bigFileDumper.Close();
         }
     }

     // Get back full reversed file.
     var reversedFilename = bigFileDumper.GetFullFile();
     Console.WriteLine("Check output file - " + reversedFilename);
 }

这里已经有了很好的答案,下面是另一个兼容 LINQ 的类,它注重性能并支持大文件。 它假定行结束符为 “\r\n”。

用法

 // ReverseTextReader exposes no EndOfStream member; ReadLine returns null once the
 // beginning of the file has been reached, so loop on that instead.
 var reader = new ReverseTextReader(@"C:\Temp\ReverseTest.txt");
 string line;
 while ((line = reader.ReadLine()) != null)
 {
     Console.WriteLine(line);
 }

ReverseTextReader类

 /// <summary>
 /// Reads a text file backwards, line-by-line.
 /// </summary>
 /// <remarks>
 /// This class uses file seeking to read a text file of any size in reverse order. This
 /// is useful for needs such as reading a log file newest-entries first.
 /// Lines are split on the line feed byte (10) before decoding, and a carriage return
 /// directly before it is dropped, so both "\r\n" and "\n" terminated files work. Because
 /// the split happens on raw bytes, the encoding must never use byte 10 inside another
 /// character (true for single-byte encodings and UTF-8, not for UTF-16/UTF-32).
 /// </remarks>
 public sealed class ReverseTextReader : IEnumerable<string>, IDisposable
 {
     private const int BufferSize = 16384;           // The number of bytes read from the underlying stream per chunk.
     private const byte LineFeed = 10;
     private const byte CarriageReturn = 13;

     private readonly Stream _stream;                // Stores the stream feeding data into this reader
     private readonly Encoding _encoding;            // Stores the encoding used to process the file
     private byte[] _leftoverBuffer;                 // Stores the leftover partial line after processing a buffer
     private readonly Queue<string> _lines;          // Stores the lines parsed from the buffer

     #region Constructors

     /// <summary>
     /// Creates a reader for the specified file.
     /// </summary>
     /// <param name="filePath">Path of the file to read.</param>
     public ReverseTextReader(string filePath)
         : this(new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read), Encoding.Default)
     {
     }

     /// <summary>
     /// Creates a reader using the specified stream.
     /// </summary>
     /// <param name="stream">Seekable, readable stream to read from.</param>
     public ReverseTextReader(Stream stream)
         : this(stream, Encoding.Default)
     {
     }

     /// <summary>
     /// Creates a reader using the specified path and encoding.
     /// </summary>
     /// <param name="filePath">Path of the file to read.</param>
     /// <param name="encoding">Encoding used to decode each line.</param>
     public ReverseTextReader(string filePath, Encoding encoding)
         : this(new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read), encoding)
     {
     }

     /// <summary>
     /// Creates a reader using the specified stream and encoding.
     /// </summary>
     /// <param name="stream">Seekable, readable stream to read from.</param>
     /// <param name="encoding">Encoding used to decode each line.</param>
     /// <exception cref="ArgumentNullException">An argument is null.</exception>
     /// <exception cref="InvalidOperationException">The stream cannot seek or cannot be read.</exception>
     public ReverseTextReader(Stream stream, Encoding encoding)
     {
         if (stream == null)
         {
             throw new ArgumentNullException("stream");
         }
         if (encoding == null)
         {
             throw new ArgumentNullException("encoding");
         }

         _stream = stream;
         _encoding = encoding;
         _lines = new Queue<string>(128);

         // The stream needs to support seeking for this to work
         if (!_stream.CanSeek)
             throw new InvalidOperationException("The specified stream needs to support seeking to be read backwards.");
         if (!_stream.CanRead)
             throw new InvalidOperationException("The specified stream needs to support reading to be read backwards.");

         // Set the current position to the end of the file
         _stream.Position = _stream.Length;
         _leftoverBuffer = new byte[0];
     }

     #endregion

     #region Overrides

     /// <summary>
     /// Reads the next previous line from the underlying stream.
     /// </summary>
     /// <returns>The line without its terminator, or null once the start of the stream has been passed.</returns>
     public string ReadLine()
     {
         // Keep pulling chunks until one completes a line: a line longer than one chunk
         // produces no complete line for several reads in a row.
         while (_lines.Count == 0 && _stream.Position > 0)
         {
             ReadPreviousChunk();
         }

         return _lines.Count == 0 ? null : _lines.Dequeue();
     }

     /// <summary>
     /// Reads the chunk that ends at the current stream position, queues every line that
     /// is now complete and keeps the still-incomplete head of the data as leftover.
     /// The stream position is left at the start of the chunk.
     /// </summary>
     private void ReadPreviousChunk()
     {
         long currentPosition = _stream.Position;
         long newPosition = Math.Max(0L, currentPosition - BufferSize);
         int count = (int)(currentPosition - newPosition);

         // Make a new buffer with room for the previous leftovers behind the fresh bytes
         var buffer = new byte[count + _leftoverBuffer.Length];

         _stream.Position = newPosition;
         int filled = 0;
         while (filled < count)
         {
             // Read may return fewer bytes than requested, so loop until the chunk is full.
             int read = _stream.Read(buffer, filled, count - filled);
             if (read == 0)
             {
                 throw new EndOfStreamException("The stream ended before the requested chunk could be read.");
             }
             filled += read;
         }

         // Move the position of the stream back
         _stream.Position = newPosition;

         // And copy in the leftovers from the last buffer
         Array.Copy(_leftoverBuffer, 0, buffer, count, _leftoverBuffer.Length);

         // Exclusive end of the line being delimited, terminator included.
         int lineEnd = buffer.Length;

         // The very last byte of the buffer is either the final line's terminator or its
         // last character, never a separator in front of a line; and the leftover part was
         // already scanned, so the search starts at the end of the fresh bytes.
         int scanFrom = Math.Min(count - 1, buffer.Length - 2);
         for (int i = scanFrom; i >= 0; i--)
         {
             if (buffer[i] == LineFeed)
             {
                 _lines.Enqueue(DecodeLine(buffer, i + 1, lineEnd));
                 lineEnd = i + 1;
             }
         }

         if (newPosition == 0)
         {
             // Beginning of the stream: what is left is the complete first line.
             _lines.Enqueue(DecodeLine(buffer, 0, lineEnd));
             _leftoverBuffer = new byte[0];
         }
         else
         {
             // What's left over is a portion of a line. Save it for later.
             _leftoverBuffer = new byte[lineEnd];
             Array.Copy(buffer, 0, _leftoverBuffer, 0, lineEnd);
         }
     }

     /// <summary>
     /// Decodes the bytes in [start, endExclusive) after removing a trailing "\n" and a
     /// "\r" directly in front of it.
     /// </summary>
     private string DecodeLine(byte[] buffer, int start, int endExclusive)
     {
         int length = endExclusive - start;
         if (length > 0 && buffer[start + length - 1] == LineFeed)
         {
             length--;
         }
         if (length > 0 && buffer[start + length - 1] == CarriageReturn)
         {
             length--;
         }
         return _encoding.GetString(buffer, start, length);
     }

     #endregion

     #region IEnumerator<string> Interface

     /// <summary>
     /// Enumerates the remaining lines from the current position back to the start.
     /// </summary>
     public IEnumerator<string> GetEnumerator()
     {
         string line;
         // So long as the next line isn't null...
         while ((line = ReadLine()) != null)
             // Read and return it.
             yield return line;
     }

     IEnumerator IEnumerable.GetEnumerator()
     {
         return GetEnumerator();
     }

     #endregion

     #region IDisposable Interface

     /// <summary>
     /// Disposes the underlying stream.
     /// </summary>
     public void Dispose()
     {
         _stream.Dispose();
     }

     #endregion
 }
 /// <summary>
 /// Writes the bytes of inputfile1.txt in reverse order into the existing file
 /// inputfile2.txt and sizes that file to match the input.
 /// </summary>
 /// <remarks>
 /// The reversal is byte-wise, not line-wise, and the whole input is loaded into memory.
 /// The output file is opened with FileMode.Open and therefore has to exist already.
 /// </remarks>
 private void Rev_file()
 {
     // The using block closes the output even when reading the input fails.
     using (FileStream fs1 = new FileStream(@"d:\papers\inputfile2.txt", FileMode.Open, FileAccess.Write))
     {
         byte[] file = File.ReadAllBytes(@"d:\papers\inputfile1.txt");

         // Reverse every byte, including the last one, so that the number of bytes written
         // matches the length set below.
         Array.Reverse(file);
         fs1.Write(file, 0, file.Length);

         // Cut off any older content beyond the bytes just written.
         fs1.SetLength(file.Length);
     }
 }