Reading archived files line by line in Node.js

2020-02-20

Why

You might want to read archived files in Node without writing decompressed copies to disk. Composing two streams makes this quite handy.

How

Let's say you have a directory with a few gzip archives containing text. You can stream-read the files, decompress the stream, and read the lines, all in memory.

echo -e "line 1\nline 2" >> file1.txt
echo -e "line 3\nline 4" >> file2.txt
echo -e "line 5\nline 6" >> file3.txt

gzip *txt
const fs = require("fs");
const readline = require("readline");
const zlib = require("zlib");

/**
 * Counts the lines across every *.gz file in `dir`, streaming and
 * gunzipping each file in memory (no intermediate files on disk).
 *
 * @param {(totalLines: number) => void} cb - invoked exactly once with
 *   the combined line count of all archives found.
 * @param {string} [dir="./"] - directory to scan; defaults to the
 *   current working directory, matching the original behavior.
 * @throws rethrows any error reported by `fs.readdir`.
 */
function readAllLines(cb, dir = "./") {
  let totalLines = 0;
  let completed = 0;

  fs.readdir(dir, function readDir(err, files) {
    // Surface filesystem errors explicitly; previously `err` was ignored
    // and `files.filter` crashed with `files === undefined`.
    if (err) throw err;

    const gzFiles = files.filter((x) => x.endsWith("gz"));
    const total = gzFiles.length;

    // Edge case: no archives present. Without this, `completed === total`
    // is never reached and `cb` would never fire.
    if (total === 0) return cb(0);

    gzFiles.forEach(function forEachFile(file) {
      // The filter above guarantees a gzip file, so always gunzip.
      const input = fs
        .createReadStream(`${dir}/${file}`)
        .pipe(zlib.createGunzip());

      const lineReader = readline.createInterface({ input });

      lineReader.on("line", function onLine() {
        totalLines += 1;
      });

      lineReader.on("close", function onEnd() {
        completed += 1;
        // Important: only report once every file has emitted `close`.
        if (completed === total) {
          cb(totalLines);
        }
      });
    });
  });
}

// Kick off the scan and print the grand total once every archive is done.
readAllLines(function printTotal(totalLines) {
  console.log("Total Lines:", totalLines);
});
> node app.js
Total Lines: 6

Conclusion

Streams are quite handy for processing data without having to read it all in memory. You can have a service that processes compressed logs in the background, consuming almost no memory, without needing any intermediary files.