diff --git a/README.md b/README.md index 2de1a84..02a2723 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,32 @@ lineStream.pipe(output); ``` +# Line Endings + +By default, byline detects line endings with the following regular expression: + +```regexp +/\r\n|[\n\v\f\r\x85\u2028\u2029]/g +``` + +You can override this default by passing your own regular expression in the `newLines` option to +match the line endings in your data. + +```javascript +var stream = fs.createReadStream('sample.txt'); +stream = byline.createStream(stream,{newLines: /\n/g}); + +// ... +``` + +or + +```javascript +var stream = byline(fs.createReadStream('sample.txt', { encoding: 'utf8' }),{newLines: /\n/g}); + +// ... +``` + # Empty Lines By default byline skips empty lines, if you want to keep them, pass the `keepEmptyLines` option in diff --git a/lib/byline.js b/lib/byline.js index 21843cb..2669a2e 100644 --- a/lib/byline.js +++ b/lib/byline.js @@ -68,6 +68,7 @@ function LineStream(options) { // which re-concatanates the lines, just without newlines. 
this._readableState.objectMode = true; this._lineBuffer = []; + this._split = options.newLines || /\r\n|[\n\v\f\r\x85\u2028\u2029]/g; this._keepEmptyLines = options.keepEmptyLines || false; this._lastChunkEndedWithCR = false; @@ -100,7 +101,7 @@ LineStream.prototype._transform = function(chunk, encoding, done) { this._chunkEncoding = encoding; // see: http://www.unicode.org/reports/tr18/#Line_Boundaries - var lines = chunk.split(/\r\n|[\n\v\f\r\x85\u2028\u2029]/g); + var lines = chunk.split(this._split); // don't split CRLF which spans chunks if (this._lastChunkEndedWithCR && chunk[0] == '\n') { diff --git a/package.json b/package.json index f2e6c41..afddff8 100644 --- a/package.json +++ b/package.json @@ -21,7 +21,7 @@ ], "devDependencies": { "mocha": "~2.1.0", - "request": "~2.27.0" + "request": "^2.88.0" }, "scripts": { "test": "mocha -R spec --timeout 60000" diff --git a/test/rfc-DOS.txt b/test/rfc-DOS.txt new file mode 100644 index 0000000..8c1bc1b Binary files /dev/null and b/test/rfc-DOS.txt differ diff --git a/test/tests.js b/test/tests.js index 7a23aa7..a9cfee1 100644 --- a/test/tests.js +++ b/test/tests.js @@ -92,6 +92,22 @@ describe('byline', function() { }); }); + it('should match given newline separators using newLine option', function(done) { + var input = fs.createReadStream('test/rfc-DOS.txt'); + var lineStream = byline(input, { keepEmptyLines: true, newLines: /\r\n/g }); + lineStream.setEncoding('utf8'); + + var lines = []; + lineStream.on('data', function(line) { + lines.push(line); + }); + + lineStream.on('end', function() { + assert.equal(9859, lines.length); + done(); + }); + }); + it('should not split a CRLF which spans two chunks', function(done) { var input = fs.createReadStream('test/CRLF.txt'); var lineStream = byline(input, { keepEmptyLines: true });