Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 20 additions & 13 deletions services/datalad/datalad_service/common/git.py
Original file line number Diff line number Diff line change
async def _normalize_line_endings_in_file(full_path, repo_temp_path):
    """
    Given a file path, normalize line endings from CRLF to LF.

    The file is streamed in CHUNK_SIZE_BYTES reads into a temporary file
    created inside repo_temp_path. A CR that ends a read is held back and
    prepended to the next read, so a CRLF pair that straddles two reads is
    still converted. A CR that is not followed by LF is written unchanged.

    If at least one CRLF was converted, the temporary file is renamed over
    full_path; otherwise the temporary file is removed and full_path is left
    as it was.

    Args:
        full_path: Path of the file to normalize.
        repo_temp_path: Directory in which the temporary file is created.
            NOTE(review): presumably on the same filesystem as full_path so
            the rename can succeed -- confirm against callers.

    Raises:
        Any exception raised while reading, writing or renaming is re-raised
        after the temporary file has been removed.
    """
    changed = False
    async with aiofiles.tempfile.NamedTemporaryFile(
        'wb', dir=repo_temp_path, delete=False
    ) as tmp:
        temp_path = tmp.name
        try:
            async with aiofiles.open(full_path, 'rb') as f:
                carry = b''
                while chunk := await f.read(CHUNK_SIZE_BYTES):
                    chunk = carry + chunk
                    # Hold back a trailing \r — it might be half of \r\n
                    if chunk.endswith(b'\r'):
                        carry = b'\r'
                        chunk = chunk[:-1]
                    else:
                        carry = b''
                    normalized = chunk.replace(b'\r\n', b'\n')
                    # Each converted pair shortens the data by one byte, so a
                    # length difference means this chunk contained a CRLF.
                    if not changed and len(normalized) != len(chunk):
                        changed = True
                    await tmp.write(normalized)
                # Flush any held-back \r (lone \r at EOF)
                if carry:
                    await tmp.write(carry)
            if changed:
                os.rename(temp_path, full_path)
            else:
                # Nothing was converted: discard the copy, keep the original
                os.unlink(temp_path)
        except BaseException:
            # Remove the temporary file on any failure (including
            # cancellation) before propagating the error.
            os.unlink(temp_path)
            raise
Expand Down
37 changes: 37 additions & 0 deletions services/datalad/tests/test_git.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import os
import zlib
from unittest.mock import patch

import falcon
from falcon import testing
import pygit2
import pytest

from datalad_service.common import git
from datalad_service.common.git import _normalize_line_endings_in_file
from datalad_service.handlers.git import _parse_commit
from datalad.api import Dataset

Expand Down Expand Up @@ -198,3 +201,37 @@ def test_git_tree(new_dataset):
repo = pygit2.Repository(new_dataset.path)
tree = git.git_tree(repo, str(repo.head.target), 'dataset_description.json')
assert tree.id == repo.get(repo.head.target).tree_id


@pytest.mark.parametrize(
    'chunk_size,content,expected',
    [
        # \r\n fully within a chunk
        (64, b'hello\r\nworld\r\n', b'hello\nworld\n'),
        # \r\n split across chunk boundary: \r at end of first chunk, \n at start of second
        (6, b'hello\r\nworld\r\n', b'hello\nworld\n'),
        # Two boundary splits with a 3-byte chunk: "ab\r" | "\nc\r" | "\n"
        (3, b'ab\r\nc\r\n', b'ab\nc\n'),
        # No CRLF at all — file should be unchanged
        (4, b'hello\nworld\n', b'hello\nworld\n'),
        # Lone \r not followed by \n should be preserved
        (4, b'hello\rworld\r\n', b'hello\rworld\n'),
        # Lone \r that ends a chunk ("hello\r" | "world\n") must not be dropped
        (6, b'hello\rworld\n', b'hello\rworld\n'),
        # \r as the very last byte ("a\r\nb" | "\r") must be written after a real change
        (4, b'a\r\nb\r', b'a\nb\r'),
    ],
    ids=[
        'crlf-within-chunk',
        'crlf-split-across-boundary',
        'multiple-boundary-splits',
        'no-crlf-unchanged',
        'lone-cr-preserved',
        'lone-cr-at-chunk-boundary',
        'lone-cr-at-eof',
    ],
)
@pytest.mark.asyncio
async def test_normalize_line_endings_boundary(tmp_path, chunk_size, content, expected):
    """Test CRLF normalization including the case where \\r\\n is split across chunk boundaries."""
    test_file = tmp_path / 'test.tsv'
    test_file.write_bytes(content)

    # Shrink the read size so short fixtures still span several chunks
    with patch('datalad_service.common.git.CHUNK_SIZE_BYTES', chunk_size):
        await _normalize_line_endings_in_file(str(test_file), str(tmp_path))

    assert test_file.read_bytes() == expected