Skip to content

Commit 03c242e

Browse files
committed
move detect_jsonl_format to utils.py
1 parent fd23378 commit 03c242e

3 files changed

Lines changed: 141 additions & 70 deletions

File tree

leaderboard/models.py

Lines changed: 5 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from leaderboard.utils import process_xml_to_text
3030
from leaderboard.utils import analyze_jsonl_file, process_jsonl_to_text
3131
from leaderboard.utils import analyze_json_file, process_json_to_text
32+
from leaderboard.utils import detect_jsonl_format
3233
from ocelot.settings import MEDIA_ROOT
3334

3435
MAX_CODE_LENGTH = 10 # ISO 639 codes need 3 chars, but better add buffer
@@ -411,60 +412,14 @@ def validate_xml_schema(xml_file):
411412
raise ValidationError(_msg)
412413

413414

414-
def _detect_jsonl_format(json_file):
415-
"""Detect whether a JSONL file uses the new wmtslavicllm2025_ format."""
416-
json_file.seek(0)
417-
418-
# Handle compressed files
419-
if json_file.name.endswith('.jsonl.gz'):
420-
if hasattr(json_file, 'temporary_file_path'):
421-
file_path = json_file.temporary_file_path()
422-
else:
423-
# For in-memory files, write to temp file first
424-
import tempfile
425-
with tempfile.NamedTemporaryFile(delete=False, suffix='.jsonl.gz') as temp_file:
426-
json_file.seek(0)
427-
temp_file.write(json_file.read())
428-
file_path = temp_file.name
429-
430-
with smart_open(file_path, 'rt', encoding='utf-8') as f:
431-
for line in f:
432-
text = line.strip()
433-
if text:
434-
try:
435-
obj = json.loads(text)
436-
dataset_id = obj.get('dataset_id', '')
437-
json_file.seek(0)
438-
return dataset_id.startswith('wmtslavicllm2025_')
439-
except json.JSONDecodeError:
440-
json_file.seek(0)
441-
return False
442-
else:
443-
# Handle uncompressed files
444-
for line in json_file:
445-
text = line.decode('utf-8').strip() if isinstance(line, bytes) else line.strip()
446-
if text:
447-
try:
448-
obj = json.loads(text)
449-
dataset_id = obj.get('dataset_id', '')
450-
json_file.seek(0)
451-
return dataset_id.startswith('wmtslavicllm2025_')
452-
except json.JSONDecodeError:
453-
json_file.seek(0)
454-
return False
455-
456-
json_file.seek(0)
457-
return False
458-
459-
460415
def validate_jsonl_schema(json_file):
461416
"""Validates JSONL file based on appropriate schema."""
462417
# Skip validation for non‐JSONL uploads
463418
if not (json_file.name.endswith('.jsonl') or json_file.name.endswith('.jsonl.gz')):
464419
return
465420

466421
# Detect format and choose appropriate schema
467-
is_st_mt_format = _detect_jsonl_format(json_file)
422+
is_st_mt_format = detect_jsonl_format(json_file)
468423
schema = JSONL_WMT25_ST_MT_SCHEMA if is_st_mt_format else JSONL_WMT25_SCHEMA
469424

470425
try:
@@ -712,7 +667,7 @@ def validate_jsonl_src_testset(json_file):
712667
src_langs = set()
713668

714669
# Detect format
715-
is_st_mt_format = _detect_jsonl_format(json_file)
670+
is_st_mt_format = detect_jsonl_format(json_file)
716671

717672
# Handle compressed files
718673
if json_file.name.endswith('.jsonl.gz'):
@@ -802,7 +757,7 @@ def validate_jsonl_ref_testset(json_file):
802757
ref_langs = set()
803758

804759
# Detect format
805-
is_st_mt_format = _detect_jsonl_format(json_file)
760+
is_st_mt_format = detect_jsonl_format(json_file)
806761

807762
# Handle compressed files
808763
if json_file.name.endswith('.jsonl.gz'):
@@ -899,7 +854,7 @@ def validate_jsonl_submission(json_file):
899854
has_hyps = False
900855

901856
# Detect format
902-
is_st_mt_format = _detect_jsonl_format(json_file)
857+
is_st_mt_format = detect_jsonl_format(json_file)
903858

904859
# Handle compressed files
905860
if json_file.name.endswith('.jsonl.gz'):

leaderboard/tests/test_utils.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pathlib import Path
66

77
from .common import TestCase, TESTDATA_DIR, analyze_xml_file, analyze_jsonl_file, process_xml_to_text, process_jsonl_to_text
8+
from leaderboard.utils import detect_jsonl_format
89

910

1011
class UtilsTests(TestCase):
@@ -123,6 +124,79 @@ def test_analyze_jsonl_file_with_multiple_languages(self):
123124
]
124125
self.assertSetEqual(tgt_langs, set(expected_langs))
125126

127+
def test_detect_jsonl_format_standard_wmt25(self):
    """A standard WMT25 record must not be detected as the ST MT format."""
    import io

    # dataset_id does not carry the wmtslavicllm2025_ prefix
    record = '{"dataset_id": "newssample2021", "src_text": "Hello", "doc_id": "test", "src_lang": "en", "segment_id": "1"}\n'
    upload = io.BytesIO(record.encode('utf-8'))
    upload.name = 'test.jsonl'

    self.assertFalse(detect_jsonl_format(upload))
138+
139+
def test_detect_jsonl_format_st_mt_format(self):
    """A record with a wmtslavicllm2025_ dataset_id is detected as ST MT."""
    import io

    # Uncompressed in-memory upload whose first record uses the ST MT prefix
    record = '{"dataset_id": "wmtslavicllm2025_de-dsb", "sent_id": "de-dsb-00001", "source": "Source text", "target": "Target text"}\n'
    upload = io.BytesIO(record.encode('utf-8'))
    upload.name = 'test.jsonl'

    self.assertTrue(detect_jsonl_format(upload))
150+
151+
def test_detect_jsonl_format_compressed_st_mt(self):
    """Checks if compressed ST MT JSONL format is detected correctly."""
    import gzip
    import io

    jsonl_content = '{"dataset_id": "wmtslavicllm2025_de-dsb", "sent_id": "de-dsb-00001", "source": "Source text", "target": "Target text"}\n'

    # Compress in memory instead of round-tripping through a
    # NamedTemporaryFile(delete=False): that file leaked whenever the
    # assertion failed (cleanup was not in a finally block), and it was
    # reopened by name while the original handle was still open.
    jsonl_file = io.BytesIO(gzip.compress(jsonl_content.encode('utf-8')))
    jsonl_file.name = 'test.jsonl.gz'

    result = detect_jsonl_format(jsonl_file)
    self.assertTrue(result)
176+
177+
def test_detect_jsonl_format_empty_file(self):
    """An empty JSONL upload must be reported as not ST MT format."""
    import io

    # Zero-byte in-memory upload: there is no record to inspect
    upload = io.BytesIO(b'')
    upload.name = 'empty.jsonl'

    self.assertFalse(detect_jsonl_format(upload))
187+
188+
def test_detect_jsonl_format_invalid_json(self):
    """A first line that is not valid JSON must yield False, not an error."""
    import io

    # The trailing "invalid json" token makes the line unparseable
    broken_line = '{"dataset_id": "test", invalid json}\n'
    upload = io.BytesIO(broken_line.encode('utf-8'))
    upload.name = 'invalid.jsonl'

    self.assertFalse(detect_jsonl_format(upload))
199+
126200

127201
#################################################################
128202
# Tests for process_xyz_to_text functions

leaderboard/utils.py

Lines changed: 62 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import re
66
from typing import Optional
77
import json
8+
import tempfile
89

910
import lxml.etree as ET
1011
from sacrebleu.utils import smart_open
@@ -13,6 +14,51 @@
1314
MISSING_TRANSLATION_MESSAGE = "NO TRANSLATION AVAILABLE"
1415

1516

17+
def detect_jsonl_format(json_file):
    """Detect whether a JSONL file uses the new wmtslavicllm2025_ format.

    Only the first non-blank line is inspected: the file is reported as the
    new format when that line is a JSON object whose ``dataset_id`` is a
    string starting with ``wmtslavicllm2025_``.

    Args:
        json_file: Seekable file-like object with a ``name`` attribute.
            Names ending in ``.jsonl.gz`` are read as gzip-compressed data;
            anything else is iterated directly and may yield bytes or str.

    Returns:
        True for the wmtslavicllm2025_ format, False otherwise (including
        empty files, invalid JSON, non-object records and non-string
        ``dataset_id`` values).

    The read position of ``json_file`` is rewound to 0 before returning,
    even when reading raises.
    """
    # Local import keeps this block self-contained.
    import gzip

    json_file.seek(0)
    try:
        if json_file.name.endswith('.jsonl.gz'):
            # Decompress straight from the file object. This avoids copying
            # the upload into a NamedTemporaryFile(delete=False) that was
            # never removed. Closing the gzip wrapper leaves json_file open.
            with gzip.open(json_file, 'rt', encoding='utf-8') as stream:
                return _first_record_is_st_mt(stream)
        return _first_record_is_st_mt(json_file)
    finally:
        # Callers re-read the file afterwards, so always rewind.
        json_file.seek(0)


def _first_record_is_st_mt(lines):
    """Return True if the first non-blank line is a wmtslavicllm2025_ record."""
    for line in lines:
        text = line.decode('utf-8') if isinstance(line, bytes) else line
        text = text.strip()
        if not text:
            continue
        try:
            obj = json.loads(text)
        except json.JSONDecodeError:
            return False
        # json.loads may return a list, number, string, ... which has no .get()
        if not isinstance(obj, dict):
            return False
        dataset_id = obj.get('dataset_id', '')
        return isinstance(dataset_id, str) and dataset_id.startswith('wmtslavicllm2025_')
    return False
60+
61+
1662
def analyze_xml_file(xml_path):
1763
"""
1864
Return all collection names, source languages, reference languages,
@@ -49,6 +95,20 @@ def analyze_xml_file(xml_path):
4995
return collections, src_langs, ref_langs, translators, systems
5096

5197

98+
def detect_jsonl_format_from_path(jsonl_path):
    """Detect whether a JSONL file uses the new wmtslavicllm2025_ format from file path.

    The file is opened with ``smart_open`` (imported from ``sacrebleu.utils``)
    and the first non-blank line is inspected, mirroring how
    ``detect_jsonl_format`` skips leading blank lines for file objects.

    Args:
        jsonl_path: Path of the JSONL file to inspect.

    Returns:
        True when the first non-blank line is a JSON object whose
        ``dataset_id`` is a string starting with ``wmtslavicllm2025_``;
        False otherwise (empty file, invalid JSON, non-object record or
        non-string ``dataset_id``).
    """
    with smart_open(jsonl_path, 'rt', encoding='utf-8') as f:
        for line in f:
            text = line.strip()
            if not text:
                # Skip leading blank lines instead of giving up on them
                continue
            try:
                obj = json.loads(text)
            except json.JSONDecodeError:
                return False
            # json.loads may return a list, number, string, ... without .get()
            if not isinstance(obj, dict):
                return False
            dataset_id = obj.get('dataset_id', '')
            return isinstance(dataset_id, str) and dataset_id.startswith('wmtslavicllm2025_')
    return False
110+
111+
52112
def analyze_jsonl_file(jsonl_path):
53113
"""
54114
Return all collection IDs, source languages, reference languages,
@@ -65,16 +125,7 @@ def analyze_jsonl_file(jsonl_path):
65125
}
66126

67127
# First, detect format by checking the first line
68-
is_st_mt_format = False
69-
with smart_open(jsonl_path, 'rt', encoding='utf-8') as f:
70-
first_line = f.readline().strip()
71-
if first_line:
72-
try:
73-
obj = json.loads(first_line)
74-
dataset_id = obj.get('dataset_id', '')
75-
is_st_mt_format = dataset_id.startswith('wmtslavicllm2025_')
76-
except json.JSONDecodeError:
77-
pass
128+
is_st_mt_format = detect_jsonl_format_from_path(jsonl_path)
78129

79130
# Read the JSONL file and extract the required information
80131
with smart_open(jsonl_path, 'rt', encoding='utf-8') as f:
@@ -353,16 +404,7 @@ def process_jsonl_to_text(
353404
)
354405

355406
# First, detect format by checking the first line
356-
is_st_mt_format = False
357-
with smart_open(jsonl_path, 'rt', encoding='utf-8') as f:
358-
first_line = f.readline().strip()
359-
if first_line:
360-
try:
361-
obj = json.loads(first_line)
362-
dataset_id = obj.get('dataset_id', '')
363-
is_st_mt_format = dataset_id.startswith('wmtslavicllm2025_')
364-
except json.JSONDecodeError:
365-
pass
407+
is_st_mt_format = detect_jsonl_format_from_path(jsonl_path)
366408

367409
# Read and collect JSONL entries
368410
entries = []

0 commit comments

Comments
 (0)