move detect_jsonl_format to utils.py

snukky · snukky · commit 03c242e4cbf3 · 2025-07-07T16:30:03.000-07:00
diff --git a/leaderboard/models.py b/leaderboard/models.py
@@ -29,6 +29,7 @@
 from leaderboard.utils import process_xml_to_text
 from leaderboard.utils import analyze_jsonl_file, process_jsonl_to_text
 from leaderboard.utils import analyze_json_file, process_json_to_text
+from leaderboard.utils import detect_jsonl_format
 from ocelot.settings import MEDIA_ROOT
 
 MAX_CODE_LENGTH = 10  # ISO 639 codes need 3 chars, but better add buffer
@@ -411,60 +412,14 @@ def validate_xml_schema(xml_file):
         raise ValidationError(_msg)
 
 
-def _detect_jsonl_format(json_file):
-    """Detect whether a JSONL file uses the new wmtslavicllm2025_ format."""
-    json_file.seek(0)
-    
-    # Handle compressed files
-    if json_file.name.endswith('.jsonl.gz'):
-        if hasattr(json_file, 'temporary_file_path'):
-            file_path = json_file.temporary_file_path()
-        else:
-            # For in-memory files, write to temp file first
-            import tempfile
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.jsonl.gz') as temp_file:
-                json_file.seek(0)
-                temp_file.write(json_file.read())
-                file_path = temp_file.name
-
-        with smart_open(file_path, 'rt', encoding='utf-8') as f:
-            for line in f:
-                text = line.strip()
-                if text:
-                    try:
-                        obj = json.loads(text)
-                        dataset_id = obj.get('dataset_id', '')
-                        json_file.seek(0)
-                        return dataset_id.startswith('wmtslavicllm2025_')
-                    except json.JSONDecodeError:
-                        json_file.seek(0)
-                        return False
-    else:
-        # Handle uncompressed files
-        for line in json_file:
-            text = line.decode('utf-8').strip() if isinstance(line, bytes) else line.strip()
-            if text:
-                try:
-                    obj = json.loads(text)
-                    dataset_id = obj.get('dataset_id', '')
-                    json_file.seek(0)
-                    return dataset_id.startswith('wmtslavicllm2025_')
-                except json.JSONDecodeError:
-                    json_file.seek(0)
-                    return False
-    
-    json_file.seek(0)
-    return False
-
-
 def validate_jsonl_schema(json_file):
     """Validates JSONL file based on appropriate schema."""
     # Skip validation for non‐JSONL uploads
     if not (json_file.name.endswith('.jsonl') or json_file.name.endswith('.jsonl.gz')):
         return
 
     # Detect format and choose appropriate schema
-    is_st_mt_format = _detect_jsonl_format(json_file)
+    is_st_mt_format = detect_jsonl_format(json_file)
     schema = JSONL_WMT25_ST_MT_SCHEMA if is_st_mt_format else JSONL_WMT25_SCHEMA
 
     try:
@@ -712,7 +667,7 @@ def validate_jsonl_src_testset(json_file):
     src_langs = set()
     
     # Detect format
-    is_st_mt_format = _detect_jsonl_format(json_file)
+    is_st_mt_format = detect_jsonl_format(json_file)
 
     # Handle compressed files
     if json_file.name.endswith('.jsonl.gz'):
@@ -802,7 +757,7 @@ def validate_jsonl_ref_testset(json_file):
     ref_langs = set()
     
     # Detect format
-    is_st_mt_format = _detect_jsonl_format(json_file)
+    is_st_mt_format = detect_jsonl_format(json_file)
 
     # Handle compressed files
     if json_file.name.endswith('.jsonl.gz'):
@@ -899,7 +854,7 @@ def validate_jsonl_submission(json_file):
     has_hyps = False
     
     # Detect format
-    is_st_mt_format = _detect_jsonl_format(json_file)
+    is_st_mt_format = detect_jsonl_format(json_file)
 
     # Handle compressed files
     if json_file.name.endswith('.jsonl.gz'):
diff --git a/leaderboard/tests/test_utils.py b/leaderboard/tests/test_utils.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 
 from .common import TestCase, TESTDATA_DIR, analyze_xml_file, analyze_jsonl_file, process_xml_to_text, process_jsonl_to_text
+from leaderboard.utils import detect_jsonl_format
 
 
 class UtilsTests(TestCase):
@@ -123,6 +124,79 @@ def test_analyze_jsonl_file_with_multiple_languages(self):
         ]
         self.assertSetEqual(tgt_langs, set(expected_langs))
 
+    def test_detect_jsonl_format_standard_wmt25(self):
+        """A plain WMT25 record must not be reported as the ST MT format."""
+        import io
+
+        # Single-line upload stand-in whose dataset_id lacks the ST MT prefix
+        record = '{"dataset_id": "newssample2021", "src_text": "Hello", "doc_id": "test", "src_lang": "en", "segment_id": "1"}\n'
+        upload = io.BytesIO(record.encode('utf-8'))
+        upload.name = 'test.jsonl'
+
+        is_st_mt = detect_jsonl_format(upload)
+        self.assertFalse(is_st_mt)
+
+    def test_detect_jsonl_format_st_mt_format(self):
+        """A record whose dataset_id has the ST MT prefix must be detected."""
+        import io
+
+        # Single-line upload stand-in carrying a wmtslavicllm2025_ dataset_id
+        record = '{"dataset_id": "wmtslavicllm2025_de-dsb", "sent_id": "de-dsb-00001", "source": "Source text", "target": "Target text"}\n'
+        upload = io.BytesIO(record.encode('utf-8'))
+        upload.name = 'test.jsonl'
+
+        is_st_mt = detect_jsonl_format(upload)
+        self.assertTrue(is_st_mt)
+
+    def test_detect_jsonl_format_compressed_st_mt(self):
+        """Checks if compressed ST MT JSONL format is detected correctly.
+
+        The upload is a BytesIO, which has no temporary_file_path attribute,
+        so detect_jsonl_format takes its in-memory branch for .jsonl.gz names
+        (the one that copies the upload to a temp file before reading).
+        """
+        import gzip
+        import io
+
+        # One ST MT record; the dataset_id prefix is what detection keys on
+        jsonl_content = '{"dataset_id": "wmtslavicllm2025_de-dsb", "sent_id": "de-dsb-00001", "source": "Source text", "target": "Target text"}\n'
+
+        # Compress in memory: there is no scratch file to clean up when the
+        # assertion fails, and nothing is re-opened by name while still open
+        # (which is not portable to Windows).
+        payload = gzip.compress(jsonl_content.encode('utf-8'))
+        jsonl_file = io.BytesIO(payload)
+        jsonl_file.name = 'test.jsonl.gz'
+
+        result = detect_jsonl_format(jsonl_file)
+        self.assertTrue(result)
+
+        # The file handed in must come back rewound for the next reader
+        self.assertEqual(jsonl_file.tell(), 0)
+
+    def test_detect_jsonl_format_empty_file(self):
+        """An upload with no content must be reported as not ST MT."""
+        import io
+
+        # Zero-byte stand-in for an uploaded file
+        upload = io.BytesIO(b'')
+        upload.name = 'empty.jsonl'
+
+        is_st_mt = detect_jsonl_format(upload)
+        self.assertFalse(is_st_mt)
+
+    def test_detect_jsonl_format_invalid_json(self):
+        """A first line that is not valid JSON must be reported as not ST MT."""
+        import io
+
+        # Upload stand-in whose only line cannot be parsed as JSON
+        broken_record = '{"dataset_id": "test", invalid json}\n'
+        upload = io.BytesIO(broken_record.encode('utf-8'))
+        upload.name = 'invalid.jsonl'
+
+        is_st_mt = detect_jsonl_format(upload)
+        self.assertFalse(is_st_mt)
+
 
     #################################################################
     # Tests for process_xyz_to_text functions
diff --git a/leaderboard/utils.py b/leaderboard/utils.py
@@ -5,6 +5,7 @@
 import re
 from typing import Optional
 import json
+import tempfile
 
 import lxml.etree as ET
 from sacrebleu.utils import smart_open
@@ -13,6 +14,51 @@
 MISSING_TRANSLATION_MESSAGE = "NO TRANSLATION AVAILABLE"
 
 
+def detect_jsonl_format(json_file):
+    """Detect whether a JSONL file uses the new wmtslavicllm2025_ format.
+
+    Only the first non-blank line is inspected: True when it is a JSON object
+    whose 'dataset_id' is a string starting with 'wmtslavicllm2025_', False
+    otherwise (also for empty input and invalid JSON). json_file needs name,
+    seek() and read(); it is rewound to offset 0 before returning.
+    """
+    import os
+
+    def _is_st_mt(lines):
+        for line in lines:
+            text = (line.decode('utf-8') if isinstance(line, bytes) else line).strip()
+            if not text:
+                continue
+            try:
+                obj = json.loads(text)
+            except json.JSONDecodeError:
+                return False
+            # Non-object JSON or a non-string dataset_id (e.g. null) means
+            # "not ST MT" instead of an AttributeError.
+            dataset_id = obj.get('dataset_id') if isinstance(obj, dict) else None
+            return isinstance(dataset_id, str) and dataset_id.startswith('wmtslavicllm2025_')
+        return False
+
+    json_file.seek(0)
+    temp_path = None
+    try:
+        if not json_file.name.endswith('.jsonl.gz'):
+            return _is_st_mt(json_file)
+        if hasattr(json_file, 'temporary_file_path'):
+            file_path = json_file.temporary_file_path()
+        else:
+            # In-memory upload: spill to a temp file for smart_open to read.
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.jsonl.gz') as temp_file:
+                temp_file.write(json_file.read())
+                file_path = temp_path = temp_file.name
+        with smart_open(file_path, 'rt', encoding='utf-8') as f:
+            return _is_st_mt(f)
+    finally:
+        json_file.seek(0)
+        if temp_path is not None:
+            os.unlink(temp_path)  # only the copy created here, never the upload's own file
+
+
 def analyze_xml_file(xml_path):
     """
     Return all collection names, source languages, reference languages,
@@ -49,6 +95,20 @@ def analyze_xml_file(xml_path):
     return collections, src_langs, ref_langs, translators, systems
 
 
+def detect_jsonl_format_from_path(jsonl_path):
+    """Detect from its first line whether the JSONL file at jsonl_path is wmtslavicllm2025_ format."""
+    with smart_open(jsonl_path, 'rt', encoding='utf-8') as f:
+        first_line = f.readline().strip()
+    try:
+        obj = json.loads(first_line) if first_line else None
+    except json.JSONDecodeError:
+        return False
+    # Non-object JSON or a non-string dataset_id (e.g. null) means "not ST MT",
+    # not an AttributeError; an empty first line also ends up here as None.
+    dataset_id = obj.get('dataset_id') if isinstance(obj, dict) else None
+    return isinstance(dataset_id, str) and dataset_id.startswith('wmtslavicllm2025_')
+
+
 def analyze_jsonl_file(jsonl_path):
     """
     Return all collection IDs, source languages, reference languages,
@@ -65,16 +125,7 @@ def analyze_jsonl_file(jsonl_path):
     }
     
     # First, detect format by checking the first line
-    is_st_mt_format = False
-    with smart_open(jsonl_path, 'rt', encoding='utf-8') as f:
-        first_line = f.readline().strip()
-        if first_line:
-            try:
-                obj = json.loads(first_line)
-                dataset_id = obj.get('dataset_id', '')
-                is_st_mt_format = dataset_id.startswith('wmtslavicllm2025_')
-            except json.JSONDecodeError:
-                pass
+    is_st_mt_format = detect_jsonl_format_from_path(jsonl_path)
     
     # Read the JSONL file and extract the required information
     with smart_open(jsonl_path, 'rt', encoding='utf-8') as f:
@@ -353,16 +404,7 @@ def process_jsonl_to_text(
         )
 
     # First, detect format by checking the first line
-    is_st_mt_format = False
-    with smart_open(jsonl_path, 'rt', encoding='utf-8') as f:
-        first_line = f.readline().strip()
-        if first_line:
-            try:
-                obj = json.loads(first_line)
-                dataset_id = obj.get('dataset_id', '')
-                is_st_mt_format = dataset_id.startswith('wmtslavicllm2025_')
-            except json.JSONDecodeError:
-                pass
+    is_st_mt_format = detect_jsonl_format_from_path(jsonl_path)
 
     # Read and collect JSONL entries
     entries = []