55import re
66from typing import Optional
77import json
8+ import tempfile
89
910import lxml .etree as ET
1011from sacrebleu .utils import smart_open
1314MISSING_TRANSLATION_MESSAGE = "NO TRANSLATION AVAILABLE"
1415
1516
def detect_jsonl_format(json_file):
    """Detect whether an open JSONL file uses the new wmtslavicllm2025_ format.

    The first non-blank line is parsed as JSON; the file is considered to be
    in the new format when that record is an object whose ``dataset_id`` is a
    string starting with ``wmtslavicllm2025_``.

    Args:
        json_file: A seekable file-like object. If it has a ``name`` ending
            in ``.jsonl.gz`` it must yield bytes and is read as gzip;
            otherwise it is iterated line by line and may yield either
            ``bytes`` or ``str``.

    Returns:
        True if the first record identifies the new format, False otherwise
        (including empty files, invalid JSON, non-object records, and
        missing or non-string ``dataset_id`` values).

    The file position is reset to 0 before returning, even when reading or
    decoding raises.
    """
    # Imported locally so this function does not depend on the module's
    # import block being updated.
    import gzip

    # File objects without a usable name are treated as uncompressed.
    file_name = str(getattr(json_file, 'name', '') or '')

    json_file.seek(0)
    try:
        text = ''
        if file_name.endswith('.jsonl.gz'):
            # Decompress straight from the file object. Copying the payload
            # into a NamedTemporaryFile(delete=False) first left a stray
            # file on disk for every call, because nothing removed it.
            with gzip.open(json_file, 'rt', encoding='utf-8') as stream:
                for line in stream:
                    text = line.strip()
                    if text:
                        break
        else:
            for line in json_file:
                if isinstance(line, bytes):
                    line = line.decode('utf-8')
                text = line.strip()
                if text:
                    break

        if not text:
            return False

        try:
            obj = json.loads(text)
        except json.JSONDecodeError:
            return False

        # Valid JSON is not necessarily an object, and dataset_id is not
        # necessarily a string; neither case is the new format.
        dataset_id = obj.get('dataset_id', '') if isinstance(obj, dict) else ''
        return isinstance(dataset_id, str) and dataset_id.startswith('wmtslavicllm2025_')
    finally:
        # Always hand the file back rewound so callers can re-read it.
        json_file.seek(0)
60+
61+
1662def analyze_xml_file (xml_path ):
1763 """
1864 Return all collection names, source languages, reference languages,
@@ -49,6 +95,20 @@ def analyze_xml_file(xml_path):
4995 return collections , src_langs , ref_langs , translators , systems
5096
5197
def detect_jsonl_format_from_path(jsonl_path):
    """Detect whether the JSONL file at a path uses the new wmtslavicllm2025_ format.

    The file is opened with ``smart_open`` in text mode and its first
    non-blank line is parsed as JSON. Leading blank lines are skipped so the
    result agrees with ``detect_jsonl_format`` for the same content.

    Args:
        jsonl_path: Path of the JSONL file to inspect.

    Returns:
        True if the first record is a JSON object whose ``dataset_id`` is a
        string starting with ``wmtslavicllm2025_``; False otherwise
        (including empty files, invalid JSON, non-object records, and
        missing or non-string ``dataset_id`` values).
    """
    with smart_open(jsonl_path, 'rt', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        first_record = next((text for text in stripped_lines if text), '')

    if not first_record:
        return False

    try:
        obj = json.loads(first_record)
    except json.JSONDecodeError:
        return False

    # Valid JSON may be a list/scalar, and dataset_id may be null or numeric;
    # calling .get()/.startswith() on those would raise AttributeError.
    dataset_id = obj.get('dataset_id', '') if isinstance(obj, dict) else ''
    return isinstance(dataset_id, str) and dataset_id.startswith('wmtslavicllm2025_')
110+
111+
52112def analyze_jsonl_file (jsonl_path ):
53113 """
54114 Return all collection IDs, source languages, reference languages,
@@ -65,16 +125,7 @@ def analyze_jsonl_file(jsonl_path):
65125 }
66126
67127 # First, detect format by checking the first line
68- is_st_mt_format = False
69- with smart_open (jsonl_path , 'rt' , encoding = 'utf-8' ) as f :
70- first_line = f .readline ().strip ()
71- if first_line :
72- try :
73- obj = json .loads (first_line )
74- dataset_id = obj .get ('dataset_id' , '' )
75- is_st_mt_format = dataset_id .startswith ('wmtslavicllm2025_' )
76- except json .JSONDecodeError :
77- pass
128+ is_st_mt_format = detect_jsonl_format_from_path (jsonl_path )
78129
79130 # Read the JSONL file and extract the required information
80131 with smart_open (jsonl_path , 'rt' , encoding = 'utf-8' ) as f :
@@ -353,16 +404,7 @@ def process_jsonl_to_text(
353404 )
354405
355406 # First, detect format by checking the first line
356- is_st_mt_format = False
357- with smart_open (jsonl_path , 'rt' , encoding = 'utf-8' ) as f :
358- first_line = f .readline ().strip ()
359- if first_line :
360- try :
361- obj = json .loads (first_line )
362- dataset_id = obj .get ('dataset_id' , '' )
363- is_st_mt_format = dataset_id .startswith ('wmtslavicllm2025_' )
364- except json .JSONDecodeError :
365- pass
407+ is_st_mt_format = detect_jsonl_format_from_path (jsonl_path )
366408
367409 # Read and collect JSONL entries
368410 entries = []
0 commit comments