Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions examples/models/parakeet/export_parakeet_tdt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import argparse
import os
import shutil
import tarfile
import tempfile

import torch
import torchaudio
Expand Down Expand Up @@ -188,6 +191,51 @@ def load_model():
return model


def extract_tokenizer(output_dir: str) -> str | None:
    """Extract ``tokenizer.model`` from the cached .nemo checkpoint.

    The .nemo file is downloaded (or resolved from the local Hugging Face
    cache) and read as a tar archive. The tokenizer member is streamed
    directly to ``output_dir`` rather than extracted via ``tar.extract``,
    so an archive member name containing ``..`` or an absolute path can
    never write outside the intended directory (path-traversal safe).

    Args:
        output_dir: Directory to save the tokenizer.model file.

    Returns:
        Path to the extracted tokenizer.model, or None if extraction failed
        (member missing, member not a regular file, or unreadable archive).
    """
    # Local import: huggingface_hub is an optional dependency of this export
    # script; importing lazily keeps the module importable without it.
    from huggingface_hub import hf_hub_download

    # Download/get cached .nemo file path.
    nemo_path = hf_hub_download(
        repo_id="nvidia/parakeet-tdt-0.6b-v3",
        filename="parakeet-tdt-0.6b-v3.nemo",
    )

    # .nemo files are tar archives - extract tokenizer.model from it.
    tokenizer_filename = "tokenizer.model"
    output_path = os.path.join(output_dir, tokenizer_filename)

    try:
        with tarfile.open(nemo_path, "r") as tar:
            # Find tokenizer.model in the archive (may be in root or a
            # subdirectory, so match on the name suffix).
            tokenizer_member = next(
                (m for m in tar.getmembers() if m.name.endswith(tokenizer_filename)),
                None,
            )

            if tokenizer_member is None:
                print(f"Warning: {tokenizer_filename} not found in .nemo archive")
                return None
            if not tokenizer_member.isfile():
                # Refuse symlinks/directories/devices masquerading as the tokenizer.
                print(f"Warning: {tokenizer_filename} in .nemo archive is not a regular file")
                return None

            # Stream the member's bytes straight to output_path. This avoids
            # tar.extract() entirely, which would honor the archive-supplied
            # path and is the classic path-traversal vector.
            with tar.extractfile(tokenizer_member) as src, open(output_path, "wb") as dst:
                shutil.copyfileobj(src, dst)
    except (tarfile.TarError, OSError) as e:
        # Corrupted/unreadable archive or filesystem error: report and keep
        # the documented "None on failure" contract instead of crashing.
        print(f"Error: failed to extract tokenizer from '{nemo_path}': {e}")
        return None

    print(f"Extracted tokenizer to: {output_path}")
    return output_path


class JointAfterProjection(torch.nn.Module):
def __init__(self, joint):
super().__init__()
Expand Down Expand Up @@ -401,6 +449,9 @@ def main():

os.makedirs(args.output_dir, exist_ok=True)

print("Extracting tokenizer...")
extract_tokenizer(args.output_dir)
Copy link

Copilot AI Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return value from extract_tokenizer is not checked. If the tokenizer extraction fails (returns None), the script will continue execution without the tokenizer file. Consider checking the return value and handling failure appropriately, such as by logging a warning or raising an error if the tokenizer is required for the export.

Suggested change
extract_tokenizer(args.output_dir)
tokenizer_path = extract_tokenizer(args.output_dir)
if tokenizer_path is None:
raise RuntimeError(
f"Failed to extract tokenizer into output directory: {args.output_dir}"
)

Copilot uses AI. Check for mistakes.

print("Loading model...")
model = load_model()

Expand Down
Loading