#!/usr/bin/env python3
#
# SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
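"""Train a small SentencePiece tokenizer on LibriSpeech transcripts via torchaudio.

The script downloads the requested LibriSpeech subset, writes its transcripts to a
temporary text corpus, and trains a SentencePiece model on that corpus
(128 tokens by default).

Example invocation (the script filename is a placeholder; all flags are defined
in main() below):

    python train_librispeech_tokenizer.py --subset train-clean-100 --vocab_size 128 \
        --output_dir ./tokenizer_out --model_prefix librispeech_sp
"""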
import argparse
import os
import sys
import tempfile
from pathlib import Path

import torchaudio
from torchaudio.datasets import LIBRISPEECH

import sentencepiece as spm


def normalize_text(text: str, lowercase: bool = False) -> str:
    t = text.strip()
    if lowercase:
        t = t.lower()
    return t


def build_corpus(root: str, subset: str, lowercase: bool, limit: int | None) -> str:
    dataset = LIBRISPEECH(root=root, url=subset, download=True)
    n = len(dataset)
    if limit is not None:
        n = min(n, limit)

    tmp_fd, tmp_path = tempfile.mkstemp(prefix="librispeech_corpus_", suffix=".txt")
    os.close(tmp_fd)

    with open(tmp_path, "w", encoding="utf-8") as f:
        for idx in range(n):
            try:
                # Each LIBRISPEECH item is (waveform, sample_rate, transcript,
                # speaker_id, chapter_id, utterance_id); only the transcript is needed.
                _, _, transcript, *_ = dataset[idx]
            except Exception as ex:
                print(f"Warning: failed to read sample {idx}: {ex}", file=sys.stderr)
                continue
            line = normalize_text(transcript, lowercase)
            if line:
                f.write(line + "\n")
    return tmp_path


def train_sentencepiece(
    corpus_path: str,
    output_dir: str,
    vocab_size: int,
    model_type: str,
    character_coverage: float,
    model_prefix: str,
    pad_id: int | None,
    disable_bos_eos: bool,
    seed_sentencepiece: int | None,
    input_sentence_size: int,
):
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    model_prefix_path = str(Path(output_dir) / model_prefix)

    sp_args = [
        f"--input={corpus_path}",
        f"--model_prefix={model_prefix_path}",
        f"--vocab_size={vocab_size}",
        f"--model_type={model_type}",
        f"--character_coverage={character_coverage}",
        "--unk_id=0",
        "--input_sentence_size=" + str(input_sentence_size),
        "--shuffle_input_sentence=true",
        "--hard_vocab_limit=true",
        "--num_threads=32",
    ]

    if disable_bos_eos:
        sp_args += ["--bos_id=-1", "--eos_id=-1"]
    else:
        sp_args += ["--bos_id=1", "--eos_id=2"]

    if pad_id is None or pad_id < 0:
        sp_args += ["--pad_id=-1"]
    else:
        sp_args += [f"--pad_id={pad_id}"]

    if seed_sentencepiece is not None:
        sp_args += [f"--seed_sentencepiece_size={seed_sentencepiece}"]

    spm.SentencePieceTrainer.Train(" ".join(sp_args))

    model_path = model_prefix_path + ".model"
    vocab_path = model_prefix_path + ".vocab"
    return model_path, vocab_path

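
# A minimal sketch (not executed here) of loading the trained model with the
# standard sentencepiece inference API; the model path is a placeholder for
# <output_dir>/<model_prefix>.model written by train_sentencepiece():
#
#   sp = spm.SentencePieceProcessor(model_file="tokenizer_out/librispeech_sp.model")
#   ids = sp.encode("HELLO WORLD", out_type=int)
#   text = sp.decode(ids)

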
def main():
    parser = argparse.ArgumentParser(description="Train a 128-token SentencePiece tokenizer on LibriSpeech using torchaudio.")
    parser.add_argument("--root", type=str, default="./data", help="Directory to store/look up LibriSpeech.")
    parser.add_argument("--subset", type=str, default="train-clean-100",
                        choices=[
                            "train-clean-100", "train-clean-360", "train-other-500",
                            "dev-clean", "dev-other", "test-clean", "test-other",
                        ],
                        help="LibriSpeech subset to use.")
    parser.add_argument("--output_dir", type=str, default="./tokenizer_out", help="Where to write the tokenizer files.")
    parser.add_argument("--vocab_size", type=int, default=128, help="Total vocabulary size.")
    parser.add_argument("--model_type", type=str, default="unigram", choices=["unigram", "bpe"],
                        help="SentencePiece model type; 'unigram' generally works well for small English vocabularies.")
    parser.add_argument("--character_coverage", type=float, default=1.0,
                        help="Fraction of characters covered by the model (1.0 is fine for English).")
    parser.add_argument("--model_prefix", type=str, default="librispeech_sp",
                        help="Prefix (filename stem) for the trained model.")
    parser.add_argument("--lowercase", action="store_true", help="Lowercase transcripts before training.")
    parser.add_argument("--pad_id", type=int, default=-1, help="Pad ID; set to -1 to disable (default).")
    parser.add_argument("--disable_bos_eos", action="store_true", help="Disable BOS/EOS (recommended).")
    parser.add_argument("--enable_bos_eos", action="store_true", help="Enable BOS/EOS tokens.")
    parser.add_argument("--limit", type=int, default=None, help="Limit the number of samples for a quick run.")
    parser.add_argument("--seed_sentencepiece_size", type=int, default=None,
                        help="Advanced: initial seed size for SentencePiece's sentence sampling (optional).")
    parser.add_argument("--input_sentence_size", type=int, default=1000000,
                        help="Number of sentences to sample during training.")

    args = parser.parse_args()

    # BOS/EOS are disabled by default; --enable_bos_eos turns them on, and an
    # explicit --disable_bos_eos takes precedence if both flags are given.
    disable_bos_eos = True
    if args.enable_bos_eos:
        disable_bos_eos = False
    if args.disable_bos_eos:
        disable_bos_eos = True

    print(f"Preparing corpus from LibriSpeech subset='{args.subset}'...")
    corpus_path = build_corpus(root=args.root, subset=args.subset, lowercase=args.lowercase, limit=args.limit)

    print(f"Training SentencePiece... corpus_path {corpus_path}")
    model_path, vocab_path = train_sentencepiece(
        corpus_path=corpus_path,
        output_dir=args.output_dir,
        vocab_size=args.vocab_size,
        model_type=args.model_type,
        character_coverage=args.character_coverage,
        model_prefix=args.model_prefix,
        pad_id=args.pad_id,
        disable_bos_eos=disable_bos_eos,
        seed_sentencepiece=args.seed_sentencepiece_size,
        input_sentence_size=args.input_sentence_size,
    )

    print("Done!")
    print(f"Model : {model_path}")
    print(f"Vocab : {vocab_path}")


if __name__ == "__main__":
    main()