diff --git a/README.md b/README.md index a6defc05b..fc0b33c4d 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ The SpeechBrain Benchmarks currently include the following: - [MOABB](https://github.com/speechbrain/benchmarks/tree/main/benchmarks/MOABB) - A benchmark designed for evaluating neural models in well-known EEG tasks like motor imagery, P300, and SSVEP. -- [DASB](https://github.com/speechbrain/benchmarks/tree/main/benchmarks/DASB) - A benchmark designed for evaluating discrete audio tokens across a wide range of discriminative +- [DASB](https://github.com/speechbrain/benchmarks/tree/DASB/benchmarks/DASB) - A benchmark designed for evaluating discrete audio tokens across a wide range of discriminative and generative tasks. diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/audio_tokens.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/audio_tokens.py deleted file mode 120000 index e34e113e5..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/audio_tokens.py +++ /dev/null @@ -1 +0,0 @@ -../../../utils/audio_tokens.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py index bcb2670a6..52b7e1817 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py @@ -51,17 +51,7 @@ def __init__(self, hparams, create_waveform_fn, device): else: self.evaluators = {} - bulk_evaluators = getattr(self.hparams, "bulk_evaluators", {}) - if bulk_evaluators: - self.bulk_evaluators = { - key: evaluator_f() - for key, evaluator_f in bulk_evaluators.items() - if key in self.enabled_evaluators - } - else: - self.bulk_evaluators = {} - - if not self.evaluators and not self.bulk_evaluators: + if not self.evaluators: logger.warn( "No evaluators were defined - this run will produce samples only" ) @@ -98,9 +88,7 @@ def on_evaluate_start(self, stage, epoch): self.create_reports() self.modules.model.show_inference_progress = False self.item_ids = [] - details_keys = list(self.evaluators.keys()) + list( - self.bulk_evaluators.keys() - ) + details_keys = list(self.evaluators.keys()) self.details = {evaluator_key: [] for evaluator_key in details_keys} self.sample_text = [] self.sample_file_names = [] @@ -141,7 +129,6 @@ def on_evaluate_end(self): dataset : speechbrain.dataio.dataset.DynamicItemDataset a dataset """ - self.evaluate_bulk() self.write_summary() logger.info("Evaluation done") @@ -182,19 +169,6 @@ def get_report_columns(self, evaluator_key): wavs_ref=bogus_wavs, length_ref=bogus_length, ) - else: - bogus_file_name = self.output_folder / "bogus.wav" - evaluator = self.bulk_evaluators[evaluator_key] - sb.dataio.dataio.write_audio( - str(bogus_file_name), - bogus_wavs[0].cpu(), - samplerate=self.hparams.model_sample_rate, - ) - result = evaluator.evaluate_files( - file_names=[bogus_file_name], - text=["BOGUS"], - file_names_ref=[bogus_file_name], - ) return ["uttid"] + list(result.details.keys()) @@ -228,19 +202,6 @@ def evaluate_batch(self, batch): self.write_result(evaluator_key, batch.uttid, details) self.details[evaluator_key].extend(details) - def evaluate_bulk(self): - """Runs all configured bulk evaluators, which evaluate a directory - of files - rather than one file at a time""" - for evaluator_key, evaluator in self.bulk_evaluators.items(): - result = evaluator.evaluate_files( - file_names=self.sample_file_names, - text=self.sample_text, - file_names_ref=self.ref_file_names, - ) - self.details[evaluator_key].append(result.details) - details = 
undo_batch(result.details) - self.write_result(evaluator_key, self.item_ids, details) - def write_result(self, evaluator_key, uttid, details): """Outputs the result details to the report for the specified evaluator diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index bdf6c0f75..dcdc6d920 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -1,50 +1,56 @@ +# ############################################################################ +# Evaluation Hyperparameters +# Common to old models, appended to main hyperparameters +# +# Authors: Artem Ploujnikov +# ############################################################################ + +eval_enabled: True eval_sample_rate: 16000 eval_samples: null eval_interval: 1 eval_asr_type: whisper -eval_asr_source: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech - whisper: openai/whisper-small +eval_asr_source: openai/whisper-small evaluations: utmos,asr tmp_folder: null -utmos_batch_size: 8 -utmos_model_path: ./utmos -utmos_ckpt_name: epoch=3-step=7459.ckpt -utmos_ckpt_path: !ref / -utmos_use_python: True -utmos_script: predict.py - - -eval_asr: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encoder_decoder: !name:eval.EncoderDecoderASRSpeechEvaluator - source: !ref - sample_rate: !ref - overrides: - lm_weight: 0.0 - whisper: !name:eval.WhisperASRSpeechEvaluator - source: !ref - sample_rate: !ref - savedir: !ref +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_utmos: !name:eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_asr: !name:eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref evaluators: + utmos: !ref asr: !ref -bulk_evaluators: - utmos: !name:eval.UTMOSSpeechEvaluator - model_path: !ref - output_folder: !ref - ckpt_path: !ref - batch_size: !ref - script: !ref - use_python: !ref - tmp_folder: !ref - eval_summary: asr: descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] utmos: descriptive: ["utmos"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median + +eval_threshold: + dwer_max: 90.0 + +eval_threshold_set: + utmos: 0.0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index cd4f338bc..d49afdf29 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -8,18 +8,23 @@ experiment_name: tokotron/dac # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
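Note on the eval.yaml hunk above: it drops the bulk UTMOS evaluator in favour of a per-sample one and introduces three new keys - eval_summary_log (which evaluator statistics get echoed to the train log), eval_threshold (sanity limits such as dwer_max), and eval_threshold_set (values forced when a limit is violated). The sketch below is illustrative only; it mirrors the get_summary_stats / _check_threshold logic added to train.py further down in this diff, and the summary values are made up.

# Illustrative sketch (not part of the patch): consuming the new eval.yaml keys.
eval_summary_log = {"utmos": "utmos_utmos_mean", "dwer": "asr_dwer_median"}
eval_threshold = {"dwer_max": 90.0}
eval_threshold_set = {"utmos": 0.0}

# Summary computed by the evaluator at the end of an evaluation epoch (made-up values)
eval_summary = {"utmos_utmos_mean": 3.8, "asr_dwer_median": 97.2}

# Map long evaluator keys to the short names reported in the train log
stats = {short: eval_summary.get(long) for short, long in eval_summary_log.items()}

# "dwer_max: 90.0" means the run is treated as broken when dwer > 90.0, in which
# case the metrics listed in eval_threshold_set are overridden (e.g. UTMOS forced
# to 0, since silent samples can score a deceptively high UTMOS).
for key, limit in eval_threshold.items():
    name, kind = key.rsplit("_", 1)
    meets = stats[name] >= limit if kind == "min" else stats[name] <= limit
    if not meets:
        stats["broken"] = True
        stats.update(eval_threshold_set)

print(stats)  # {'utmos': 0.0, 'dwer': 97.2, 'broken': True}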
+ token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" + +# Model type +representation_mode: discrete # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/dac +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -29,16 +34,27 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +token_model_kwargs: + n_quantizers: !ref + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -61,7 +77,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -85,8 +101,8 @@ model_bitrate: 8kbps # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -94,24 +110,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref silence_padding: !ref -# Token model (pretrained) -dac: !new:speechbrain.lobes.models.discrete.dac.DAC - sample_rate: !ref - model_type: !ref - model_bitrate: !ref - load_pretrained: True - -# Token model (pretrained) -token_model: !new:Tokotron.DACFeatureExtractor - dac: !ref - n_quantizers: !ref # Dataloader options train_dataloader_opts: @@ -143,20 +148,13 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - ####################### Model parameters ########################### # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 @@ -165,6 +163,7 @@ audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -178,7 +177,7 @@ attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref 
audio_tokens_per_step: !ref @@ -198,15 +197,23 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + + modules: model: !ref - dac: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -226,10 +233,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index f8a0ee622..af723f6c9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -4,22 +4,24 @@ # ############################################################################ experiment_name: tokotron/discrete_ssl - # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER # Model Type ssl_model_type: wavlm - -output_folder: !ref results/tokotron/// +representation_mode: discrete +output_folder: !ref results/// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
# Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/discrete- +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref vocoder_model_name: !ref unithifigan-dasb--discrete vocoder_model_path: !ref / @@ -36,36 +38,39 @@ progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 - +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref freeze_token_model: True token_model_src: !apply:speechbrain.utils.hparams.choice value: !ref choices: wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self - + wav2vec2: facebook/wav2vec2-large g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization -token_model_kmeans_dataset: LibriSpeech-100-360-500 -ssl_model_layers: [1, 3, 7, 12, 18, 23] -token_model_layers: !ref -token_offset: 1 -vocoder_src: !apply:speechbrain.utils.hparams.choice +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice value: !ref choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False - -vocoder_available_layers: [1, 3, 7, 12, 18, 23] - splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -80,7 +85,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 - +data_scale: null # index pad_index: 0 @@ -91,7 +96,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -107,12 +112,6 @@ eos_mode: gate decoder_mode: autoregressive scale_factor: 4 -# Beam Search-specific parameters -min_decode_ratio: 1.0 -max_decode_ratio: 10.0 -beam_size: 5 - - # Feature parameters sample_rate: 22050 model_sample_rate: 16000 @@ -122,8 +121,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -131,15 +130,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref - silence_padding: !ref use_silence_padding: True - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -159,16 +156,6 @@ ssl_model: !apply:speechbrain.utils.hparams.choice save_path: !ref freeze: !ref 
output_all_hiddens: True - - -token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - ssl_model: !ref - kmeans_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref - save_path: !ref - layers_num: !ref - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa @@ -181,58 +168,42 @@ train_dataloader_opts: collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - valid_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - test_dataloader_opts: batch_size: 1 num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - sample_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - token_model_kwargs: - SSL_layers: !ref - -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - token_model_kwargs: !ref - ssl_model: !ref - ssl_model_layers: !ref - token_model_layers: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - + SSL_layers: !ref ####################### Model parameters ########################### # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU -audio_num_tokens: 1000 +vocab_size: 1000 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False audio_emb_lr: 0.00001 audio_emb_weight_decay: 0.001 text_num_tokens: 39 @@ -247,14 +218,9 @@ attention_type: regularMHA ############################## models ################################ -vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams - source: !ref - savedir: !ref - - -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref - audio_num_tokens: !ref + audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref d_ffn: !ref @@ -273,20 +239,25 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref representation_mode: discrete +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref -# define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -302,7 +273,7 @@ compute_cost: !new:Tokotron.TokotronLoss representation_mode: discrete -lr_annealing: !new:Tokotron.TargetedNoamScheduler +lr_annealing: !new:model.Tokotron.TargetedNoamScheduler lr_initial: [!ref , !ref ] n_warmup_steps: !ref param_group: 0 @@ -314,10 +285,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - 
archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index f5e82c309..1c54128b7 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -8,18 +8,21 @@ experiment_name: tokotron/encodec # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evaluation is done, otherwise skipped. token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" +# Model type +representation_mode: discrete # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/encodec +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -29,16 +32,23 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -53,6 +63,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 @@ -60,7 +71,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -80,8 +91,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -89,20 +100,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref silence_padding: !ref -# Token model (pretrained) -token_model: !new:speechbrain.lobes.models.huggingface_transformers.Encodec - source: !ref - save_path: !ref - bandwidth: !ref - flat_embeddings: True - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -133,20 +137,13 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - ####################### Model parameters ########################### # Transformer d_model: 512 
nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 @@ -155,6 +152,7 @@ audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -168,7 +166,7 @@ attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -188,15 +186,24 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + modules: model: !ref - token_model: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -216,10 +223,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml new file mode 100644 index 000000000..3842caa8f --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -0,0 +1,231 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/mimi + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 2048 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +flatten: False +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + 
save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 103d584ed..cb420591f 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -8,18 +8,23 @@ experiment_name: tokotron/discrete_ssl # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + token_model_src: "fnlp/SpeechTokenizer" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" + +# Model type +representation_mode: discrete # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/st +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -29,16 +34,24 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -53,6 +66,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index @@ -61,7 +75,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -81,8 +95,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -90,7 +104,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -98,14 +112,6 @@ gate_offset: !apply:Tokotron.distance_diff_loss_ramp silence_padding: !ref # Token model (pretrained) -speech_tokenizer: !new:speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - source: !ref - save_path: !ref - -token_model: !new:Tokotron.SpeechTokenizerFeatureExtractor - speech_tokenizer: !ref - codebooks: !ref - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -136,20 +142,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - - ####################### Model parameters ########################### # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 @@ -157,7 +155,8 @@ activation: !name:torch.nn.GELU audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False -audio_emb_pretrained: True +audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -166,12 +165,13 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref audio_tokens_per_step: 2 +flatten: False bandwidth: 1.5 attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -191,15 +191,19 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + modules: model: !ref - token_model: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: 
!ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -219,10 +223,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..6e87dedfe --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -0,0 +1,258 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/sqcodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +transform_audio: !name:model.sq_codec.tokens_to_ternary + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 19683 +audio_emb_size: 36 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 4 +ternary_num_digits: 9 +ternary_num_positions: !ref * +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + audio_emb: !ref + out_proj: !ref + multihead_input: False + inference: !ref + +inference: !new:model.Tokotron.TokotronTransformerAutoregressiveInference + gate_offset: !ref + gate_threshold: !ref + tokens_per_step: !ref + bos_idx: !ref + audio_token_shift: 0 + max_steps: !ref + representation_mode: !ref + transform_audio: !name:model.sq_codec.tokens_to_ternary + feed_audio: !name:model.sq_codec.ternary_logits_to_tokens + +audio_emb: !new:torch.nn.Identity + +out_proj: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + num_positions: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + 
guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + seq_cost: !name:model.sq_codec.ternary_loss + multihead_output: False + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..d3bf9c770 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -0,0 +1,231 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/wavtokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
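A note on the SQ-Codec numbers in train_sqcodec.yaml above: audio_num_tokens is 3**9 = 19683 because the config implies each token id is represented as ternary_num_digits = 9 base-3 digits, and with audio_tokens_per_step = 4 codebooks the TernaryPredictionHead emits ternary_num_positions = 4 * 9 = 36 values per frame, matching audio_emb_size. The snippet below only illustrates this arithmetic; the real conversion lives in model.sq_codec.tokens_to_ternary / ternary_logits_to_tokens, whose exact digit convention (e.g. balanced ternary) may differ.

# Arithmetic behind the SQ-Codec settings above (illustration only).
ternary_num_digits = 9
audio_tokens_per_step = 4

audio_num_tokens = 3 ** ternary_num_digits                           # 19683
ternary_num_positions = ternary_num_digits * audio_tokens_per_step   # 36 == audio_emb_size

def to_base3(token_id, num_digits=ternary_num_digits):
    """Expand a token id into base-3 digits, least-significant digit first."""
    digits = []
    for _ in range(num_digits):
        token_id, digit = divmod(token_id, 3)
        digits.append(digit)
    return digits

print(audio_num_tokens, ternary_num_positions)  # 19683 36
print(to_base3(19682))  # [2, 2, 2, 2, 2, 2, 2, 2, 2], the largest token id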
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 4096 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 1 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py deleted file mode 120000 index 08621a288..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py +++ /dev/null @@ -1 +0,0 @@ -../../../utils/preparation.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 3dddf48dc..161a1fd93 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -22,17 +22,19 @@ import string from pathlib import Path from hyperpyyaml import load_hyperpyyaml -from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import clean_padding, clean_padding_ from speechbrain.utils.distributed import run_on_main -from preparation import add_prepared_features -from audio_tokens import ( + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from model.Tokotron import ( # noqa: E402 get_silence_token, use_silence_padding, feature_pad_to, -) -from Tokotron import RepresentationMode -from evaluate import TokotronEvaluator - + RepresentationMode, +) # noqa: E402 +from evaluate import TokotronEvaluator # noqa: E402 logger = logging.getLogger(__name__) @@ -59,6 +61,9 @@ def __init__( create_waveform_fn=self.create_waveform, device=self.device, ) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) def compute_forward(self, batch, stage): """Runs all the computation of the Tokotron TTS @@ -77,11 +82,13 @@ def compute_forward(self, batch, stage): """ batch = batch.to(self.device) tokens, tokens_length = batch.tokens - audio, audio_length = batch.audio_bos + features = self.prepare_features(batch) + audio, audio_length, _, _ = features emb = None if self.use_spk_emb: emb = {"spk": batch.spk_emb.data.squeeze(1)} + audio = self.transform_audio(audio) predictions = self.modules.model( input_tokens=tokens, input_length=tokens_length, @@ -90,7 +97,69 @@ def compute_forward(self, batch, stage): emb=emb, ) - return predictions + return predictions, features + + def prepare_features(self, batch): + """Prepares features, depending on the configuration + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation + + Returns + ------- + audio_bos : torch.Tensor + Audio features, with BOS + audio_bos_length : torch.Tensor + Relative lengths of the audio features, with BOS + audio_tgt : torch.Tensor + Target audio features (for loss computation) + audio_tgt_length : torch.Tensor + Relative lengths of the target audio features + """ + if self.representation_mode == RepresentationMode.DISCRETE: + audio_bos, audio_bos_length = batch.audio_bos + audio_tgt, audio_tgt_length = batch.audio_pad + if self.audio_token_offsets is not None: + audio_bos = torch.cat( + [ + audio_bos[:, : self.hparams.bos_width], + audio_bos[:, self.hparams.bos_width :] + - self.audio_token_offsets, + ], + dim=1, + ) + clean_padding_(audio_bos, audio_bos_length) + audio_tgt = audio_tgt - self.audio_token_offsets + clean_padding_(audio_tgt, audio_tgt_length) + else: + wav, audio_length = batch.sig + audio = self.modules.ssl_model(wav) + audio = audio[self.hparams.ssl_model_layers, :, :, :].permute( + 1, 2, 0, 3 + ) + batch_size, _, heads, dim = audio.shape + bos = torch.zeros_like(audio[:, :1, :, :]).reshape( + batch_size, self.hparams.bos_width, heads, dim + ) + audio_bos = torch.concatenate([bos, audio], dim=1) + audio_bos_length = 
audio_length * audio.size(1) / audio_bos.size(1) + audio_tgt = audio + audio_tgt_length = audio_length + return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length + + def get_token_offsets(self): + """Computes token offsets for tokenizers that require them""" + token_offsets = None + if self.hparams.audio_token_offsets: + token_offsets = ( + torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + ) + * self.hparams.audio_num_tokens + )[None, None, :] + return token_offsets @torch.no_grad() def evaluate_batch(self, batch, stage): @@ -140,24 +209,27 @@ def compute_objectives(self, predictions, batch, stage): A one-element tensor used for backpropagating the gradient. """ batch = batch.to(self.device) - audio, audio_length = batch.audio_pad + predictions, features = predictions + _, _, audio_tgt, audio_tgt_length = features + + audio_tgt = self.transform_audio(audio_tgt) loss_details = self.hparams.compute_cost( predictions=predictions, - audio=audio, - audio_length=audio_length, + audio=audio_tgt, + audio_length=audio_tgt_length, input_tokens=batch.tokens.data, input_length=batch.tokens.lengths, ) self.loss_metric.append( batch.uttid, predictions=predictions, - audio=audio, - audio_length=audio_length, + audio=audio_tgt, + audio_length=audio_tgt_length, input_tokens=batch.tokens.data, input_length=batch.tokens.lengths, reduction="batch", ) - return loss_details.loss + return loss_details.loss.contiguous() def on_stage_start(self, stage, epoch): """Gets called at the beginning of each epoch. @@ -195,15 +267,25 @@ def on_stage_start(self, stage, epoch): self.use_spk_emb = getattr(self.hparams, "use_spk_emb", False) self.is_evaluating = False - if stage == sb.Stage.VALID: - if self.is_eval_epoch(epoch): + if self.hparams.eval_enabled: + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: self.evaluator.on_evaluate_start(stage, epoch) self.is_evaluating = True - else: - logger.info("No evaluation on epoch %d", epoch) - elif stage == sb.Stage.TEST: - self.evaluator.on_evaluate_start(stage, epoch) - self.is_evaluating = True + + self.audio_token_offsets = self.get_token_offsets() + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) + + self.transform_audio = getattr( + self.hparams, "transform_audio", torch.nn.Identity() + ) def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. @@ -225,6 +307,13 @@ def on_stage_end(self, stage, stage_loss, epoch): if stage == sb.Stage.TRAIN: self.train_stats = stage_stats + # End evaluation and report stats + eval_summary_stats = {} + if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_end() + eval_summary_stats = self.get_summary_stats() + stage_stats.update(eval_summary_stats) + # Perform end-of-iteration things, like annealing, logging, etc. if stage == sb.Stage.VALID: @@ -244,12 +333,61 @@ def on_stage_end(self, stage, stage_loss, epoch): ) # Save the current checkpoint and delete previous checkpoints. 
+ ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } self.checkpointer.save_and_keep_only( - meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs, ) - if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): - self.evaluator.on_evaluate_end() + def get_summary_stats(self): + """Retrieves the stats that needs to be reported on every trial + in the train log, as indicated in eval_summary_log in eval.yaml + + Returns + ------- + eval_summary_stats : dict + A dict with stats""" + eval_summary = self.evaluator.compute_summary() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + self._check_threshold(eval_summary_stats) + return eval_summary_stats + + def _check_threshold(self, eval_summary_stats): + """Checks threshold values for the defined stats and terminates + the trials if the parameters are not met. This is necessary because + some metrics produce bogus high values when the speech samples + do not contain any speech at all (e.g. UTMOS can be above 3 for + silence). + + Classic usage: dWER > 0.9 - treat the whole run as "garbage", set + UTMOS to 0 + + Arguments + --------- + eval_summary_stats : dict + Summary statistics + """ + for key, threshold_value in self.hparams.eval_threshold.items(): + key, threshold_type = key.split("_") + value = eval_summary_stats[key] + if threshold_type == "min": + meets = value >= threshold_value + elif threshold_type == "max": + meets = value <= threshold_value + else: + raise ValueError( + f"Invalid threshold definition: {key}, check eval_threshold" + ) + if not meets: + eval_summary_stats["broken"] = True + for key, value in self.hparams.eval_threshold_set.items(): + eval_summary_stats[key] = value def fit_batch(self, batch): """Fit one batch, override to do multiple updates. 
@@ -281,11 +419,7 @@ def fit_batch(self, batch): def init_optimizers(self): """Custom optimizer initialization """ - representation_mode = getattr( - self.hparams, "representation_mode", RepresentationMode.DISCRETE - ) - representation_mode = RepresentationMode(representation_mode) - if representation_mode == RepresentationMode.CONTINUOUS: + if self.representation_mode == RepresentationMode.CONTINUOUS: audio_emb_params = self.modules.model.decoder.audio_emb.parameters() audio_emb_params_set = set(audio_emb_params) model_params = [ @@ -323,7 +457,19 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - raise NotImplementedError() + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.modules.tokenizer.codec_vocoder.device = self.device + with torch.no_grad(): + if self.audio_token_offsets is not None: + audio = clean_padding(audio + self.audio_token_offsets, length) + wav = self.modules.tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) + wav = clean_padding(wav, length) + wav = wav.to(self.device) + return wav def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed @@ -368,9 +514,7 @@ def dataio_prepare(hparams): the token used for silence """ - representation_mode = RepresentationMode( - hparams.get("representation_mode", RepresentationMode.DISCRETE) - ) + representation_mode = RepresentationMode(hparams["representation_mode"]) # Define datasets from json data manifest file # Define datasets sorted by ascending lengths for efficiency @@ -407,7 +551,7 @@ def audio_ref_pipeline(wav): Arguments --------- - wav : str + wav : strƒnum_ The file path Returns @@ -421,50 +565,50 @@ def audio_ref_pipeline(wav): use_silence_padding = hparams.get("use_silence_padding", True) if representation_mode == RepresentationMode.DISCRETE: - layers_key = "token_model_layers" - model_key = "token_model" - audio_features = "audio_tokens" + model_key = "tokenizer" else: - layers_key = "ssl_model_layers" model_key = "ssl_model" - audio_features = "audio_ssl" - audio_tokens_per_step = ( - len(hparams[layers_key]) - if layers_key in hparams - else hparams["audio_tokens_per_step"] - ) - if use_silence_padding: - silence_token, silence_emb = get_silence_token( + audio_tokens_per_step = hparams["audio_tokens_per_step"] + if ( + use_silence_padding + and representation_mode == RepresentationMode.DISCRETE + ): + silence_token = get_silence_token( hparams[model_key], - extract_emb=representation_mode == RepresentationMode.CONTINUOUS, - model_kwargs=hparams.get("token_model_kwargs"), + num_codebooks=( + hparams["speech_model_layers"] + if "speech_model_layers" in hparams + else audio_tokens_per_step + ), ) + if silence_token.dim() == 2: + silence_token = silence_token.squeeze(-1) else: silence_token = ( torch.ones(hparams["audio_tokens_per_step"], dtype=torch.int64) * hparams["eos_index"] ) - silence_token = silence_token.cpu() - silence_padding = ( - silence_token - if representation_mode == RepresentationMode.DISCRETE - else silence_emb - ) + silence_padding = silence_token.cpu() + silence_padding = silence_padding[:audio_tokens_per_step] silence_padding_len = int(math.ceil(hparams["silence_padding"])) bos_width = hparams.get("bos_width", 1) audio_bos_prefix = ( torch.ones(bos_width, audio_tokens_per_step) * hparams["bos_index"] ) - if representation_mode == RepresentationMode.CONTINUOUS: - audio_bos_prefix = 
audio_bos_prefix.unsqueeze(-1).repeat( - 1, 1, hparams["audio_dim"] - ) - @sb.utils.data_pipeline.takes(audio_features) + tokens_loader = hparams.get("tokens_loader") + if "speech_model_layers" in hparams: + tokens_loader_kwargs = { + "num_codebooks": get_selected_layer_indexes(hparams) + } + else: + tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} + + @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") - def audio_pipeline(audio): - audio = torch.from_numpy(audio) + def audio_pipeline(id): + audio = tokens_loader.tokens_by_uttid(id, **tokens_loader_kwargs) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -480,21 +624,20 @@ def audio_pipeline(audio): ] init_sequence_encoder(hparams) - use_spk_emb = hparams.get("use_spk_emb", False) - prepared_features = [audio_features] output_keys = [ "uttid", "tokens", - "audio_pad", - "audio_bos", "label_norm_eval", ] - if use_spk_emb: - prepared_features.append("spk_emb") - output_keys.append("spk_emb") + if representation_mode == RepresentationMode.DISCRETE: + output_keys += [ + "audio_pad", + "audio_bos", + ] + else: + output_keys.append("sig") eval_output_keys = [*output_keys, "sig"] - for dataset in data_info: if dataset == "train": dataset_output_keys = output_keys @@ -508,16 +651,25 @@ def audio_pipeline(audio): output_keys=dataset_output_keys, ) - add_prepared_features( - dataset=dynamic_dataset, - save_path=Path(hparams["prepare_save_folder"]) / "features", - id_key="uttid", - features=prepared_features, - ) - datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + sort_datasets(datasets, hparams) + apply_data_scale(datasets, hparams) + + return datasets, silence_padding + + +def sort_datasets(datasets, hparams): + """Sorts datasets according to hyperparameters + + Arguments + --------- + datasets : dict + a key -> value dictionary of datasets (the keys are "train", "valid" and "test") + hparams : dict + a dictionary of hyperparameters + """ # Sorting training data with ascending order makes the code much # faster because we minimize zero-padding. In most of the cases, this # does not harm the performance. 
@@ -532,56 +684,31 @@ def audio_pipeline(audio): hparams["train_dataloader_opts"]["shuffle"] = False elif hparams["sorting"] == "random": - hparams["train_dataloader_opts"]["shuffle"] = True - pass - + if not hparams["overfit_test"]: + hparams["train_dataloader_opts"]["shuffle"] = True else: raise NotImplementedError( "sorting must be random, ascending or descending" ) - datasets["sample"] = select_sample(hparams, datasets) - return datasets, silence_padding - -def select_sample(hparams, datasets): - """Selects a sample of files for sample generation, freezing the sample if - requested to persist across multiple experiments +def apply_data_scale(datasets, hparams): + """Selects a fractional dataset if the corresponding parameter is specified, + using random sampling Arguments --------- - hparams : dict - experiment hyperparameters datasets : dict a dictionary of datasets - - Returns - ------- - dataset : speechbrain.dataio.dataset.FilteredSortedDynamicItemDataset - the sample dataset + hparams : dict + parsed hyperparameters """ - sample_path = hparams.get("sample_path") - dataset = None - if sample_path is not None: - sample_path = Path(sample_path) - if sample_path.exists(): - with open(sample_path, "r") as sample_file: - data_ids = [line.strip() for line in sample_file] - dataset = FilteredSortedDynamicItemDataset( - datasets["valid"], data_ids - ) - - if dataset is None: - dataset = ( - datasets["valid"] - .batch_shuffle(1) - .filtered_sorted(select_n=hparams["num_audio_samples"]) + data_scale = hparams.get("data_scale") + if data_scale: + scaled_data_count = int(len(datasets["train"]) * data_scale) + datasets["train"] = datasets["train"].filtered_sorted( + select_n=scaled_data_count ) - if sample_path is not None: - with open(sample_path, "w") as sample_file: - for data_id in dataset.data_ids: - print(data_id, file=sample_file) - return dataset def init_sequence_encoder(hparams): @@ -611,6 +738,22 @@ def init_sequence_encoder(hparams): return encoder +def get_selected_layer_indexes(hparams): + """Finds the layers of selected layers + + Arguments + --------- + hparams : dict + Hyperparameters + """ + selected_layers = hparams.get("speech_model_layers") + available_layers = hparams.get("available_speech_model_layers") + if not (selected_layers and available_layers): + return None + layer_idx = [available_layers.index(layer) for layer in selected_layers] + return layer_idx + + def read_token_list(file_name): """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed one per line @@ -625,7 +768,10 @@ def read_token_list(file_name): result: list a list of tokens """ - if not Path(file_name).exists(): + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): raise ValueError(f"Token file {file_name} not found") with open(file_name) as token_file: return [line.strip("\r\n") for line in token_file if line] @@ -667,17 +813,23 @@ def apply_overfit_test(hparams, dataset): """ if hparams["overfit_test"]: if isinstance(dataset, tuple): - dataset_train, _, _ = dataset + dataset_train, dataset_valid, _ = dataset dataset_train = apply_overfit_test(hparams, dataset_train) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) result = { "train": dataset_train, "valid": dataset_eval, @@ -699,7 +851,7 @@ def apply_overfit_test(hparams, dataset): ) -def run_experiment(brain_cls): +if __name__ == "__main__": # Reading command line arguments hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) @@ -712,6 +864,8 @@ def run_experiment(brain_cls): # Load evaluation hyperparameters eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if not eval_hparams_file.exists(): + eval_hparams_file = Path(__file__).parent / "hparams" / "eval.yaml" if eval_hparams_file.exists(): logger.info( "Using evaluation hyperparameters from %s", eval_hparams_file @@ -736,40 +890,23 @@ def run_experiment(brain_cls): from ljspeech_prepare import prepare_ljspeech # Data preparation, to be run on only one process. 
- representation_mode = RepresentationMode( - hparams.get("representation_mode", RepresentationMode.DISCRETE) - ) - audio_features = ( - "audio_tokens" - if representation_mode == RepresentationMode.DISCRETE - else "audio_ssl" - ) - extract_features = [audio_features] - if hparams.get("use_spk_emb", False): - extract_features.append("spk_emb") - if not hparams["skip_prep"]: - with hparams["freezer"]: - run_on_main( - prepare_ljspeech, - kwargs={ - "data_folder": hparams["data_folder"], - "save_folder": hparams["prepare_save_folder"], - "splits": hparams["splits"], - "split_ratio": hparams["split_ratio"], - "seed": hparams["seed"], - "extract_features": extract_features, - "extract_features_opts": hparams["extract_features_opts"], - "extract_phonemes": hparams["input"] == "phonemes", - "model_name": "tokotron", - "g2p_src": hparams["g2p_src"], - "skip_ignore_folders": hparams[ - "prepare_skip_ignore_folders" - ], - "frozen_split_path": hparams.get("frozen_split_path"), - "device": run_opts.get("device", "cpu"), - }, - ) + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["prepare_save_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "extract_phonemes": hparams["input"] == "phonemes", + "model_name": "tokotron", + "g2p_src": hparams["g2p_src"], + "skip_ignore_folders": hparams["prepare_skip_ignore_folders"], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) # We can now directly create the datasets for training, valid, and test datasets, silence_padding = dataio_prepare(hparams) @@ -779,39 +916,71 @@ def run_experiment(brain_cls): audio_keys = ["audio_pad", "audio_bos"] # Trainer initialization - tts_brain = brain_cls( + tts_brain = TokotronBrain( modules=hparams["modules"], opt_class=hparams["opt_class"], hparams=hparams, run_opts=run_opts, checkpointer=hparams["checkpointer"], ) - tts_brain.sample_data = datasets["sample"] # The `fit()` method iterates the training loop, calling the methods # necessary to update the parameters of the model. Since all objects # with changing state are managed by the Checkpointer, training can be # stopped at any point, and will be resumed on next call. 
+ + dataloader_opts = [ + hparams[f"{key}_dataloader_opts"] for key in ["train", "valid", "test"] + ] + representation_mode = RepresentationMode(hparams["representation_mode"]) + if representation_mode == RepresentationMode.DISCRETE: + dataloader_opts = [ + use_silence_padding(opts, silence_padding, audio_keys) + for opts in dataloader_opts + ] + ( + train_dataloader_opts, + valid_dataloader_opts, + test_dataloader_opts, + ) = dataloader_opts + tts_brain.fit( tts_brain.hparams.epoch_counter, datasets["train"], datasets["valid"], - train_loader_kwargs=use_silence_padding( - hparams["train_dataloader_opts"], silence_padding, audio_keys - ), - valid_loader_kwargs=use_silence_padding( - hparams["valid_dataloader_opts"], silence_padding, audio_keys - ), + train_loader_kwargs=train_dataloader_opts, + valid_loader_kwargs=valid_dataloader_opts, ) # Load best checkpoint for evaluation - tts_brain.evaluate( - test_set=datasets["test"], - min_key="loss", - test_loader_kwargs=use_silence_padding( - hparams["test_dataloader_opts"], silence_padding, audio_keys - ), - ) + if hparams["testing"]: + test_summary_file = ( + Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + ) + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_summary_file = ( + Path(hparams["output_folder"]) + / "eval" + / "test" + / "summary.json" + ) + if test_summary_file.exists(): + logging.info( + "Test run already completed: %s", test_summary_file + ) + else: + eval_kwargs = {} + test_key_kind = hparams.get("test_key_kind", "min") + test_key = hparams.get("test_key") + if test_key: + eval_kwargs = {f"{test_key_kind}_key": test_key} + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs, + ) # Save final checkpoint (fixed name) tts_brain.checkpointer.save_checkpoint(name="latest") diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py deleted file mode 100644 index f3495eaca..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Continuous SSL verfsion - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronContinuousSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.vocoder(audio) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronContinuousSSLBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py deleted file mode 100644 index d0bc9f4f7..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py +++ /dev/null @@ -1,45 +0,0 
@@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - DAC version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDACBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - z, _, _ = self.modules.dac.quantizer.from_codes( - audio.transpose(1, 2).int() - ) - wav = self.modules.dac.decode(z).squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronDACBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py deleted file mode 100644 index f9fc764cd..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Discrete SSL version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -import torch -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDiscreteSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def on_stage_start(self, stage, epoch): - self.compute_offset() - return super().on_stage_start(stage, epoch) - - def compute_offset(self): - """Computes per-layer offsets""" - layers_set = set(self.hparams.token_model_layers) - available_layers_set = set(self.hparams.vocoder_available_layers) - if not layers_set.issubset(available_layers_set): - unavailable_layers = ",".join( - str(layer) for layer in (layers_set - available_layers_set) - ) - raise ValueError(f"Layers {unavailable_layers} are not supported") - self.num_units = self.hparams.audio_num_tokens - _, layers_idx = torch.where( - torch.tensor( - self.hparams.vocoder_available_layers, device=self.device - ).unsqueeze(0) - == torch.tensor( - self.hparams.token_model_layers, device=self.device - ).unsqueeze(1) - ) - self.layer_offset = ( - torch.tensor(layers_idx, device=self.device) * self.num_units - )[None, None, :] - self.offset = self.hparams.token_offset - self.modules.vocoder.tokenize = False - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - units_with_offset = ( - audio + self.layer_offset.to(audio.device) + self.offset - ) - wav = self.modules.vocoder(units_with_offset) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == 
"__main__": - run_experiment(TokotronDiscreteSSLBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py deleted file mode 100644 index 2168f970d..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronEncodecBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.token_model.decode(audio) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronEncodecBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py deleted file mode 100644 index bc51db78c..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronSTBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.token_model.decode(audio) - if length is not None: - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronSTBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py new file mode 100644 index 000000000..9700e8363 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py @@ -0,0 +1,366 @@ +"""TTS evaluation tools + +Authors + * Artem Ploujnikov 2024 +""" +import json +import torch +import logging +import re +import csv +from speechbrain.utils.metric_stats import MetricStats +from types import SimpleNamespace +from pathlib import Path +from utils.data import undo_batch +from torch import nn + + +logger = logging.getLogger(__name__) + + +class SpeechEvaluationMetricStats(MetricStats): + """An aggregate metric combining multiple speech evaluators + + Arguments + --------- + hparams : dict | 
SimpleNamespace | object + Raw hyperparameters for evaluation + + device : str + The device on which evaluation will be performed + + """ + + def __init__(self, hparams, device="cpu"): + if isinstance(hparams, dict): + hparams = SimpleNamespace(**hparams) + self.hparams = hparams + self.device = device + modules = self.hparams.modules + self.modules = nn.ModuleDict(modules).to(self.device) + self.enabled_evaluators = set(self.hparams.evaluations.split(",")) + evaluators = hparams.evaluators + if evaluators: + self.evaluators = { + key: evaluator_f(run_opts={"device": device}) + for key, evaluator_f in evaluators.items() + if key in self.enabled_evaluators + } + else: + self.evaluators = {} + + if not self.evaluators: + logger.warn( + "No evaluators were defined - this run will produce samples only" + ) + + self.attention = [] + + def on_evaluation_start(self, output_folder="eval"): + """Invoked at the beginning of the evaluation cycle. + + Arguments + --------- + output_folder : str | path-like + The folder to which results will be output + + """ + logger.info("Starting evaluation") + output_folder = Path(output_folder) + self.output_folder = ( + output_folder + if output_folder.is_absolute() + else self.hparams.output_folder / output_folder + ) + self.output_folder.mkdir(parents=True, exist_ok=True) + + self.files = [] + details_keys = list(self.evaluators.keys()) + self.details = {evaluator_key: [] for evaluator_key in details_keys} + self.read_reports() + self.create_reports() + self.item_ids = [] + for evaluator_key in self.enabled_evaluators: + self.evaluators[evaluator_key].on_evaluation_start() + + def on_evaluation_end(self): + """Invoked at the beginning of the evaluation cycle. The default + implementation is a no-op + """ + logger.info("Ending evaluation") + self.write_summary() + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + self.files.append(file_name) + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder / f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = { + key: handle_number(value) + for key, value in row.items() + } + self.details[evaluator_key].append(row) + + def get_tracker_file_name(self): + """Determines the file name of the tracker file""" + suffix = ( + f"_{self.hparams.eval_suffix}" if self.hparams.eval_suffix else "" + ) + file_name = f"tracker_{self.hparams.eval_dataset}{suffix}.txt" + return self.output_folder / file_name + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) + evaluator 
= self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + wavs_ref=bogus_wavs, + length_ref=bogus_length, + ) + + return ["uttid"] + list(result.details.keys()) + + def append(self, ids, wav, length, text, wav_ref, length_ref): + """Appends the result of a single item + + Arguments + --------- + ids : str + Utterance IDs + wav : torch.Tensor + Synthesized waveforms + length : torch.Tensor + Relative lengths of the synthesized waveforms + text : list + Ground truth text + wav_ref : torch.Tensor + Reference (ground truth) waveforms + length_ref : torch.Tensor + Reference lengths + """ + with torch.no_grad(): + self.item_ids.extend(ids) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=text, + wavs_ref=wav_ref, + length_ref=length_ref, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, ids, details) + self.details[evaluator_key].extend(details) + + def write_result(self, evaluator_key, ids, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + ids : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(ids, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def write_summary(self, file_name=None): + """Outputs summarized statistics + + Arguments + --------- + file_name : str | path-like + An alternative path to save the file + """ + summary = self.summarize() + if file_name is None: + file_name = self.output_folder / "summary.json" + self.files.append(file_name) + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def summarize(self, field=None): + """Computes the summarized statistics + + Arguments + --------- + field : str, optional + If specified, it will return a specific field + + Returns + ------- + result : dict | float + The summary - or the specified field from the sum + """ + result = { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], key=metric_key, + ).items() + } + if field is not None: + result = result[field] + return result + + def clear(self): + """Deletes all the files that have been created""" + for file_name in self.files: + file_name.unlink() + + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + """Removes any non-ASCII characters from a dictionary + + Arguments + --------- + values : dict + A dictionary of values + + Returns + ------- + result : dict + The same dictionary - but with non-ASCII strings removed""" + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + The key of the 
metric for which the statistics will be computed + + Returns + ------- + statistics : dict + The desccriptive statistics computed + _mean : the arithmetic mean + _std : the standard deviation + _min : the minimum value + _max : the maximum value + _median : the median value + _q1 : the first quartile + _q3 : the third quartile + _iqr : the interquartile ratio + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable. Strings + that look like integers or floats will be converted to integers + or floats. + + Arguments + --------- + value : str + a string value + + Returns + ------- + result : object + The processed result""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? 
+ \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml new file mode 100644 index 000000000..08587ce23 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml @@ -0,0 +1,42 @@ +eval_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_asr_type: whisper +eval_asr_source: openai/whisper-small +evaluations: utmos,asr +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_asr: !name:utils.eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +evaluators: + utmos: !ref + asr: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..541fc2917 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -0,0 +1,269 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/discrete_ssl +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +ssl_model_type: wavlm +output_folder: !ref results/// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
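+# Illustrative launch (assumes a train.py entry point in this recipe folder;
+# all paths below are examples). Every !PLACEHOLDER value must be overridden
+# on the command line, e.g.:
+#   python train.py hparams/train_discrete_ssl.yaml \
+#       --run_name=discrete_ssl_baseline \
+#       --data_folder=/path/to/LJSpeech-1.1 \
+#       --cached_data_folder=/path/to/cache \
+#       --tokens_folder=/path/to/precomputed_tokens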
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +vocoder_model_name: !ref unithifigan-dasb--discrete +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + 
shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1000 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 6 +flatten: False + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref / + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml new file mode 100644 index 000000000..bfc8c58e4 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -0,0 +1,240 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/encodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: 
!ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_top_k: 20 +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +freeze_lm_head: False + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: 
--num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: False +bandwidth: 6 + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + top_k: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..8a1e65e2c --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -0,0 +1,243 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/espnet-encodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_top_k: 20 +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: 
!apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: False +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml + +freeze_lm_head: True + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + top_k: !ref + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml new file mode 100644 index 000000000..21d95dbf9 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -0,0 +1,237 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/mimi +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
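+# Note: tokens_folder (below) is expected to contain pre-extracted Mimi tokens
+# read through utils.tokens.TokensLoader, which looks tokens up per utterance
+# id (the same pattern used by the Tokotron recipe); token extraction is
+# assumed to happen in a separate preparation step.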
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 2048 +text_num_tokens: 39 +phn_num_tokens: 
52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: False +bandwidth: 6 + + +freeze_lm_head: False + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..d36a0cff0 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -0,0 +1,270 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/sqcodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
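+
+# NOTE: unlike the other VALL-E recipes, this one predicts SQ-Codec tokens with
+# a ternary prediction head instead of a plain softmax over the codec
+# vocabulary (see `pred_mode: ternary`, `lm_head`, `emb` and the ternary
+# `compute_cost` choice in the model section below).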
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: False +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +top_k: 20 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +ternary_d_hidden: 512 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: 
--num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +target_dropout: 0.5 +vocab_size: 19683 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !ref * 2 + +audio_token_shift: 19683 + +audio_tokens_per_step: 4 +flatten: True +ternary_num_digits: 10 +pred_mode: ternary +freeze_lm_head: False + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: 1 + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + target_dropout: !ref + share_emb: !ref + qk_norm: !ref + emb: !ref + lm_head: !ref + logits_to_probs: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: 1 + top_k: !ref + +lm_head: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + d_hidden: !ref + num_positions: !ref * + tokens: null + +logits_to_probs: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryLogitTokenizer + num_tokens: !ref + num_positions: !ref + tokens: !new:torch.nn.Identity + + +emb: !new:speechbrain.nnet.containers.Sequential + ternary: !new:model.sq_codec.TernaryEmbedding + num_digits: !ref + flat: True + linear: !new:speechbrain.nnet.linear.Linear + input_size: !ref * + n_neurons: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !name:model.sq_codec.ternary_loss + targets_type: tokens + num_positions: !ref + tokens: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..730eb08a5 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -0,0 +1,240 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/wavtokenizer +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
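+
+# NOTE: WavTokenizer uses a single codebook (audio_tokens_per_step: 1 and
+# vocab_size: 4096 below), so only the autoregressive branch of VALL-E is
+# trained for this recipe (see `apply_curriculum` in train.py).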
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +freeze_lm_head: False + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 
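+
+# Vocabulary layout (see the choice blocks below): the model vocabulary packs
+# the text/phoneme tokens, the special tokens and one copy of the codec
+# vocabulary per track, and audio tokens are shifted past the input tokens.
+# A worked example with the defaults in this file (text input):
+#   audio_token_shift = 39 + 4          = 43
+#   model_vocab_size  = 39 + 4096*1 + 4 = 4139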
+num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 4096 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 1 +bandwidth: 6 + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py new file mode 120000 index 000000000..2f703273c --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py @@ -0,0 +1 @@ +../../ljspeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py new file mode 100644 index 000000000..899c0f159 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -0,0 +1,1025 @@ +#!/usr/bin/env/python3 +"""Recipe for training VALL-E + +Based on ESPNET VALL-E + +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import os +import torch +import sys +import shutil +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataio import ( + clean_padding, + length_to_mask, + write_audio, +) +from speechbrain.utils.distributed import run_on_main +from speechbrain.utils.data_utils import batch_pad_right +import re +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats # noqa: E402 + +logger = logging.getLogger(__name__) + + +# Brain class for speech recognition training +class VALLEBrain(sb.Brain): + """Class that manages the training loop. 
See speechbrain.core.Brain."""
+
+    def __init__(
+        self,
+        modules=None,
+        opt_class=None,
+        hparams=None,
+        run_opts=None,
+        checkpointer=None,
+    ):
+        super().__init__(
+            modules, opt_class, hparams, run_opts, checkpointer,
+        )
+        self.evaluation_metric = SpeechEvaluationMetricStats(
+            self.hparams, self.device
+        )
+
+    def create_waveform(self, audio, length):
+        """Creates a waveform from a discrete or continuous audio
+        representation
+
+        Arguments
+        ---------
+        audio : torch.Tensor
+            An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features)
+        length : torch.Tensor
+            A 1-D tensor of relative lengths
+
+        Returns
+        -------
+        wav : torch.Tensor
+            The reconstructed waveform
+        """
+        tokenizer = (
+            self.modules.tokenizer.module
+            if hasattr(self.modules.tokenizer, "module")
+            else self.modules.tokenizer
+        )
+        tokenizer.device = self.device
+        if hasattr(tokenizer, "codec_vocoder"):
+            tokenizer.codec_vocoder.to(self.device)
+            tokenizer.codec_vocoder.device = self.device
+        wav = tokenizer.tokens_to_sig(audio)
+        wav = clean_padding(wav, length)
+        wav = wav.to(self.device)
+        return wav
+
+    def compute_forward(self, batch, stage):
+        """Runs the forward computation of the VALL-E model
+
+        Arguments
+        ---------
+        batch : PaddedBatch
+            This batch object contains all the relevant tensors for computation.
+        stage : sb.Stage
+            One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
+
+        Returns
+        -------
+        predictions : tuple
+            A (logits_ar, logits_nar, nar_track) tuple with the autoregressive
+            and non-autoregressive predictions and the sampled NAR track
+        """
+        batch = batch.to(self.device)
+        prompt, prompt_length = batch.prompt
+        batch_size, prompt_max_len, num_tracks = prompt.shape
+        if self.train_nar:
+            nar_track = torch.randint(
+                1, num_tracks, (batch_size,), device=self.device
+            )
+        else:
+            nar_track = None
+        logits_ar, logits_nar = self.modules.model(
+            dec_seq=batch.prompt.data,
+            dec_seq_lengths=batch.prompt.lengths,
+            prefix_len=batch.prefix_length / prompt_max_len,
+            nar_level_idx=nar_track,
+            predict_ar=self.train_ar,
+            predict_nar=self.train_nar,
+        )
+        return logits_ar, logits_nar, nar_track
+
+    def compute_objectives(self, predictions, batch, stage):
+        """Computes the loss given the predicted and targeted outputs. The loss
+        is the mean of the autoregressive and non-autoregressive losses
+        (computed with `compute_cost`), depending on which of the two branches
+        are being trained.
+
+        Arguments
+        ---------
+        predictions : tuple
+            The (logits_ar, logits_nar, nar_track) tuple from `compute_forward`.
+        batch : PaddedBatch
+            This batch object contains all the relevant tensors for computation.
+        stage : sb.Stage
+            One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST.
+
+        Returns
+        -------
+        loss : torch.Tensor
+            A one-element tensor used for backpropagating the gradient.
+ """ + batch = batch.to(self.device) + + logits_ar, logits_nar, nar_track = predictions + prompt, prompt_length = batch.prompt + prefix_length = batch.prefix_length + + batch_size, prompt_max_len, _ = prompt.shape + batch_idx = torch.arange(batch_size, device=prompt.device) + length_mask = length_to_mask( + prompt_length * prompt_max_len, prompt_max_len + ) + prefix_mask = length_to_mask( + prefix_length, prompt_max_len + ).logical_not() + mask = (length_mask * prefix_mask)[:, 1:] + + loss_components = [] + + if self.train_ar: + logits_ar_sm = self.hparams.log_softmax(logits_ar) + if self.hparams.flatten: + targets_ar = prompt[:, 1:] + else: + targets_ar = prompt[:, 1:, 0] + loss_ar = self.hparams.compute_cost( + logits_ar_sm, targets=targets_ar, mask=mask + ) + loss_components.append(loss_ar) + else: + logits_ar_sm, targets_ar = None, None + if self.train_nar: + logits_nar_sm = self.hparams.log_softmax(logits_nar) + targets_nar = prompt[batch_idx, 1:, nar_track] + loss_nar = self.hparams.compute_cost( + logits_nar_sm, targets=targets_nar, mask=mask, + ) + loss_components.append(loss_nar) + else: + logits_nar_sm, targets_nar = None, None + + self.loss_metric.append( + ids=batch.uttid, + logits_ar=logits_ar_sm, + targets_ar=targets_ar, + logits_nar=logits_nar_sm, + targets_nar=targets_nar, + mask=mask, + reduction="batch", + ) + + loss = torch.mean(torch.stack(loss_components)) + return loss + + def compute_loss_stats( + self, + logits_ar, + targets_ar, + logits_nar, + targets_nar, + mask, + reduction="batch", + ): + """Computes an autoregressive/non-autoregressive loss breakdown, + to be used for metrics/stats + + Arguments + --------- + logits_ar : torch.Tensor + The autoregressive predictions + targets_ar : torch.Tensor + The targets for autoregressive predictions + logits_nar : torch.Tensor + The non-autoregressive predictions + targets_nar : torch.Tensor + The targets for non-autoregressive prediction + + Returns + ------- + stats: dict + statistics + """ + stats = {} + if self.train_ar: + stats["loss_ar"] = self.hparams.compute_cost( + logits_ar, targets=targets_ar, mask=mask, reduction=reduction, + ) + if self.train_nar: + stats["loss_nar"] = self.hparams.compute_cost( + logits_nar, targets=targets_nar, mask=mask, reduction=reduction, + ) + return stats + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
+ """ + self.offsets = get_offsets( + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) + + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.compute_loss_stats, batch_eval=True, + ) + self.apply_curriculum() + + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + self.transform_audio = getattr(self.hparams, "transform_audio", None) + + def apply_curriculum(self): + """Applies curriculum settings, if specified, training only the autoregressive part - or + only the non-autoregressive part""" + epoch = self.hparams.epoch_counter.current + self.train_ar, self.train_nar = True, True + lm_head = ( + self.modules.model.module.lm_head + if hasattr(self.modules.model, "module") + else self.modules.model.lm_head + ) + lm_head.requires_grad_(True) + if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: + # NOTE: If there is only one track it's autoregressive + self.train_nar = False + elif ( + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ): + self.train_nar = False + elif self.hparams.number_of_epochs_nar is not None and epoch <= ( + self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar + ): + self.train_ar = False + if self.hparams.freeze_lm_head: + lm_head.requires_grad_(False) + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + # NOTE: Need to get past AR-only training to be able to evaluate + can_evaluate = not ( + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ) + return can_evaluate and (epoch % self.hparams.eval_interval == 0) + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. + self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. 
+ + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + with torch.no_grad(): + audio_tokens, audio_length = self.inference(batch) + if self.hparams.flip_layers: + audio_tokens = audio_tokens.flip(2) + wav = self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.save_samples( + batch=batch, wav=wav, length=audio_length, stage=stage + ) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_evaluating: + self.evaluation_metric.on_evaluation_end() + self.save_eval(stage) + eval_summary = self.evaluation_metric.summarize() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + else: + eval_summary_stats = {} + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. 
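+            # `ckpt_key_kind` selects the keyword argument passed to
+            # save_and_keep_only: with the defaults (ckpt_key: dwer,
+            # ckpt_key_kind: min) this expands to min_keys=["dwer"], so the
+            # checkpoints with the lowest dWER are kept.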
+ ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } + self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs, + ) + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + + def inference(self, batch): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = ( + self.modules.model.module.inference + if hasattr(self.modules.model, "module") + else self.modules.model.inference + ) + inference_results = [ + inference( + prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() + ) + for prefix_item in prefix_items + ] + inferred_tokens = [ + result[0][0] + if result[0] + else torch.zeros(1000, self.hparams.audio_tokens_per_step) + for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + offsets = self.offsets + if self.hparams.flip_layers: + offsets = offsets.flip(2) + audio = (audio - self.hparams.audio_token_shift - offsets).clip(0) + return audio, audio_length + + def _get_inference_opts(self): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) + track_start = ( + self.hparams.audio_token_shift + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True + return self.hparams.inference_opts( + masks={self.hparams.bos_index: mask}, device=self.device, + ) + + def save_samples(self, batch, wav, length, stage): + """Saves audio samples + + Arguments + --------- + batch : PaddedBatch + An audio batch + wav : torch.Tensor + Generated audio + length : torch.Tensor + Relative lengths + stage : speechbrain.Stage + The training stage + """ + output_folder = self._get_eval_output_folder(stage) + samples = undo_padding_tensor(wav, length) + for uttid, sample in zip(batch.uttid, samples): + file_name = output_folder / f"pred_{uttid}.wav" + write_audio( + file_name, sample.detach().cpu(), self.hparams.model_sample_rate + ) + + def save_eval(self, stage): + """Saves evaluation results + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. 
+ """ + output_folder = self._get_eval_output_folder(stage) + for src_file_name in self.evaluation_metric.files: + dest_file_name = output_folder / src_file_name.name + shutil.copyfile(src_file_name, dest_file_name) + self.evaluation_metric.clear() + + def _get_eval_output_folder(self, stage): + epoch = self.hparams.epoch_counter.current + output_folder = ( + Path(self.hparams.output_folder) / "eval" / stage.name.lower() + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(exist_ok=True, parents=True) + return output_folder + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phonemes"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. + silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + offsets = get_offsets( + hparams["vocab_size"], hparams["audio_tokens_per_step"] + ).unsqueeze(0) + if not hparams["use_token_offsets"]: + offsets = torch.zeros_like(offsets) + if hparams["flip_layers"]: + offsets = offsets.flip(-1) + + tokens_loader = hparams.get("tokens_loader") + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label_norm + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + @sb.utils.data_pipeline.takes("uttid", "tokens") + @sb.utils.data_pipeline.provides( + "audio", "prefix", "prompt", "prefix_length", "length" + ) + def prompt_pipeline(id, tokens): + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=hparams["audio_tokens_per_step"] + ) + if hparams["flip_layers"]: + audio = audio.flip(-1) + yield audio + num_tracks = audio.size(1) + prefix = torch.cat( + [ + torch.ones(1, num_tracks) * hparams["bos_index"], + tokens.unsqueeze(-1).expand(len(tokens), num_tracks), + torch.ones(1, num_tracks) * hparams["eot_index"], + ] + ) + yield prefix + prompt = torch.cat( + [ + prefix, + torch.ones(1, num_tracks) * hparams["bos_index"], + audio + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eos_index"], + ] + ).int() + yield prompt + yield len(prefix) + yield len(prompt) + + @sb.utils.data_pipeline.takes("wav") + 
@sb.utils.data_pipeline.provides("sig") + def sig_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + return sig + + dynamic_items = [ + sig_pipeline, + text_pipeline, + tokens_pipeline, + prompt_pipeline, + ] + + init_sequence_encoder(hparams) + use_spk_emb = hparams.get("use_spk_emb", False) + prepared_features = ["audio_tokens"] + output_keys = [ + "uttid", + "tokens", + "label_norm", + "audio", + "prompt", + "prefix_length", + "length", + ] + if use_spk_emb: + prepared_features.append("spk_emb") + output_keys.append("spk_emb") + + for dataset in data_info: + dataset_dynamic_items = list(dynamic_items) + dataset_output_keys = list(output_keys) + if dataset != "train": + dataset_output_keys += ["sig", "label_norm_eval", "prefix"] + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dataset_dynamic_items, + output_keys=dataset_output_keys, + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. + if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + data_scale = hparams.get("data_scale") + if data_scale: + scaled_data_count = int(len(datasets["train"]) * data_scale) + datasets["train"] = datasets["train"].filtered_sorted( + select_n=scaled_data_count + ) + return datasets + + +def get_offsets(vocab_size, tracks): + """Adds offsets to each track to treat the tokens as distinct + + Arguments + --------- + vocab_size : int + The vocabulary size, for each track + tracks : int + The number of tracks + """ + return torch.arange(tracks) * vocab_size + + +def apply_mem_fraction(): + """Applies the memory fraction, based on environment variables, useful for cases where + multiple experiments share a large GPU""" + if not torch.cuda.is_available(): + return + mem_fraction = os.environ.get("SB_CUDA_MEM_FRACTION") + if mem_fraction: + fraction, device = mem_fraction.split(":") + fraction, device = float(fraction), int(device) + logger.info("Using %f of GPU %f", fraction, device) + torch.cuda.set_per_process_memory_fraction(fraction, device) + + +def init_sequence_encoder(hparams): + """Initialize a sequence encoder + + Arguments + --------- + hparams: dict + parsed hyperparameters + prefix: str + the prefix to be prepended to hyperparameter keys, per the naming + convention + + {prefix}_label_encoder: the hparams key for the label encoder + {prefix}_list_file: the hparams key for the list file + + Returns + ------- + encoder: speechbrain.dataio.encoder.TextEncoder + an encoder instance""" + encoder = hparams["label_encoder"] + token_list_file_name = hparams["token_list_file"] + tokens = read_token_list(token_list_file_name) + encoder.add_unk() + for token in hparams["special_tokens"]: + token_key = token.replace("<", "").replace(">", "") + token_index = 
hparams[f"{token_key}_index"] + encoder.insert_label(token, token_index) + encoder.update_from_iterable(tokens, sequence_input=False) + encoder.expect_len(len(tokens) + hparams["special_num_tokens"]) + return encoder + + +def read_token_list(file_name): + """Reads a simple text file with tokens (e.g. characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. + + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, dataset_valid, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +def undo_padding_tensor(batch, lengths): + """Produces Python lists given a batch of sentences with + their corresponding relative lengths. + + Arguments + --------- + batch : torch.Tensor + Batch of sentences gathered in a batch. + lengths : torch.Tensor + Relative length of each sentence in the batch. + + Returns + ------- + as_list : list + A python list of the corresponding input tensor. 
+ + Example + ------- + >>> batch=torch.rand([4,100]) + >>> lengths=torch.tensor([0.5,0.6,0.7,1.0]) + >>> snt_list=undo_padding(batch, lengths) + >>> len(snt_list) + 4 + """ + batch_max_len = batch.shape[1] + as_list = [] + for seq, seq_length in zip(batch, lengths): + actual_size = int(torch.round(seq_length * batch_max_len)) + seq_true = seq.narrow(0, 0, actual_size) + as_list.append(seq_true) + return as_list + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Applies the memory fraction for a shared GPU + apply_mem_fraction() + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + from ljspeech_prepare import prepare_ljspeech + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["prepare_save_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "extract_phonemes": hparams["input"] == "phonemes", + "model_name": "tokotron", + "g2p_src": hparams["g2p_src"], + "skip_ignore_folders": hparams["prepare_skip_ignore_folders"], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) + + # We can now directly create the datasets for training, valid, and test + datasets = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_tokens"] + + # Trainer initialization + tts_brain = VALLEBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. 
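+    # Example launch command (illustrative paths; every !PLACEHOLDER entry in
+    # the hparams file must be overridden on the command line):
+    #   python train.py hparams/train_wavtokenizer.yaml \
+    #       --data_folder=/path/to/LJSpeech-1.1 \
+    #       --cached_data_folder=/path/to/cache \
+    #       --tokens_folder=/path/to/extracted/ljspeech \
+    #       --run_name=baseline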
+ tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Load best checkpoint for evaluation + if hparams["testing"]: + test_summary_file = ( + Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + ) + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = {f"{test_key_kind}_key": test_key} + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs, + ) diff --git a/benchmarks/DASB/LJSpeech/extraction/extract.py b/benchmarks/DASB/LJSpeech/extraction/extract.py new file mode 100644 index 000000000..bb25afa87 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/extract.py @@ -0,0 +1,88 @@ +#!/usr/bin/env/python3 +"""Recipe for extracting a discrete tokens with librispeech. + +Authors + * Jarod Duret 2024 +""" + +import os +import sys +import logging +import pathlib as pl +import speechbrain as sb +from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from hyperpyyaml import load_hyperpyyaml + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + +print(base_dir) + +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech) + from ljspeech_prepare import prepare_ljspeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["output_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) + + tokens_extractor = hparams["tokens_extractor"] + data_folder = hparams["data_folder"] + datasets = [] + for split in ["train", "valid", "test"]: + json_path = hparams[f"{split}_json"] + name = pl.Path(json_path).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=json_path, replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + merged_data = { + key: value + for dataset in datasets + for key, value in dataset.data.items() + } + merged_dataset = DynamicItemDataset(merged_data) + + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Extracting dataset tokens ...") + tokens_extractor.extract_tokens( + merged_dataset, + hparams["num_codebooks"], + (save_folder / "ljspeech").as_posix(), + ) + + if hparams["save_embedding"]: + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Saving embeddings ...") + tokens_extractor.save_pretrained_embeddings( + (save_folder / "embeddings").as_posix(), + vocab_size=hparams["vocab_size"], + num_codebooks=hparams["num_codebooks"], + ) diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml new file mode 100644 index 
000000000..b90054db6 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml @@ -0,0 +1,63 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..d50cb85ef --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,100 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | 
LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | + +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: WavLM +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +vocab_size: 1000 +save_embedding: False + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +num_codebooks: [1, 3, 7, 12, 18, 23] +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..869d1c503 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml @@ -0,0 +1,63 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # 
Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml new file mode 100644 index 000000000..c03ffa936 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml @@ -0,0 +1,66 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..c534bef0f --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml @@ -0,0 +1,57 @@ +# ############################################################################ +# Auido Tokenizer: Mimi +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/mimi +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options 
+dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..d036e05a3 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,53 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 8 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False + + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..28c7c9be9 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml @@ -0,0 +1,56 @@ +# ############################################################################ +# Auido Tokenizer: SQCodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/sqcodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# SQCodec parameters +config: config.yaml +checkpoint: 
ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks: 4 +save_path: /home/ubuntu/sq-codec/SQ-Codec + +# SQCodec model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..a23c29e59 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,59 @@ +# ############################################################################ +# Auido Tokenizer: wavtokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavtokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# WavTokenizer parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks: 1 +vocab_size: 4096 + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py new file mode 120000 index 000000000..2de5a21a8 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py @@ -0,0 +1 @@ +../ljspeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py index e88b92eb6..416c63010 100644 --- a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py +++ b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py @@ -13,13 +13,11 @@ import json import random import logging -from types import SimpleNamespace import torch import torchaudio import numpy as np import tgt import re -import speechbrain as sb from tqdm import tqdm from pathlib import Path from speechbrain.utils.data_utils import download_file @@ -27,10 +25,6 @@ from speechbrain.inference.text import GraphemeToPhoneme from unidecode import unidecode from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations -from speechbrain.dataio.batch import PaddedData -from speechbrain.dataio.dataset import DynamicItemDataset -from preparation import FeatureExtractor -from torchaudio.functional import resample logger = logging.getLogger(__name__) @@ 
-59,8 +53,6 @@ def prepare_ljspeech( pitch_max_f0=400, skip_prep=False, use_custom_cleaner=False, - extract_features=None, - extract_features_opts=None, extract_phonemes=False, g2p_src="speechbrain/soundchoice-g2p", skip_ignore_folders=False, @@ -179,7 +171,7 @@ def prepare_ljspeech( os.makedirs(duration_folder) # extract pitch for both Fastspeech2 and FastSpeech2WithAligner models - if "FastSpeech2" in model_name: + if model_name is not None and "FastSpeech2" in model_name: pitch_folder = os.path.join(data_folder, "pitch") if not os.path.exists(pitch_folder): os.makedirs(pitch_folder) @@ -200,22 +192,11 @@ def prepare_ljspeech( data_folder, splits, split_ratio, frozen_split_path ) - extract_features_context = None - extract_features_folder = None - if extract_features: - extract_features_context = get_context( - extract_features=extract_features, - extract_features_opts=extract_features_opts or {}, - device=device, - ) - extract_features_folder = Path(save_folder) / "features" - if "train" in splits: prepare_json( model_name, data_split["train"], save_json_train, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -226,10 +207,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -239,7 +216,6 @@ def prepare_ljspeech( model_name, data_split["valid"], save_json_valid, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -250,10 +226,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -263,7 +235,6 @@ def prepare_ljspeech( model_name, data_split["test"], save_json_test, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -274,10 +245,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -421,7 +388,6 @@ def prepare_json( model_name, seg_lst, json_file, - data_folder, wavs_folder, csv_reader, phoneme_alignments_folder, @@ -432,10 +398,6 @@ def prepare_json( pitch_min_f0, pitch_max_f0, use_custom_cleaner=False, - extract_features=None, - extract_features_context=None, - extract_features_folder=None, - extract_features_opts=None, extract_phonemes=False, g2p_src="speechbrain/soundchoice-g2p", device="cpu", @@ -471,14 +433,8 @@ def prepare_json( Max f0 for pitch computation use_custom_cleaner : bool If True, uses custom cleaner defined for this recipe - extract_features : list, optional - If specified, feature extraction will be performed - extract_features_context : types.SimpleNamespace, optional - Context for feature extraction (pretrained models, etc) - extract_features_folder : path-like, optional - The folder where extracted features will be saved - extract_features_opts : dict, optional - Options for feature extraction + extract_phonemes : bool + Whether to extract phonemes g2p_src : str The name of the HuggingFace Hub to use for the Grapheme-to-Phoneme model or the path to it @@ -495,12 +451,12 @@ def prepare_json( extract_phonemes = True if extract_phonemes: logger.info( - "Computing phonemes for LJSpeech labels using SpeechBrain G2P. This may take a while." + "Computing phonemes for LJSpeech labels using SpeechBrain G2P. This may take a while."
) g2p = GraphemeToPhoneme.from_hparams( g2p_src, run_opts={"device": device} ) - if "FastSpeech2" in model_name: + if model_name is not None and "FastSpeech2" in model_name: logger.info( "Computing pitch as required for FastSpeech2. This may take a while." ) @@ -649,19 +605,6 @@ def prepare_json( # Updates data for the utterance json_dict[id].update({"phonemes": phonemes}) - # Feature Extraction - if extract_features: - extract_features_folder.mkdir(exist_ok=True) - prepare_features( - data=json_dict, - data_folder=data_folder, - save_path=extract_features_folder, - features=extract_features, - context=extract_features_context, - options=extract_features_opts, - device=device, - ) - # Writing the dictionary to the json file with open(json_file, mode="w") as json_f: json.dump(json_dict, json_f, indent=2) @@ -838,146 +781,3 @@ def custom_clean(text, model_name): for regex, replacement in _abbreviations: text = re.sub(regex, replacement, text) return text - - -INLINE_FEATURES = ["audio_ssl_len"] - - -def prepare_features( - data, data_folder, save_path, features, context, options=None, device="cpu" -): - """Performs feature extraction - - Arguments - --------- - data: dict - a preprocessed dataset - data_folder : str - the data folder - save_folder : str - the folder where features will be saved - context : dict - context data - features: list - the list of feature extractions to be performed - """ - dataset = DynamicItemDataset(data) - feature_extractor = FeatureExtractor( - save_path=save_path, - src_keys=["sig"], - id_key="uttid", - dataloader_opts=options.get("dataloader_opts", {}), - device=device, - ) - token_model_kwargs = options.get("token_model_kwargs", {}) - ssl_layers = options.get("ssl_model_layers") or options.get( - "token_model_layers" - ) - - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - """Load the audio signal. 
""" - wav = wav.replace("{data_root}", data_folder) - sig = sb.dataio.dataio.read_audio(wav) - - yield sig - - dataset.add_dynamic_item(audio_pipeline) - - @sb.utils.data_pipeline.takes("sig") - @sb.utils.data_pipeline.provides("sig_resampled") - def resample_pipeline(sig): - sig_data = resample( - waveform=sig.data, - orig_freq=options["sample_rate"], - new_freq=options["model_sample_rate"], - ) - return PaddedData(sig_data, sig.lengths) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("audio_tokens", "audio_emb") - def token_pipeline(sig): - with torch.no_grad(): - result = context.token_model( - sig.data, sig.lengths, **token_model_kwargs - ) - # TODO: Clean this up - if torch.is_tensor(result): - tokens = result - # Note: Dummy embedding - meaning embeddings are not available - emb = torch.zeros((len(sig.data), 1, 1), device=sig.data.device) - else: - tokens, emb = result[:2] - tokens = tokens.int() - if tokens.dim() < 3: - tokens = tokens.unsqueeze(-1) - yield PaddedData(tokens, sig.lengths) - yield PaddedData(emb, sig.lengths) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("spk_emb") - def spk_emb_pipeline(sig): - mel_spec = context.spk_emb_model.mel_spectogram(audio=sig.data) - return context.spk_emb_model.encode_mel_spectrogram_batch( - mel_spec, sig.lengths - ) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("audio_ssl", "audio_ssl_len") - def ssl_pipeline(sig): - ssl_raw = context.ssl_model(sig.data, sig.lengths) - ssl = ssl_raw[ssl_layers].permute(1, 2, 0, 3) - yield PaddedData(ssl, sig.lengths) - yield (sig.lengths * ssl.size(1)).tolist() - - dynamic_items = [ - resample_pipeline, - token_pipeline, - ssl_pipeline, - spk_emb_pipeline, - ] - for dynamic_item in dynamic_items: - feature_extractor.add_dynamic_item(dynamic_item) - - feature_keys = [key for key in features if key not in INLINE_FEATURES] - inline_keys = [key for key in features if key in INLINE_FEATURES] - feature_extractor.set_output_features(feature_keys, inline_keys=inline_keys) - feature_extractor.extract(dataset, data) - - -def get_context(extract_features, extract_features_opts, device): - """ - Gets the context (pretrained models, etc) for feature extraction - - Arguments - --------- - extract_features : list - A list of features to extract - Available features: - audio_tokens - raw tokens - audio_emb - embeddings from the model - extract_features_opts : dict - Options for feature extraction - device : str|torch.Device - The device on which extraction will be run - - Returns - -------- - context: SimpleNamespace - The context object - """ - context = {} - if ( - any(key in extract_features for key in ["audio_tokens", "audio_emb"]) - and "token_model" in extract_features_opts - ): - context["token_model"] = extract_features_opts["token_model"].to(device) - if "audio_ssl" in extract_features: - context["ssl_model"] = extract_features_opts["ssl_model"].to(device) - if "spk_emb" in extract_features: - context["spk_emb_model"] = extract_features_opts["spk_emb_model"]( - run_opts={"device": device} - ) - - return SimpleNamespace(**context) diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml new file mode 100644 index 000000000..605b772b5 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml @@ -0,0 +1,231 @@ +# 
############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: DAC +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
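+# For example (per the DAC table above): "set to True" refers to
+# pretrain_embeddings below; with the 24khz / 8kbps DAC model the tokenizer
+# embedding_dim is 1024, while the 44khz / 16kbps model would require 128.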
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml new file mode 100644 index 000000000..f13e3cb53 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml @@ -0,0 +1,231 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: Encodec +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# 
############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml new file mode 100644 index 000000000..9607dab79 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml @@ -0,0 +1,221 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: SpeechTokenizer +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher 
and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speechtokenizer/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 16000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
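+# (The EnCodec figure above is inherited from the EnCodec recipe; if
+# pretrain_embeddings below is enabled, encoder_dim should instead match the
+# embedding dimension of the SpeechTokenizer model.)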
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml new file mode 100644 index 000000000..8e73e3601 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml @@ -0,0 +1,224 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: DAC +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# 
############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac/contextnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +output_neurons: 31 + +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml new file mode 100644 index 000000000..4d88a7978 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml @@ -0,0 +1,221 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before 
objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/Contexnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
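+# output_neurons: 31 below typically corresponds to the LibriSpeech character
+# set (letters, space, apostrophe) together with the blank, bos and eos tokens
+# configured via blank_index / bos_index / eos_index.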
+pretrain_embeddings: False +freeze_embedding: False + +output_neurons: 31 + +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml new file mode 100644 index 000000000..615777a99 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml @@ -0,0 +1,212 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: SpeechTokenizer +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# 
Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speechtokenizer/contextnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 16000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + +output_neurons: 31 + +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/librispeech_prepare.py new file mode 120000 index 000000000..a3126ec94 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/librispeech_prepare.py @@ -0,0 +1 @@ +../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_dac.py b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py similarity index 52% rename from benchmarks/DASB/LibriSpeech/ASR/LSTM/train_dac.py rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py index 479d6719b..098986565 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_dac.py +++ 
b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py @@ -10,16 +10,24 @@ import os import sys +import time import torch import torchaudio import logging import speechbrain as sb from speechbrain.utils.distributed import run_on_main, if_main_process +from speechbrain.tokenizers.SentencePiece import SentencePiece from hyperpyyaml import load_hyperpyyaml from pathlib import Path +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + + logger = logging.getLogger(__name__) +_CACHE = {"size": 0} + # Define training procedure class ASR(sb.Brain): @@ -28,24 +36,56 @@ def compute_forward(self, batch, stage): batch = batch.to(self.device) wavs, wav_lens = batch.sig - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _ = self.hparams.codec( - wavs.unsqueeze(1), n_quantizers=self.hparams.num_codebooks - ) - embeddings = self.modules.discrete_embedding_layer( - tokens.movedim(-2, -1) - ) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) + # Add waveform augmentation if specified. + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] + + # compute features + # Extract tokens (cache them at first epoch if augmentation is disabled) + key = tuple(sorted(batch.id)) + try: + in_toks = _CACHE[key] + in_toks = in_toks.to(self.device) + except KeyError: + with torch.no_grad(): + self.hparams.tokenizer.eval().to(self.device) + in_toks = self.hparams.tokenizer.sig_to_tokens( + wavs, wav_lens, num_codebooks=hparams["num_codebooks"] + ) # [B, T, N-Q] + if stage != sb.Stage.TRAIN or ( + stage == sb.Stage.TRAIN + and (not hasattr(self.hparams, "wav_augment")) + ): + if _CACHE["size"] < self.hparams.cache_size: + _CACHE[key] = in_toks.cpu() + _CACHE["size"] += in_toks.numel() + + # Extract embeddings + in_embs = self.modules.discrete_embedding_layer( + in_toks + ) # [B, T, N-Q, D] + + # Attention-Pooling + att_w = self.modules.attention_mlp(in_embs) # [B, T, N-Q, 1] + in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze( + -2 + ) # [B, T, D] + + # forward modules + if type(self.modules.encoder).__name__ == "ContextNet": + enc_out = self.modules.encoder(in_embs) + + elif type(self.modules.encoder).__name__ == "LSTM": + enc_out, _ = self.modules.encoder(in_embs) + + else: + raise NotImplementedError + + # output layer for ctc log-probabilities + logits = self.modules.ctc_lin(enc_out) p_ctc = self.hparams.log_softmax(logits) + + p_tokens = None if stage == sb.Stage.VALID: p_tokens = sb.decoders.ctc_greedy_decode( p_ctc, wav_lens, blank_id=self.hparams.blank_index @@ -61,14 +101,19 @@ def compute_objectives(self, predictions, batch, stage): p_ctc, wav_lens, predicted_tokens = predictions ids = batch.id tokens, tokens_lens = batch.tokens + + # Label Augmentation + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + tokens = self.hparams.wav_augment.replicate_labels(tokens) + tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) if stage == sb.Stage.VALID: # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in 
predicted_tokens - ] + predicted_words = self.tokenizer( + predicted_tokens, task="decode_from_list" + ) elif stage == sb.Stage.TEST: predicted_words = [ hyp[0].text.split(" ") for hyp in predicted_tokens @@ -85,10 +130,10 @@ def on_stage_start(self, stage, epoch): """Gets called at the beginning of each epoch""" if stage != sb.Stage.TRAIN: self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() + self.wer_metric = self.hparams.wer_computer() def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" + """Gets called at the end of a epoch.""" # Compute/store important stats stage_stats = {"loss": stage_loss} if stage == sb.Stage.TRAIN: @@ -96,60 +141,61 @@ def on_stage_end(self, stage, stage_loss, epoch): else: stage_stats["CER"] = self.cer_metric.summarize("error_rate") stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. + current_epoch = self.hparams.epoch_counter.current + valid_search_interval = self.hparams.valid_search_interval + if ( + current_epoch % valid_search_interval == 0 + or stage == sb.Stage.TEST + ): + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # log stats and save checkpoint at end-of-epoch if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - # old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - # stage_stats["loss"] - # ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - # sb.nnet.schedulers.update_learning_rate( - # self.weights_optimizer, new_lr_weights - # ) - + if type(self.hparams.scheduler).__name__ == "NewBobScheduler": + lr, new_lr = self.hparams.scheduler(stage_stats["loss"]) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + lr = self.hparams.scheduler.current_lr + else: + raise NotImplementedError + + optimizer = self.optimizer.__class__.__name__ + epoch_stats = { + "epoch": epoch, + "lr": lr, + "optimizer": optimizer, + } self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, + stats_meta=epoch_stats, train_stats=self.train_stats, valid_stats=stage_stats, ) self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], + meta={"WER": stage_stats["WER"], "epoch": epoch}, + min_keys=["WER"], + num_to_keep=self.hparams.avg_checkpoints, ) + elif stage == sb.Stage.TEST: self.hparams.train_logger.log_stats( stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, test_stats=stage_stats, ) if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: + with open( + self.hparams.output_wer_folder, "w", encoding="utf-8" + ) as w: self.wer_metric.write_stats(w) - def init_optimizers(self): - # "Initializes the weights optimizer and model optimizer" - # self.weights_optimizer = self.hparams.weights_opt_class( - # self.hparams.attention_mlp.parameters() - # ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - # "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - # self.checkpointer.add_recoverable( - # "weights_opt", self.weights_optimizer - # ) + def 
on_fit_batch_end(self, batch, outputs, loss, should_step): + if ( + should_step + and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler" + ): + self.hparams.scheduler(self.optimizer) -def dataio_prepare(hparams): +def dataio_prepare(hparams, tokenizer): """This function prepares the datasets to be used in the brain class. It also defines the data processing pipeline through user-defined functions.""" data_folder = hparams["data_folder"] @@ -206,11 +252,10 @@ def audio_pipeline(wav): resampled = torchaudio.transforms.Resample( info.sample_rate, hparams["sample_rate"], )(sig) - # resampled = resampled.unsqueeze(0) + # resampled = resampled.unsqueeze(0) return resampled sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() # 3. Define text pipeline: @sb.utils.data_pipeline.takes("wrd") @@ -221,45 +266,59 @@ def text_pipeline(wrd): yield wrd char_list = list(wrd) yield char_list - tokens_list = label_encoder.encode_sequence(char_list) + tokens_list = tokenizer.sp.encode_as_ids(wrd) yield tokens_list tokens = torch.LongTensor(tokens_list) yield tokens sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - # 4. Set output: sb.dataio.dataset.set_output_keys( datasets, ["id", "sig", "wrd", "char_list", "tokens"], ) - return train_data, valid_data, test_datasets, label_encoder + + # 5. If Dynamic Batching is used, we instantiate the needed samplers. 
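+    # DynamicBatchSampler groups utterances into duration buckets and builds
+    # batches holding a roughly constant total amount of audio (as configured
+    # in the dynamic_batch_sampler_* hparams), which reduces padding compared
+    # to fixed-size batches of variable-length speech.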
+ train_batch_sampler = None + valid_batch_sampler = None + if hparams["dynamic_batching"]: + from speechbrain.dataio.sampler import DynamicBatchSampler # noqa + + dynamic_hparams_train = hparams["dynamic_batch_sampler_train"] + dynamic_hparams_val = hparams["dynamic_batch_sampler_val"] + + train_batch_sampler = DynamicBatchSampler( + train_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_train, + ) + + valid_batch_sampler = DynamicBatchSampler( + valid_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_val, + ) + + return ( + train_data, + valid_data, + test_datasets, + train_batch_sampler, + valid_batch_sampler, + ) if __name__ == "__main__": # CLI: hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) # If distributed_launch=True then # create ddp_group with the right communication protocol sb.utils.distributed.ddp_init_group(run_opts) - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - # Create experiment directory sb.create_experiment_directory( experiment_directory=hparams["output_folder"], @@ -285,25 +344,68 @@ def text_pipeline(wrd): }, ) + # Defining tokenizer and loading it + tokenizer = SentencePiece( + model_dir=hparams["save_folder"], + vocab_size=hparams["output_neurons"], + annotation_train=hparams["train_csv"], + annotation_read="wrd", + model_type=hparams["token_type"], + character_coverage=hparams["character_coverage"], + bos_id=hparams["bos_index"], + eos_id=hparams["eos_index"], + ) + # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams + ( + train_data, + valid_data, + test_datasets, + train_bsampler, + valid_bsampler, + ) = dataio_prepare(hparams, tokenizer) + + # Use pretrained embeddings + if hparams["pretrain_embeddings"]: + embs = hparams["tokenizer"].get_pretrained_embeddings( + device=run_opts["device"], + num_codebooks=hparams["num_codebooks"], + vocab_size=hparams["vocab_size"], + ) + hparams["discrete_embedding_layer"].init_embedding(embs) + + # Log number of parameters/buffers + codec_params = sum( + [x.numel() for x in hparams["tokenizer"].state_dict().values()] + ) + model_params = sum( + [ + x.numel() + for module in hparams["modules"].values() + for x in module.state_dict().values() + ] + ) + hparams["train_logger"].log_stats( + stats_meta={ + "Codec parameters/buffers (M)": f"{codec_params / 1e6:.2f}", + "Model parameters/buffers (M)": f"{model_params / 1e6:.2f}", + }, ) # Trainer initialization asr_brain = ASR( modules=hparams["modules"], + opt_class=hparams["model_opt_class"], hparams=hparams, run_opts=run_opts, checkpointer=hparams["checkpointer"], ) - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. - asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] + # Adding objects to trainer. 
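+    # The SentencePiece tokenizer is attached to the Brain so that
+    # compute_objectives can decode hypotheses, and the vocabulary list below
+    # maps every SentencePiece id to its piece string for the CTC beam searcher.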
+ asr_brain.tokenizer = tokenizer + vocab_list = [ + tokenizer.sp.id_to_piece(i) for i in range(tokenizer.sp.vocab_size()) + ] from speechbrain.decoders.ctc import CTCBeamSearcher @@ -311,6 +413,20 @@ def text_pipeline(wrd): **hparams["test_beam_search"], vocab_list=vocab_list, ) + train_dataloader_opts = hparams["train_dataloader_opts"] + valid_dataloader_opts = hparams["valid_dataloader_opts"] + + if train_bsampler is not None: + train_dataloader_opts = { + "batch_sampler": train_bsampler, + "num_workers": hparams["num_workers"], + } + + if valid_bsampler is not None: + valid_dataloader_opts = {"batch_sampler": valid_bsampler} + # Measure time + start_time = time.time() # Start the timer + # Training asr_brain.fit( asr_brain.hparams.epoch_counter, @@ -320,12 +436,19 @@ def text_pipeline(wrd): valid_loader_kwargs=hparams["valid_dataloader_opts"], ) + end_time = time.time() # End the timer + # Calculate elapsed time + elapsed_time = end_time - start_time + logger.info(f"Model execution time: {elapsed_time:.6f} seconds") + # hparams["train_logger"].log_stats( + # stats_meta={f"Model execution time: {elapsed_time:.6f} seconds"}, + # ) # Testing if not os.path.exists(hparams["output_wer_folder"]): os.makedirs(hparams["output_wer_folder"]) for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( + asr_brain.hparams.output_wer_folder = os.path.join( hparams["output_wer_folder"], f"wer_{k}.txt" ) asr_brain.evaluate( diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_dac.yaml b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_dac.yaml deleted file mode 100644 index 0b00db1f7..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_dac.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# ################################ -# Recipe for training an discrete-input ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
-# -# Authors -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/dac/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - - -### Config for Tokenizer -# DAC parameters -# model_type: [16khz, 24khz, 44khz, 44khz] -# vocab_size: [1024, 1024, 1024, 1024] -# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] -# max_num_codebooks: [12, 32, 9, 18] -# embedding_dim: [1024, 1024, 1024, 128] -model_type: 24khz -vocab_size: 1024 -model_bitrate: 8kbps -num_codebooks: 2 # NOTE: must be smaller or equal to the maximum number of codebooks for the given model type -sample_rate: 24000 -encoder_dim: 1024 - - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 768 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# Modules -# DAC model (see https://github.com/descriptinc/descript-audio-codec) -codec: !new:speechbrain.lobes.models.discrete.dac.DAC - model_type: !ref - model_bitrate: !ref - load_pretrained: True - tag: latest - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - 
-model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - scheduler_model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_discrete_ssl.yaml deleted file mode 100644 index c5a920693..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_discrete_ssl.yaml +++ /dev/null @@ -1,216 +0,0 @@ -# ################################ -# Recipe for training an discrete-input ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/discrete_ssl/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -### Configuration for discrete SSL model -# ssl_model_type: hubert, wavlm, wav2vec2 -# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large -ssl_model_type: hubert # hubert, wavml or wav2vec2 -ssl_hub: facebook/hubert-large-ll60k -ssl_folder: !ref /ssl_checkpoint -kmeans_repo_id: speechbrain/SSL_Quantization -kmeans_cache_dir: !ref /kmeans_checkpoint -kmeans_dataset: LibriSpeech-100-360-500 -freeze_ssl: True -freeze_feature_extractor: True -num_clusters: 1000 - -### Config for Tokenizer -# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) -# ssl_layer_num: [3, 7, 12, 23] -# deduplicate: [False, False, False, False] -# bpe_tokenizer_path: [null , null, null, null] -ssl_layer_num: [1, 3, 7, 12, 18, 23] -num_codebooks: 6 -deduplicate: [False, False, False, False, False, False] -bpe_tokenizer_path: [null, null, null, null, null, null] -sample_rate: 16000 -encoder_dim: 1024 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: 
!name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 1024 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer_config: - SSL_layers: !ref - deduplicates: !ref - bpe_tokenizers: !ref - -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - -codec: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - save_path: !ref - ssl_model: !ref - kmeans_dataset: !ref - kmeans_repo_id: !ref - num_clusters: !ref - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - scheduler_model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_encodec.yaml 
b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_encodec.yaml deleted file mode 100644 index e2477819a..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_encodec.yaml +++ /dev/null @@ -1,183 +0,0 @@ -# ################################ -# Recipe for training an discrete-input ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/encodec/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: data # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - - -### Config for Tokenizer -# EnCodec parameters -# sample_rate: [24000, 24000, 24000, 24000] -# vocab_size: [1024, 1024, 1024, 1024] -# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] -# num_codebooks: [2, 4, 8, 16, 32] -vocab_size: 1024 -bandwidth: 1.5 -num_codebooks: 2 -sample_rate: 24000 -# Feature parameters -encoder_dim: 1024 -# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
-init_embedding: False -freeze_embedding: False - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 1024 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - freeze: !ref - init: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - scheduler_model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_speech_tokenizer.yaml deleted file mode 100644 index eda9a2bad..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_speech_tokenizer.yaml +++ /dev/null @@ -1,169 +0,0 @@ -# ################################ -# Recipe for training an discrete-input ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
-# -# Authors -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/speech_tokenizer/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - - -### Config for Tokenizer -vocab_size: 1024 -num_codebooks: 2 -sample_rate: 16000 - -# Feature parameters - -encoder_dim: 1024 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 1024 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - 
improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - scheduler_model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_weighted_ssl.yaml b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_weighted_ssl.yaml deleted file mode 100644 index bcfbe8d50..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_weighted_ssl.yaml +++ /dev/null @@ -1,162 +0,0 @@ -# ################################ -# Recipe for training an SSL-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * Salah Zaiem 2023 -# * Youcef Kemiche 2023 -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/weighted_ssl/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -ssl_hub: microsoft/wavlm-large -ssl_folder: !ref /ssl_checkpoints -encoder_dim: 1024 - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -lr_weights: 0.01 -sorting: ascending -precision: fp32 -sample_rate: 16000 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 768 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -weighted_ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.WeightedSSLModel # yamllint disable-line rule:line-length - hub: !ref - save_path: !ref - 
-enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - weighted_ssl_model: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -weights_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -lr_annealing_weights: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.9 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - ssl_model: !ref - scheduler_model: !ref - scheduler_encoder: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/librispeech_prepare.py deleted file mode 120000 index cf4adfd79..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/librispeech_prepare.py +++ /dev/null @@ -1 +0,0 @@ -../../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_discrete_ssl.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_discrete_ssl.py deleted file mode 100644 index 2aac19193..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_discrete_ssl.py +++ /dev/null @@ -1,333 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. 
- -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import torchaudio -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _, _ = self.hparams.codec( - wavs, wav_lens, **self.hparams.tokenizer_config - ) - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - # old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - # stage_stats["loss"] - # ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - # sb.nnet.schedulers.update_learning_rate( - # self.weights_optimizer, new_lr_weights - # ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - # "Initializes the weights optimizer and model optimizer" - # self.weights_optimizer = self.hparams.weights_opt_class( - # self.hparams.attention_mlp.parameters() - # ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - # "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - # self.checkpointer.add_recoverable( - # "weights_opt", self.weights_optimizer - # ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_encodec.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_encodec.py deleted file mode 100644 index d2215ce45..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_encodec.py +++ /dev/null @@ -1,340 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an SSL-based ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Adel Moumen 2024 - * Salah Zaiem 2023 - * Youcef Kemiche 2023 -""" - -import os -import sys -import torch -import torchaudio -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _ = self.hparams.codec.encode(wavs, wav_lens) - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - 
self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - # old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - # stage_stats["loss"] - # ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - # sb.nnet.schedulers.update_learning_rate( - # self.weights_optimizer, new_lr_weights - # ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the weights optimizer and model optimizer" - # self.weights_optimizer = self.hparams.weights_opt_class( - # self.hparams.attention_mlp.parameters() - # ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - # "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - # self.checkpointer.add_recoverable( - # "weights_opt", self.weights_optimizer - # ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! 
otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. 
Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - if hparams["discrete_embedding_layer"].init: - hparams["discrete_embedding_layer"].init_embedding( - hparams["codec"] - .vocabulary[: hparams["num_codebooks"], :, :] - .flatten(0, 1) - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. - asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_speech_tokenizer.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_speech_tokenizer.py deleted file mode 100644 index 1493b5972..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_speech_tokenizer.py +++ /dev/null @@ -1,335 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an SSL-based ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. 
- -Authors - * Adel Moumen 2024 - * Salah Zaiem 2023 - * Youcef Kemiche 2023 -""" - -import os -import sys -import torch -import torchaudio -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens = self.hparams.codec(wavs).permute(1, 2, 0)[ - :, :, : self.hparams.num_codebooks - ] - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - # old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - # stage_stats["loss"] - # ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - # sb.nnet.schedulers.update_learning_rate( - # self.weights_optimizer, new_lr_weights - # ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the weights optimizer and model optimizer" - # self.weights_optimizer = self.hparams.weights_opt_class( - # self.hparams.attention_mlp.parameters() - # ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - # "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - # self.checkpointer.add_recoverable( - # "weights_opt", self.weights_optimizer - # ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_weighted_ssl.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_weighted_ssl.py deleted file mode 100644 index 4a7aed382..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_weighted_ssl.py +++ /dev/null @@ -1,322 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an SSL-based ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Adel Moumen 2024 - * Salah Zaiem 2023 - * Youcef Kemiche 2023 - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - feats = self.modules.weighted_ssl_model(wavs) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = 
self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - sb.nnet.schedulers.update_learning_rate( - self.weights_optimizer, new_lr_weights - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the weights optimizer and model optimizer" - self.weights_optimizer = self.hparams.weights_opt_class( - [self.modules.weighted_ssl_model.weights] - ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - self.checkpointer.add_recoverable( - "weights_opt", self.weights_optimizer - ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! 
otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - return sig - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. 
Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. - asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/custom_model.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/custom_model.py deleted file mode 120000 index 4b3f08ebb..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/custom_model.py +++ /dev/null @@ -1 +0,0 @@ -../../../model/custom_model.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_dac.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_dac.yaml deleted file mode 100644 index 4533e2e8d..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_dac.yaml +++ /dev/null @@ -1,172 +0,0 @@ -# ################################ -# Recipe for training an dac-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
-# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/dac/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) -### Config for Tokenizer -# DAC parameters -# model_type: [16khz, 24khz, 44khz, 44khz] -# vocab_size: [1024, 1024, 1024, 1024] -# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] -# max_num_codebooks: [12, 32, 9, 18] -# embedding_dim: [1024, 1024, 1024, 128] -model_type: 24khz -vocab_size: 1024 -model_bitrate: 8kbps -num_codebooks: 2 # NOTE: must be smaller or equal to the maximum number of codebooks for the given model type -sample_rate: 24000 -encoder_dim: 1024 - - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.discrete.dac.DAC - model_type: !ref - model_bitrate: !ref - load_pretrained: True - tag: latest - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: 
!ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - scheduler_model: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_discrete_ssl.yaml deleted file mode 100644 index c394c73c1..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_discrete_ssl.yaml +++ /dev/null @@ -1,214 +0,0 @@ -# ################################ -# Recipe for training an discrete_ssl-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/encodec/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) - -### Configuration for discrete SSL model -# ssl_model_type: hubert, wavlm, wav2vec2 -# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large -ssl_model_type: hubert # hubert, wavml or wav2vec2 -ssl_hub: facebook/hubert-large-ll60k -ssl_folder: !ref /ssl_checkpoint -kmeans_repo_id: speechbrain/SSL_Quantization -kmeans_cache_dir: !ref /kmeans_checkpoint -kmeans_dataset: LibriSpeech-100-360-500 -freeze_ssl: True -freeze_feature_extractor: True -num_clusters: 1000 - -### Config for Tokenizer -# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) -# ssl_layer_num: [3, 7, 12, 23] -# deduplicate: [False, False, False, False] -# bpe_tokenizer_path: [null , null, null, null] -ssl_layer_num: [1, 3, 7, 12, 18, 23] -num_codebooks: 6 -deduplicate: [False, False, False, False, False, False] -bpe_tokenizer_path: [null, null, null, null, null, null] -sample_rate: 16000 -encoder_dim: 1024 - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 
-test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer_config: - SSL_layers: !ref - deduplicates: !ref - bpe_tokenizers: !ref - -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - -codec: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - save_path: !ref - ssl_model: !ref - kmeans_dataset: !ref - kmeans_repo_id: !ref - num_clusters: !ref - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - scheduler_model: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: 
!name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml deleted file mode 100644 index 6163550e9..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# ################################ -# Recipe for training an encodec-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/encodec/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) -### Config for Tokenizer -# EnCodec parameters -# sample_rate: [24000, 24000, 24000, 24000] -# vocab_size: [1024, 1024, 1024, 1024] -# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] -# num_codebooks: [2, 4, 8, 16, 32] -vocab_size: 1024 -bandwidth: 1.5 -num_codebooks: 2 -sample_rate: 24000 -# Feature parameters -encoder_dim: 1024 -# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
-init_embedding: False -freeze_embedding: False - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - freeze: !ref - init: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - scheduler_model: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_speech_tokenizer.yaml deleted file mode 100644 index aef1307ec..000000000 --- 
a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_speech_tokenizer.yaml +++ /dev/null @@ -1,160 +0,0 @@ -# ################################ -# Recipe for training an speech_tokenizer-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/speech_tokenizer/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) -### Config for Tokenizer -vocab_size: 1024 -num_codebooks: 2 -sample_rate: 16000 - -encoder_dim: 1024 -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: 
!ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - scheduler_model: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_weighted_ssl.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_weighted_ssl.yaml deleted file mode 100644 index 6d806f0a5..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_weighted_ssl.yaml +++ /dev/null @@ -1,157 +0,0 @@ -# ################################ -# Recipe for training an encodec-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/encodec/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) -ssl_hub: microsoft/wavlm-large -ssl_folder: !ref /ssl_checkpoints -encoder_dim: 1024 - -# Training parameters -number_of_epochs: 2 -lr: 0.0002 -lr_weights: 0.01 -sorting: ascending -precision: fp32 -sample_rate: 16000 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: 
null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -weighted_ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.WeightedSSLModel # yamllint disable-line rule:line-length - hub: !ref - save_path: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - weighted_ssl_model: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -weights_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -lr_annealing_weights: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.9 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - ssl_model: !ref - scheduler_model: !ref - scheduler_encoder: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/librispeech_prepare.py deleted file mode 120000 index cf4adfd79..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/librispeech_prepare.py +++ /dev/null @@ -1 +0,0 @@ -../../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_dac.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_dac.py deleted file mode 100644 index a177e48a5..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_dac.py +++ /dev/null @@ -1,321 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens + ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. 
- -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path -import torchaudio - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _ = self.hparams.codec( - wavs.unsqueeze(1), n_quantizers=self.hparams.num_codebooks - ) - embeddings = self.modules.discrete_embedding_layer( - tokens.movedim(-2, -1) - ) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the model optimizer" - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. 
Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_discrete_ssl.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_discrete_ssl.py deleted file mode 100644 index 640f6a220..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_discrete_ssl.py +++ /dev/null @@ -1,319 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens + ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path -import torchaudio - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _, _ = self.hparams.codec( - wavs, wav_lens, **self.hparams.tokenizer_config - ) - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - 
self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the model optimizer" - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_encodec.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_encodec.py deleted file mode 100644 index eb7232303..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_encodec.py +++ /dev/null @@ -1,316 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens + ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path -import torchaudio - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _ = self.hparams.codec.encode(wavs, wav_lens) - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def 
on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the model optimizer" - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_speech_tokenizer.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_speech_tokenizer.py deleted file mode 100644 index cd784c80c..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_speech_tokenizer.py +++ /dev/null @@ -1,319 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens + ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path -import torchaudio - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens = self.hparams.codec(wavs).permute(1, 2, 0)[ - :, :, : self.hparams.num_codebooks - ] - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, 
target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the model optimizer" - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! 
otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. 
Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. - asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_weighted_ssl.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_weighted_ssl.py deleted file mode 100644 index 6d053fceb..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_weighted_ssl.py +++ /dev/null @@ -1,318 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an SSL-based ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. 
- -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - feats = self.modules.weighted_ssl_model(wavs) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - sb.nnet.schedulers.update_learning_rate( - self.weights_optimizer, new_lr_weights - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the weights optimizer and model optimizer" - self.weights_optimizer = self.hparams.weights_opt_class( - [self.modules.weighted_ssl_model.weights] - ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - self.checkpointer.add_recoverable( - "weights_opt", self.weights_optimizer - ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - return sig - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. 
Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml new file mode 100644 index 000000000..8b9581dc9 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -0,0 +1,220 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: Encodec +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: +# - Pooneh Mousavi 2024 +# - Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/LSTM// +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100"] #["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, this should be set to the path where the pretrained embeddings are saved. 
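+# Note: <tokens_folder> (and, when pretrain_embeddings is True, <pretrain_embeddings_folder>)
+# are expected to point at the output of the extraction recipe in LibriSpeech/extraction,
+# which writes tokens under <save_folder>/librispeech and embeddings under
+# <save_folder>/embeddings. Illustrative launch command (all paths are placeholders):
+#   python train.py hparams/LSTM/train.yaml --data_folder <librispeech> \
+#     --cached_data_folder <cache> --tokens_folder <tokens> --run_name <name>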
+ +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size_exponent: 4 # @orion_step1: --batch_size_exponent~"uniform(2, 4,discrete=True)" +batch_size: !ref 2 ** +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" +weight_decay: 0.0005 + + +# Training parameters +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# These parameters should be set according to the tokenizer used to extract tokens saved in . +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 24000 + +# Feature parameters +encoder_dim: 1024 +# If set to True, encoder_dim should match the dimension of the tokenizer. For Encodec, it is 128. +pretrain_embeddings: False +freeze_embedding: False + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(1, 4,discrete=True)" +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + # tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + 
initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml new file mode 100644 index 000000000..eab197c68 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml @@ -0,0 +1,214 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: Encodec +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: +# - Pooneh Mousavi 2024 +# - Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/LSTM// +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100"] #["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, this should be set to the path where the pretrained embeddings are saved. 
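+# As in hparams/LSTM/train.yaml, <tokens_folder> should point at the tokens written by the
+# extraction recipe (typically <extraction_save_folder>/librispeech).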
+ +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size_exponent: 4 # @orion_step1: --batch_size_exponent~"uniform(2, 4,discrete=True)" +batch_size: !ref 2 ** +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" +weight_decay: 0.0005 + + +# Training parameters +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# These parameters should be set according to the tokenizer used to extract tokens saved in . +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 24000 + +# Feature parameters +encoder_dim: 1024 +# If set to True, encoder_dim should match the dimension of the tokenizer. For Encodec, it is 128. +pretrain_embeddings: False +freeze_embedding: False + +# Contextnet + +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + # tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 + +model_opt_class: 
!name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR/librispeech_prepare.py new file mode 120000 index 000000000..a3126ec94 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR/librispeech_prepare.py @@ -0,0 +1 @@ +../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py new file mode 100644 index 000000000..ec6ac1b42 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -0,0 +1,457 @@ +#!/usr/bin/env/python3 +"""Recipe for training an discrete tokens ctc ASR system with librispeech. + +Decoding is performed with greedy decoding at validation time. +At test time, beamsearch is used with an optional external language model. + +Authors + * Pooneh Mousavi 2024 + * Jarod Duret 2024 +""" + +import os +import sys +import time +import torch +import torchaudio +import logging +import speechbrain as sb +from speechbrain.utils.distributed import run_on_main, if_main_process +from speechbrain.tokenizers.SentencePiece import SentencePiece +from hyperpyyaml import load_hyperpyyaml +from pathlib import Path + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + + +logger = logging.getLogger(__name__) + + +# Define training procedure +class ASR(sb.Brain): + def compute_forward(self, batch, stage): + """Forward computations from the waveform batches to the output probabilities.""" + batch = batch.to(self.device) + wavs, wav_lens = batch.sig + in_toks, _ = batch.speech_tokens + + in_embs = self.modules.discrete_embedding_layer( + in_toks + ) # [B, T, N-Q, D] + + # Attention-Pooling + att_w = self.modules.attention_mlp(in_embs) # [B, T, N-Q, 1] + in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze( + -2 + ) # [B, T, D] + + # forward modules + if type(self.modules.encoder).__name__ == "ContextNet": + enc_out = self.modules.encoder(in_embs) + + elif type(self.modules.encoder).__name__ == "LSTM": + enc_out, _ = self.modules.encoder(in_embs) + + else: + raise NotImplementedError + + # output layer for ctc log-probabilities + logits = self.modules.ctc_lin(enc_out) + p_ctc = self.hparams.log_softmax(logits) + + p_tokens = None + if stage == sb.Stage.VALID: + p_tokens = sb.decoders.ctc_greedy_decode( + p_ctc, wav_lens, blank_id=self.hparams.blank_index + ) + elif stage == sb.Stage.TEST: + p_tokens = test_searcher(p_ctc, wav_lens) + + return p_ctc, wav_lens, p_tokens + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss (CTC+NLL) given predictions and targets.""" + + p_ctc, wav_lens, predicted_tokens = predictions + ids = batch.id + tokens, tokens_lens = batch.tokens + + # Label Augmentation + if stage == sb.Stage.TRAIN and 
hasattr(self.hparams, "wav_augment"): + tokens = self.hparams.wav_augment.replicate_labels(tokens) + tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) + + if stage == sb.Stage.VALID: + # Decode token terms to words + predicted_words = self.tokenizer( + predicted_tokens, task="decode_from_list" + ) + elif stage == sb.Stage.TEST: + predicted_words = [ + hyp[0].text.split(" ") for hyp in predicted_tokens + ] + + if stage != sb.Stage.TRAIN: + target_words = [wrd.split(" ") for wrd in batch.wrd] + self.wer_metric.append(ids, predicted_words, target_words) + self.cer_metric.append(ids, predicted_words, target_words) + + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch""" + if stage != sb.Stage.TRAIN: + self.cer_metric = self.hparams.cer_computer() + self.wer_metric = self.hparams.wer_computer() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of a epoch.""" + # Compute/store important stats + stage_stats = {"loss": stage_loss} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + else: + stage_stats["CER"] = self.cer_metric.summarize("error_rate") + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + current_epoch = self.hparams.epoch_counter.current + valid_search_interval = self.hparams.valid_search_interval + if ( + current_epoch % valid_search_interval == 0 + or stage == sb.Stage.TEST + ): + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # log stats and save checkpoint at end-of-epoch + if stage == sb.Stage.VALID: + if type(self.hparams.scheduler).__name__ == "NewBobScheduler": + lr, new_lr = self.hparams.scheduler(stage_stats["loss"]) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + lr = self.hparams.scheduler.current_lr + else: + raise NotImplementedError + + optimizer = self.optimizer.__class__.__name__ + epoch_stats = { + "epoch": epoch, + "lr": lr, + "optimizer": optimizer, + } + self.hparams.train_logger.log_stats( + stats_meta=epoch_stats, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + self.checkpointer.save_and_keep_only( + meta={"WER": stage_stats["WER"], "epoch": epoch}, + min_keys=["WER"], + num_to_keep=self.hparams.avg_checkpoints, + ) + + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + if if_main_process(): + with open( + self.hparams.output_wer_folder, "w", encoding="utf-8" + ) as w: + self.wer_metric.write_stats(w) + + def on_fit_batch_end(self, batch, outputs, loss, should_step): + if ( + should_step + and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler" + ): + self.hparams.scheduler(self.optimizer) + + +def dataio_prepare(hparams, tokenizer): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + """ + data_folder = hparams["data_folder"] + + train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, + ) + + if hparams["sorting"] == "ascending": + # we sort training data to speed up training and get better results. + train_data = train_data.filtered_sorted(sort_key="duration") + # when sorting do not shuffle in dataloader ! 
otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + train_data = train_data.filtered_sorted( + sort_key="duration", reverse=True + ) + # when sorting do not shuffle in dataloader ! otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, + ) + valid_data = valid_data.filtered_sorted(sort_key="duration") + + # test is separate + test_datasets = {} + for csv_file in hparams["test_csv"]: + name = Path(csv_file).stem + test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=csv_file, replacements={"data_root": data_folder} + ) + test_datasets[name] = test_datasets[name].filtered_sorted( + sort_key="duration" + ) + + datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] + + # 1. Define tokens pipeline: + tokens_loader = hparams["tokens_loader"] + num_codebooks = hparams["num_codebooks"] + + @sb.utils.data_pipeline.takes("id") + @sb.utils.data_pipeline.provides("speech_tokens") + def tokens_pipeline(id): + tokens = tokens_loader.tokens_by_uttid(id, num_codebooks=num_codebooks) + return tokens + + sb.dataio.dataset.add_dynamic_item(datasets, tokens_pipeline) + + # 2. Define audio pipeline: + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + info = torchaudio.info(wav) + resampled = torchaudio.transforms.Resample( + info.sample_rate, hparams["sample_rate"], + )(sig) + # resampled = resampled.unsqueeze(0) + return resampled + + sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) + + # 3. Define text pipeline: + @sb.utils.data_pipeline.takes("wrd") + @sb.utils.data_pipeline.provides( + "wrd", "char_list", "tokens_list", "tokens" + ) + def text_pipeline(wrd): + yield wrd + char_list = list(wrd) + yield char_list + tokens_list = tokenizer.sp.encode_as_ids(wrd) + yield tokens_list + tokens = torch.LongTensor(tokens_list) + yield tokens + + sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) + + # 4. Set output: + sb.dataio.dataset.set_output_keys( + datasets, ["id", "sig", "wrd", "char_list", "tokens", "speech_tokens"], + ) + + # 5. If Dynamic Batching is used, we instantiate the needed samplers. 
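+    # The samplers below bucket utterances by duration (via length_func) and fill each
+    # batch up to max_batch_length, measured in the same units as length_func (here the
+    # CSV duration), so batches carry a roughly constant amount of audio instead of a
+    # fixed number of examples; settings come from the dynamic_batch_sampler_train /
+    # dynamic_batch_sampler_val sections of the YAML.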
+ train_batch_sampler = None + valid_batch_sampler = None + if hparams["dynamic_batching"]: + from speechbrain.dataio.sampler import DynamicBatchSampler # noqa + + dynamic_hparams_train = hparams["dynamic_batch_sampler_train"] + dynamic_hparams_val = hparams["dynamic_batch_sampler_val"] + + train_batch_sampler = DynamicBatchSampler( + train_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_train, + ) + + valid_batch_sampler = DynamicBatchSampler( + valid_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_val, + ) + + return ( + train_data, + valid_data, + test_datasets, + train_batch_sampler, + valid_batch_sampler, + ) + + +if __name__ == "__main__": + + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # If distributed_launch=True then + # create ddp_group with the right communication protocol + sb.utils.distributed.ddp_init_group(run_opts) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech) + from librispeech_prepare import prepare_librispeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "tr_splits": hparams["train_splits"], + "dev_splits": hparams["dev_splits"], + "te_splits": hparams["test_splits"], + "save_folder": hparams["cached_data_folder"], + "merge_lst": hparams["train_splits"], + "merge_name": "train.csv", + "skip_prep": hparams["skip_prep"], + }, + ) + + # Defining tokenizer and loading it + tokenizer = SentencePiece( + model_dir=hparams["cached_data_folder"], + vocab_size=hparams["output_neurons"], + annotation_train=hparams["train_csv"], + annotation_read="wrd", + model_type=hparams["token_type"], + character_coverage=hparams["character_coverage"], + bos_id=hparams["bos_index"], + eos_id=hparams["eos_index"], + ) + + # here we create the datasets objects as well as tokenization and encoding + ( + train_data, + valid_data, + test_datasets, + train_bsampler, + valid_bsampler, + ) = dataio_prepare(hparams, tokenizer) + + # Use pretrained embeddings + if hparams["pretrain_embeddings"]: + tokens_loader = hparams["tokens_loader"] + embs = tokens_loader.load_pretrained_embeddings( + hparams["pretain_embeddings_folder"] + ) + if isinstance(hparams["num_codebooks"], int): + embs = embs[ + : hparams["num_codebooks"] * hparams["vocab_size"], + ] + # For discrete SSL, num_codebooks is a list used to determine which layers to use. + # It is not sequential and can be, for example, [0, 1] or [1, 4]. 
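+        # For example, assuming vocab_size=1024, num_codebooks=[1, 4] would keep rows
+        # 1024..2047 and 4096..5119 of the stacked per-codebook embedding table.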
+ elif isinstance(hparams["num_codebooks"], list): + indices = [ + i + for codebook_idx in hparams["num_codebooks"] + for i in range( + codebook_idx * hparams["vocab_size"], + (codebook_idx + 1) * hparams["vocab_size"], + ) + ] + indices = torch.tensor(indices, dtype=torch.long) + embs = embs[indices] + hparams["discrete_embedding_layer"].init_embedding(embs) + + # Log number of parameters/buffers + model_params = sum( + [ + x.numel() + for module in hparams["modules"].values() + for x in module.state_dict().values() + ] + ) + hparams["train_logger"].log_stats( + stats_meta={ + "Model parameters/buffers (M)": f"{model_params / 1e6:.2f}", + }, + ) + + # Trainer initialization + asr_brain = ASR( + modules=hparams["modules"], + opt_class=hparams["model_opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # Adding objects to trainer. + asr_brain.tokenizer = tokenizer + vocab_list = [ + tokenizer.sp.id_to_piece(i) for i in range(tokenizer.sp.vocab_size()) + ] + + from speechbrain.decoders.ctc import CTCBeamSearcher + + test_searcher = CTCBeamSearcher( + **hparams["test_beam_search"], vocab_list=vocab_list, + ) + + train_dataloader_opts = hparams["train_dataloader_opts"] + valid_dataloader_opts = hparams["valid_dataloader_opts"] + + if train_bsampler is not None: + train_dataloader_opts = { + "batch_sampler": train_bsampler, + "num_workers": hparams["num_workers"], + } + + if valid_bsampler is not None: + valid_dataloader_opts = {"batch_sampler": valid_bsampler} + + # Measure time + start_time = time.time() # Start the timer + # Training + asr_brain.fit( + asr_brain.hparams.epoch_counter, + train_data, + valid_data, + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + end_time = time.time() # End the timer + # Calculate elapsed time + elapsed_time = end_time - start_time + logger.info(f"Model execution time: {elapsed_time:.6f} seconds") + + if hparams["testing"]: + # Testing + if not os.path.exists(hparams["output_wer_folder"]): + os.makedirs(hparams["output_wer_folder"]) + + for k in test_datasets.keys(): # keys are test_clean, test_other etc + asr_brain.hparams.output_wer_folder = os.path.join( + hparams["output_wer_folder"], f"wer_{k}.txt" + ) + asr_brain.evaluate( + test_datasets[k], + test_loader_kwargs=hparams["test_dataloader_opts"], + min_key="WER", + ) diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py new file mode 100644 index 000000000..3a649d24f --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -0,0 +1,96 @@ +#!/usr/bin/env/python3 +"""Recipe for extracting a discrete tokens with librispeech. 
+ +Authors + * Jarod Duret 2024 +""" + +import os +import sys +import logging +import pathlib as pl +import speechbrain as sb +from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from hyperpyyaml import load_hyperpyyaml + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + +print(base_dir) + +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech) + from librispeech_prepare import prepare_librispeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "tr_splits": hparams["train_splits"], + "dev_splits": hparams["dev_splits"], + "te_splits": hparams["test_splits"], + "save_folder": hparams["cached_data_folder"], + "merge_lst": hparams["train_splits"], + "merge_name": "train.csv", + "skip_prep": hparams["skip_prep"], + }, + ) + + tokens_extractor = hparams["tokens_extractor"] + data_folder = hparams["data_folder"] + datasets = [] + for split in ["train", "valid"]: + csv_path = hparams[f"{split}_csv"] + name = pl.Path(csv_path).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=csv_path, replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + for split in hparams["test_csv"]: + name = pl.Path(split).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=split, replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + merged_data = { + key: value + for dataset in datasets + for key, value in dataset.data.items() + } + merged_dataset = DynamicItemDataset(merged_data) + + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Extracting dataset tokens ...") + tokens_extractor.extract_tokens( + merged_dataset, + hparams["num_codebooks"], + (save_folder / "librispeech").as_posix(), + ) + + if hparams["save_embedding"]: + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Saving embeddings ...") + tokens_extractor.save_pretrained_embeddings( + (save_folder / "embeddings").as_posix(), + vocab_size=hparams["vocab_size"], + num_codebooks=hparams["num_codebooks"], + ) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml new file mode 100644 index 000000000..349597c55 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml @@ -0,0 +1,66 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: 
["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..cd8ae126e --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,104 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|---------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-hubert-k1000-LibriTTS | +# | Wav2Vec2 | facebook/wav2vec2-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wav2vec2-k1000-LibriTTS | + + +# ssl_model_type: HuBERT, WavLM, Wav2Vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: WavLM +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True 
+freeze_feature_extractor: True +vocab_size: 1000 +save_embedding: False + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +num_codebooks: [1, 3, 7, 12, 18, 23] +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..9f6c8b4ed --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -0,0 +1,64 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + 
dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..7871d6212 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml @@ -0,0 +1,60 @@ +# ############################################################################ +# Auido Tokenizer: Mimi +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + sample_rate: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..3090e9f79 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,56 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 8 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: 
fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..9d5a6c24e --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml @@ -0,0 +1,59 @@ +# ############################################################################ +# Auido Tokenizer: SQCodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +config: config.yaml +checkpoint: ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks: 4 +tokenizer_save_path: !PLACEHOLDER + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + sample_rate: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..9a8b754eb --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,62 @@ +# ############################################################################ +# Auido Tokenizer: wavtokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: 
wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks: 1 +vocab_size: 4096 + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + sample_rate: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/extraction/librispeech_prepare.py new file mode 120000 index 000000000..a3126ec94 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/librispeech_prepare.py @@ -0,0 +1 @@ +../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py similarity index 100% rename from benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py rename to benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/custom_model.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR/LSTM/custom_model.py rename to benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py new file mode 120000 index 000000000..d65702b6c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py @@ -0,0 +1 @@ +../../../utils/data.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py new file mode 120000 index 000000000..0ca6d4644 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py @@ -0,0 +1 @@ +../../../utils/eval.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py new file mode 100644 index 000000000..9b2801ee8 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -0,0 +1,547 @@ +"""Evaluates a checkpoint using an MOS estimation tool + +Authors +* Artem Ploujnikov 2024 +""" + +# TODO: There are too many evaluation scripts. 
Refactor to extract common +# features + +import speechbrain as sb +import json +import logging +import csv +import torch +import torchaudio +import string +import re +from pathlib import Path +from types import SimpleNamespace +from torch.nn import ModuleDict +from data import undo_batch +from eval import vocoder_to_device +from torch.utils.flop_counter import FlopCounterMode +from contextlib import nullcontext + + +logger = logging.getLogger(__name__) + + +class TokotronEvaluator: + """An evaluator class for the TTS model + + Arguments + --------- + hparams: dict + hyperparameters (as a dictionary) + device : str | torch.device + the device + """ + + def __init__(self, hparams, create_waveform_fn, device): + self.hparams = SimpleNamespace(**hparams) + self.create_waveform_fn = create_waveform_fn + self.device = device + modules = self.hparams.modules + self.modules = ModuleDict(modules).to(self.device) + self.spk_emb_model = self.hparams.spk_emb_model( + run_opts={"device": device} + ) + self.modules.model.vocoder = None + self.enabled_evaluators = set(self.hparams.evaluations.split(",")) + evaluators = hparams.get("evaluators", {}) + if evaluators: + self.evaluators = { + key: evaluator_f(run_opts={"device": device}) + for key, evaluator_f in evaluators.items() + if key in self.enabled_evaluators + } + else: + self.evaluators = {} + + if not self.evaluators: + logger.warning( + "No evaluators were defined - this run will produce samples only" + ) + + self.attention = [] + + def on_evaluate_start(self, stage, epoch): + """Invoked when evaluation starts + + Arguments + --------- + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + self.stage = stage + self.epoch = epoch + self.output_folder = self.get_output_folder(stage, epoch) + self.samples_folder = self.output_folder / "samples" + self.samples_folder.mkdir(parents=True, exist_ok=True) + logger.info( + "Starting evaluation, results will be saved in %s", + self.output_folder, + ) + self.create_reports() + self.modules.model.show_inference_progress = False + self.item_ids = [] + details_keys = list(self.evaluators.keys()) + self.details = {evaluator_key: [] for evaluator_key in details_keys} + self.sample_text = [] + self.sample_file_names = [] + self.ref_file_names = [] + if hasattr(self.modules, "vocoder"): + vocoder_to_device(self.modules.vocoder, self.device) + + def get_output_folder(self, stage, epoch): + """Computes the output folder of evaluation results + for the specified stage and epoch. + + If the folder does not exists, it will be created. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + + Returns + ------- + """ + output_folder = ( + Path(self.hparams.output_folder) / "eval" / stage.name.lower() + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(parents=True, exist_ok=True) + return output_folder + + def on_evaluate_end(self): + """Invoked when evaluation starts + + Arguments + --------- + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
+ """ + self.write_summary() + logger.info("Evaluation done") + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + if self.hparams.eval_perf: + self.perf_file = open(self.output_folder / "perf.csv", "w") + self.perf_writer = csv.DictWriter( + self.perf_file, + [ + "uttid", + "infer_flops", + "steps", + "infer_flops_per_step", + "vocoder_flops", + "total_flops", + "total_flops_per_step", + ], + ) + self.perf_writer.writeheader() + + def infer(self, tokens, tokens_length, emb): + """Performs inference + + Arguments + --------- + tokens : torch.Tensor + A token sequence + tokens_length : torch.Tensor + Relative lengths + emb : dict + Embeddings for conditioning + + Returns + ------- + wav : torch.Tensor + The waveform + stats : dict + Statistics""" + stats = {} + if self.hparams.eval_perf: + flop_counter = FlopCounterMode() + else: + flop_counter = nullcontext() + + with flop_counter: + infer_out = self.modules.model.infer( + input_tokens=tokens, input_length=tokens_length, emb=emb + ) + if self.hparams.eval_perf: + steps = (infer_out.length * infer_out.audio.size(1)).sum().item() + total_flops = flop_counter.get_total_flops() + stats = { + "infer_flops": total_flops, + "steps": steps, + "infer_flops_per_step": total_flops / steps, + } + return infer_out, stats + + def vocoder(self, infer_out, emb): + """Runs the vocoder to create a waveform + + Arguments + --------- + infer_out : Tokotron.TokotronInfernceOutput + Inference output + emb : dict + Embeddings for conditioning + + Returns + ------- + wav : torch.Tensor + The waveform + stats : dict + Statistics""" + stats = {} + if self.hparams.eval_perf: + flop_counter = FlopCounterMode() + else: + flop_counter = nullcontext() + + with flop_counter: + wav = self.create_waveform_fn( + infer_out.audio, length=infer_out.length, emb=emb + ) + if wav.dim() > 2: + wav = wav.squeeze(1) + + if self.hparams.eval_perf: + flops = flop_counter.get_total_flops() + stats = {"vocoder_flops": flops} + return wav, stats + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder / f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = { + key: handle_number(value) + for key, value in row.items() + } + self.details[evaluator_key].append(row) + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) + if evaluator_key in self.evaluators: + evaluator = self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + 
wavs_ref=bogus_wavs, + length_ref=bogus_length, + ) + + return ["uttid"] + list(result.details.keys()) + + def evaluate_batch(self, batch): + """Runs evaluation on a single batch of speech + + Arguments + --------- + batch : speechbrain.dataio.batch.PaddedBatch + the batch to be evaluated""" + with torch.no_grad(): + batch = batch.to(self.device) + tokens, tokens_length = batch.tokens + if hasattr(self.modules, "vocoder"): + vocoder_to_device(self.modules.vocoder, self.device) + if hasattr(self.modules.vocoder, "device"): + self.modules.vocoder.device = self.device + audio_resampled = torchaudio.functional.resample( + batch.sig.data, + self.hparams.sample_rate, + self.hparams.model_sample_rate, + ) + mel_spec = self.spk_emb_model.mel_spectogram(audio=audio_resampled) + spk_emb = self.spk_emb_model.encode_mel_spectrogram_batch( + mel_spec, batch.sig.lengths + ).squeeze(1) + infer_out, perf_stats = self.infer( + tokens=tokens, tokens_length=tokens_length, emb={"spk": spk_emb} + ) + wav, vocoder_stats = self.vocoder(infer_out, spk_emb) + perf_stats.update(vocoder_stats) + length = infer_out.length + if wav.dim() > 2: + wav = wav.squeeze(1) + + self.save_samples(batch, wav, infer_out.length) + self.item_ids.extend(batch.uttid) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=batch.label_norm_eval, + wavs_ref=batch.sig.data, + length_ref=batch.sig.lengths, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, batch.uttid, details) + self.details[evaluator_key].extend(details) + + if self.hparams.eval_perf: + perf_stats.update(vocoder_stats) + perf_stats["total_flops"] = ( + perf_stats["vocoder_flops"] + perf_stats["infer_flops"] + ) + perf_stats["total_flops_per_step"] = ( + perf_stats["total_flops"] / perf_stats["steps"] + ) + self.write_perf_stats(batch.uttid, perf_stats) + + def write_result(self, evaluator_key, uttid, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + batch : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(uttid, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def save_samples(self, batch, wav, length): + """Saves the samples generated by the TTS system + + Arguments + --------- + batch : speechbrain.dataio.batch.PaddedBatch + the batch being evaluated + wav : torch.Tensor + the waveform + length: torch.Tensor + relative lengths + """ + wav_length_abs = (length * wav.size(1)).int() + for item_id, infer_wav, wav_length in zip( + batch.uttid, wav, wav_length_abs + ): + file_name = str(self.samples_folder / f"{item_id}_pred.wav") + infer_wav_cut = infer_wav[: wav_length.item()].cpu() + sb.dataio.dataio.write_audio( + file_name, + infer_wav_cut, + samplerate=self.hparams.model_sample_rate, + ) + self.sample_file_names.append(file_name) + + def write_summary(self): + """Outputs summarized statistics""" + summary = self.compute_summary() + file_name = self.output_folder / "summary.json" + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def write_perf_stats(self, uttid, details): + """Outputs 
performance statistics + + Arguments + --------- + uttid : list + A list of utterance IDs + details : dict + Performance details""" + self.perf_writer.writerow({"uttid": " ".join(uttid), **details}) + self.perf_file.flush() + + def compute_summary(self): + """Computes the summarized statistics""" + return { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], key=metric_key, + ).items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + """Retains only ASCII characters from the values in a + dictionary + + Arguments + --------- + values : dict + a key/value dictionary + + Returns + ------- + result : dict + The same dictionary but with non-ASCII characters + """ + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +@sb.utils.data_pipeline.takes("label_norm") +@sb.utils.data_pipeline.provides("label_norm_eval") +def label_norm_pipeline(label): + """Normalizes labels for ASR comparison, converting to uppercase and removing + punctuation + + Arguments + --------- + label : str + The unnormalized label + + Returns + ------- + result : str + The normalized label + """ + label = label.upper() + label = RE_PUNCTUATION.sub("", label) + return label + + +@sb.utils.data_pipeline.takes("wav") +@sb.utils.data_pipeline.provides("sig") +def audio_ref_pipeline(wav): + """The audio loading pipeline for references + + Arguments + --------- + wav : str + The file path + + Returns + ------- + sig : torch.Tensor + The waveform + """ + sig = sb.dataio.dataio.read_audio(wav) + return sig + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY 
+JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? + \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml new file mode 100644 index 000000000..b39b11009 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -0,0 +1,57 @@ +eval_dataset: valid +eval_suffix: "" +eval_sample_rate: 16000 +eval_spk_sim_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_subset: null +eval_asr_beam_size: 66 +eval_asr_type: encoder_decoder +eval_asr_source: openai/whisper-small +eval_spk_sim_source: microsoft/wavlm-base-sv +evaluations: utmos,asr,spk_sim +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_asr: !name:eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +eval_utmos: !name:eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_spk_sim: !name:eval.SpkSimWavLM + source: !ref + savedir: !ref + model_sample_rate: !ref + +evaluators: + utmos: !ref + asr: !ref + spk_sim: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + spk_sim: + descriptive: ["score"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median + spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml new file mode 100644 index 000000000..3fa047b31 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -0,0 +1,295 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
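For reference, the keys used in `eval_summary_log` (e.g. `utmos_utmos_mean`, `asr_dwer_median`, `spk_sim_score_mean`) follow the `<evaluator>_<metric>_<statistic>` pattern produced by the evaluator's summary code. A small sketch reproducing that naming on made-up scores:

```python
import torch


def descriptive_statistics(values, key):
    """Same naming scheme as the evaluation summary: <metric>_<stat>."""
    values = torch.tensor(values, dtype=torch.float)
    q1, median, q3 = values.quantile(torch.tensor([0.25, 0.5, 0.75]))
    stats = {
        "mean": values.mean(),
        "std": values.std(),
        "min": values.min(),
        "max": values.max(),
        "median": median,
        "q1": q1,
        "q3": q3,
        "iqr": q3 - q1,
    }
    return {f"{key}_{name}": v.item() for name, v in stats.items()}


# Toy UTMOS scores for three synthesized utterances (values are invented).
summary = {
    f"utmos_{stat_key}": value
    for stat_key, value in descriptive_statistics([3.8, 4.1, 4.0], "utmos").items()
}
print(sorted(summary))  # includes 'utmos_utmos_mean', 'utmos_utmos_median', ...
```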
+ +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
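The `guided_attention_*` hyperparameters above control a Tacotron-style guided-attention term that pushes the text-to-audio attention toward a roughly diagonal alignment. The sketch below uses the textbook formulation of the penalty mask for intuition only; the loss actually used here is `Tokotron.TokotronLoss`, whose exact implementation may differ.

```python
import torch


def guided_attention_mask(num_input, num_output, sigma=0.5):
    """Penalty weights that are ~0 along the diagonal and grow off-diagonal."""
    n = torch.arange(num_input).unsqueeze(1) / num_input    # text positions
    t = torch.arange(num_output).unsqueeze(0) / num_output  # audio positions
    return 1.0 - torch.exp(-((n - t) ** 2) / (2 * sigma**2))


mask = guided_attention_mask(num_input=20, num_output=80, sigma=0.5)
# Multiplying this mask with the attention matrix and averaging gives a loss
# that is small for monotonic, diagonal alignments and large otherwise;
# guided_attention_weight (50.0 above) scales its contribution.
print(mask.shape)                               # torch.Size([20, 80])
print(mask[0, 0].item(), mask[0, -1].item())    # ~0.0 on the path, ~0.86 far off
```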
+ +# Inference parameters +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# DAC-specific settings +model_type: 24khz +model_bitrate: 8kbps + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + token_model: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +n_dim: 16 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + n_dim: !ref + decoder_chunk_size: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + vocoder: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + 
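The dataloader options above use `speechbrain.dataio.batch.PaddedBatch` as the collate function, with `padding_kwargs: value: <pad_index>` so that variable-length token sequences are right-padded with the configured pad index. A minimal usage sketch, with invented field names and values:

```python
import torch
from speechbrain.dataio.batch import PaddedBatch

# Two variable-length "utterances"; PaddedBatch right-pads tensor fields and
# records relative lengths, which is what the training loop consumes.
examples = [
    {"id": "utt1", "tokens": torch.tensor([5, 6, 7, 8])},
    {"id": "utt2", "tokens": torch.tensor([9, 10])},
]
batch = PaddedBatch(examples, padding_kwargs={"value": 0})  # value = pad_index

tokens, lengths = batch.tokens
print(tokens)   # tensor([[ 5,  6,  7,  8], [ 9, 10,  0,  0]])
print(lengths)  # tensor([1.0000, 0.5000]) - lengths relative to the longest item
```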
+modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..1fe2ebca9 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -0,0 +1,344 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +ssl_model_type: wavlm +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
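`tokens_folder` must point at tokens produced beforehand by the matching extraction recipe (here the discrete SSL/WavLM one); the training recipes read them back through `utils.tokens.TokensLoader`, defined just below. A rough sketch of that read-back, where the path, utterance ID, and layer list are placeholders and the `tokens_by_uttid` call pattern is the one visible in the LibriSpeech recipes:

```python
# Sketch of reading back pre-extracted tokens; TokensLoader comes from the
# benchmark's utils/tokens.py. Path and utterance ID are placeholders.
from utils.tokens import TokensLoader

tokens_loader = TokensLoader(data_path="/path/to/extracted/tokens")
tokens = tokens_loader.tokens_by_uttid(
    "1089-134686-0000", num_codebooks=[1, 3, 7, 12, 18, 23]
)
print(tokens.shape)  # one row per frame, one column per selected codebook/layer
```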
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +# Position shift +use_position_shift: True +max_position_shift: 1000 +position_shift_seed: 42 +position_shift_probability: 1.0 + +freeze_token_model: True + +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large +g2p_src: flexthink/soundchoice-g2p +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +select_layers: null +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: flexthink/discrete_wavlm_spk_rec_ecapatdn_lite + hubert: flexthink/discrete_hubert_spk_rec_ecapatdn_lite + wav2vec2: flexthink/discrete_wav2vec2_spk_rec_ecapatdn_lite +asr_src: speechbrain/asr-transformer-transformerlm-librispeech +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +batch_size_guided: 2 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
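Several entries above (`token_model_src`, `vocoder_repo_id`, `spk_emb_discrete_src`) use `!apply:speechbrain.utils.hparams.choice` to pick a value keyed by `ssl_model_type`. The same helper can be called from Python; the example below assumes only the call pattern visible in these YAML files (a `value` plus a `choices` mapping):

```python
from speechbrain.utils.hparams import choice

ssl_model_type = "wavlm"  # the default in this config

# Mirrors the YAML usage: !apply:speechbrain.utils.hparams.choice
token_model_src = choice(
    value=ssl_model_type,
    choices={
        "wavlm": "microsoft/wavlm-large",
        "hubert": "facebook/hubert-large-ll60k",
        "wav2vec2": "facebook/wav2vec2-large",
    },
)
print(token_model_src)  # microsoft/wavlm-large
```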
+ +# Inference parameters +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 2000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +layerwise_renorm: True +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 1000 +audio_dim: 1024 +audio_emb_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: 1024 + continuous: 128 +audio_emb_freeze: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: 
!ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + representation_mode: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml new file mode 100644 index 000000000..3820c8407 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -0,0 +1,297 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +alignments_folder: null +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: encodec +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
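The `lr_annealing` block above uses SpeechBrain's `NoamScheduler` with `lr: 0.0005` and `lr_warmup_steps: 10000`, i.e. the learning rate ramps up over the warmup steps and then decays roughly as the inverse square root of the step count. The sketch below uses the textbook Noam shape, rescaled so the peak equals the configured `lr`; the scheduler's exact normalization may differ slightly.

```python
def noam_lr(step, lr_peak=0.0005, warmup=10000):
    """Textbook Noam schedule, scaled so the peak is reached at `warmup` steps."""
    step = max(step, 1)
    return lr_peak * min(step / warmup, (warmup / step) ** 0.5)


for s in (1, 1000, 10000, 40000, 160000):
    print(s, f"{noam_lr(s):.2e}")
# The LR climbs to 5e-4 at step 10k, then falls like 1/sqrt(step):
# 40k -> 2.5e-4, 160k -> 1.25e-4.
```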
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_type: encodec +vocoder_src: "charactr/vocos-encodec-24khz" +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +# Guides +guides_enabled: False + + +silence_padding: !ref +use_silence_padding: True + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref 
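All four dataloader blocks above collate with `speechbrain.dataio.batch.PaddedBatch`, passing `padding_kwargs` so that variable-length items are right-padded with the configured value (the recipes pass the padding index). A toy sketch of what that collation produces; the tensors are illustrative only:

```python
import torch
from speechbrain.dataio.batch import PaddedBatch

examples = [
    {"uttid": "a", "tokens": torch.tensor([5, 6, 7, 8])},
    {"uttid": "b", "tokens": torch.tensor([5, 6])},
]
# padding_kwargs is forwarded to the padding function, so shorter items
# are right-padded with the chosen value.
batch = PaddedBatch(examples, padding_kwargs={"value": 0})
tokens, lengths = batch.tokens   # a (data, lengths) pair
print(tokens)    # tensor([[5, 6, 7, 8], [5, 6, 0, 0]])
print(lengths)   # relative lengths: tensor([1.0000, 0.5000])
```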
+audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + emb: !ref + + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + + +modules: + model: !ref + compute_cost: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..3820c8407 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml @@ -0,0 +1,297 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
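The `bandwidth: 1.5` passed to the EnCodec tokenizer above and `audio_tokens_per_step: 2` are two views of the same choice: for the standard 24 kHz EnCodec model (75 frames per second, 1024-entry codebooks, i.e. 10 bits per codebook), each codebook contributes 0.75 kbps, so a 1.5 kbps target corresponds to two codebooks per frame. A quick back-of-the-envelope check; the constants are the published EnCodec 24 kHz defaults, not values read from this file:

```python
# EnCodec 24 kHz: 75 frames/s, 1024-entry codebooks (10 bits each)
frame_rate_hz = 75
bits_per_codebook = 10                                         # log2(1024)
kbps_per_codebook = frame_rate_hz * bits_per_codebook / 1000   # 0.75

target_bandwidth_kbps = 1.5
num_codebooks = round(target_bandwidth_kbps / kbps_per_codebook)
print(num_codebooks)   # 2 -> consistent with audio_tokens_per_step: 2
```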
+ +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +alignments_folder: null +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: encodec +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_type: encodec +vocoder_src: "charactr/vocos-encodec-24khz" +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
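With `lr_annealing_mode: step`, the Noam scheduler is advanced after every optimizer update (the training script later in this diff steps it in `fit_batch`), so the learning rate ramps up roughly linearly over `lr_warmup_steps` and then decays with the inverse square root of the step count. A sketch of the familiar Noam curve, normalized so the peak equals `lr` at the end of warmup; the exact normalization inside SpeechBrain's `NoamScheduler` may differ slightly:

```python
def noam_lr(step, lr_initial=0.0005, n_warmup_steps=10000):
    """Classic Noam schedule: linear warmup, then ~1/sqrt(step) decay."""
    scale = n_warmup_steps ** 0.5 * min(step ** -0.5, step * n_warmup_steps ** -1.5)
    return lr_initial * scale

print(noam_lr(1_000))    # 5e-05   (still warming up)
print(noam_lr(10_000))   # 0.0005  (peak at the end of warmup)
print(noam_lr(40_000))   # 0.00025 (decayed to half the peak)
```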
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +# Guides +guides_enabled: False + + +silence_padding: !ref +use_silence_padding: True + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + emb: !ref + + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + + +modules: + model: !ref + compute_cost: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:Tokotron.TokotronLoss + 
guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml new file mode 100644 index 000000000..d30420925 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml @@ -0,0 +1,314 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
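Keys like `train_split` above (and `token_list_file`, `input_num_tokens`, `audio_emb_size` elsewhere in these files) are resolved with `!apply:speechbrain.utils.hparams.choice`, which maps one hyperparameter onto one of several preset alternatives while the YAML is loaded. Conceptually it is just a keyed lookup, as in this simplified stand-in (not the library implementation):

```python
def choice(value, choices, default=None):
    """Return the entry of `choices` selected by `value` (illustrative stand-in)."""
    return choices.get(value, default)

# data_mode: lite  ->  train_split: ["train-clean-100"]
train_split = choice(
    value="lite",
    choices={
        "lite": ["train-clean-100"],
        "clean": ["train-clean-100", "train-clean-360"],
        "full": ["train-clean-100", "train-clean-360", "train-other-500"],
    },
)
print(train_split)   # ['train-clean-100']
```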
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +# Position shift +use_position_shift: True +max_position_shift: 1000 +position_shift_seed: 42 +position_shift_probability: 1.0 + +freeze_token_model: True + +g2p_src: flexthink/soundchoice-g2p +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +model_path: !ref /fairseq-hubert +feature_extractor_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher.pt +kmeans_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher_L11_km500.bin +vocoder_dense_model_name: "mhubert-base-25hz" +vocoder_quantizer_model_name: "kmeans" +vocoder_vocab_size: 500 + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +select_layers: null +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +batch_size_guided: 2 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 2000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" 
+dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +layerwise_renorm: True +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 1000 +audio_dim: 1024 +audio_emb_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: 1024 + continuous: 128 +audio_emb_freeze: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +vocoder: !apply:textless.vocoders.hifigan.vocoder.CodeHiFiGANVocoder.by_name + dense_model_name: !ref #"mhubert-base-25hz" + quantizer_model_name: !ref # "kmeans", + vocab_size: !ref #500 + +tokenizer: !new:utils.tokenizer_interface.FairseqHuBERTTokenizer + feat_extractor_path: !ref + km_path: !ref + layer: !ref + vocoder: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + representation_mode: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml new file mode 100644 index 000000000..8d74e195c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -0,0 +1,284 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref 
results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +cached_data_folder: !PLACEHOLDER +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: vocos +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
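`guided_attention_weight` and `guided_attention_sigma` configure a diagonal-attention penalty that keeps the text-to-audio alignment roughly monotonic early in training, in the spirit of the guided-attention loss of Tachibana et al. (2017). The exact implementation lives in `TokotronLoss` and may differ in detail; the standard form of the penalty mask, for reference:

```python
import torch

def guided_attention_mask(num_input, num_output, sigma=0.5):
    """Soft diagonal prior: ~0 near the diagonal, ~1 far from it.

    W[t, n] = 1 - exp(-((n / N - t / T) ** 2) / (2 * sigma ** 2));
    the attention map is multiplied by W and averaged, then scaled by
    guided_attention_weight, so off-diagonal attention is penalized.
    """
    n = torch.arange(num_input) / num_input      # normalized input positions
    t = torch.arange(num_output) / num_output    # normalized output positions
    diff = t.unsqueeze(1) - n.unsqueeze(0)       # (num_output, num_input)
    return 1.0 - torch.exp(-(diff ** 2) / (2 * sigma ** 2))

mask = guided_attention_mask(num_input=40, num_output=200, sigma=0.5)
```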
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 2048 +audio_emb_size: 1024 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 8 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + 
audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml similarity index 68% rename from benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml rename to benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ac80bdac0..1dfc9a1d7 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -2,61 +2,52 @@ # Model: Tokenized TTS (WhisperSpeech-inspired) # Authors: Artem Ploujnikov # ############################################################################ - -experiment_name: tokotron/continuous_ssl - # Seed needs to be set at top of yaml, before objects with parameters are made + seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/// +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt - -# Model type -ssl_model_type: wavlm -representation_mode: continuous +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
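The `checkpointer` block above registers everything that must survive an interruption: the model weights, the Noam scheduler, the epoch counter, and a saveable random generator. Recovery and saving follow the usual SpeechBrain pattern used later in this diff's `train.py` (`recover_if_possible` in `on_fit_start`, `save_and_keep_only` after each validation stage); a condensed, illustrative sketch with stand-in objects and an illustrative path:

```python
import torch
from speechbrain.utils.checkpoints import Checkpointer
from speechbrain.utils.epoch_loop import EpochCounter

model = torch.nn.Linear(4, 4)            # stand-in for the Tokotron model
epoch_counter = EpochCounter(limit=1000)
checkpointer = Checkpointer(
    checkpoints_dir="results/demo/save",
    recoverables={"model": model, "counter": epoch_counter},
)

# Resume transparently if a checkpoint already exists (as in on_fit_start).
checkpointer.recover_if_possible()

# After a validation epoch, keep only the best checkpoint according to "loss"
# (as in on_stage_end).
checkpointer.save_and_keep_only(meta={"loss": 1.234}, min_keys=["loss"])
```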
# Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/continuous- +prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref -vocoder_model_name: !ref unithifigan-dasb--continuous +representation_mode: discrete +vocoder_model_name: vocos vocoder_model_path: !ref / prepare_archive_path: null prepare_skip_ignore_folders: False +data_mode: lite train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 -freeze_ssl_model: True -ssl_model_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: microsoft/wavlm-large - hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self - -g2p_src: speechbrain/soundchoice-g2p -ssl_model_layers: [1, 3, 7, 12, 18, 23] +freeze_token_model: True +token_model_src: "fnlp/SpeechTokenizer" +g2p_src: flexthink/soundchoice-g2p token_offset: 1 -vocoder_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS +vocoder_takes_spk_emb: False spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec -use_spk_emb: False - -vocoder_available_layers: [1, 3, 7, 12, 18, 23] splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] @@ -66,8 +57,11 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 50 +number_of_epochs: 1000 +reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random @@ -87,7 +81,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -113,8 +107,8 @@ beam_size: 5 # Feature parameters sample_rate: 22050 model_sample_rate: 16000 -max_audio_length: 1000 -infer_max_audio_length: !ref +max_audio_length: 5000 +infer_max_audio_length: 1000 debug_infer_max_audio_length: 10 # Label encoder @@ -128,7 +122,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -138,29 +132,11 @@ use_silence_padding: True # Token model (pretrained) -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - save_path: !ref 
- freeze: !ref - output_all_hiddens: True - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa + # Dataloader options train_dataloader_opts: batch_size: !ref @@ -171,7 +147,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: @@ -193,9 +169,9 @@ sample_dataloader_opts: extract_features_opts: dataloader_opts: - batch_size: !ref - ssl_model: !ref - ssl_model_layers: !ref + batch_size: !ref + num_workers: !ref + token_model: !ref sample_rate: !ref model_sample_rate: !ref spk_emb_model: !ref @@ -205,19 +181,20 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +n_dim: 16 +decoder_chunk_size: -1 transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU -audio_num_tokens: 1000 -audio_dim: 1024 +audio_num_tokens: 1024 audio_emb_size: 128 -audio_emb_freeze: False +audio_emb_freeze: True audio_emb_pretrained: False -audio_emb_lr: 0.00001 -audio_emb_weight_decay: 0.001 text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -229,17 +206,22 @@ audio_tokens_per_step: 6 attention_type: regularMHA ############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref -vocoder: !apply:speechbrain.inference.vocoders.HIFIGAN.from_hparams - source: !ref - savedir: !ref - -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + n_dim: !ref + decoder_chunk_size: !ref nhead: !ref enc_num_layers: !ref dec_num_layers: !ref @@ -247,6 +229,7 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- target_dropout: !ref activation: !ref attention_type: !ref + vocoder: !ref gate_threshold: !ref gate_offset: !ref audio_emb_size: !ref @@ -255,21 +238,23 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref - representation_mode: continuous + emb: !ref +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -282,13 +267,13 @@ compute_cost: !new:Tokotron.TokotronLoss eos_width: !ref audio_tokens_per_step: !ref audio_token_shift: !ref - representation_mode: continuous -lr_annealing: !new:Tokotron.TargetedNoamScheduler - lr_initial: [!ref , !ref ] +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref n_warmup_steps: !ref - param_group: 0 + +generator: 
!new:model.custom_model.SaveableGenerator checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref @@ -296,10 +281,7 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer model: !ref lr_scheduler: !ref counter: !ref - -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..78975b1a0 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -0,0 +1,284 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +cached_data_folder: !PLACEHOLDER +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: vocos +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
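As in Tacotron-style TTS, the decoder predicts a per-frame stop "gate" alongside the audio tokens: with `eos_mode: gate`, autoregressive inference halts once the gate probability crosses `gate_threshold`, while `gate_loss_beta`, `gate_loss_gamma` and `gate_loss_max_weight` shape the ramped penalty applied to the gate predictions in `TokotronLoss`. A conceptual sketch of the stopping rule only; the real logic is inside `TokotronTransformerModel`, and the sigmoid is an assumption:

```python
import torch

def find_stop_frame(gate_logits, threshold=0.5):
    """First frame whose gate probability exceeds the threshold,
    or the full length if the gate never fires (conceptual sketch)."""
    gate_prob = torch.sigmoid(gate_logits)       # assume logits -> probability
    fired = (gate_prob > threshold).nonzero()
    return int(fired[0]) if len(fired) > 0 else len(gate_logits)

# Toy example: the gate fires at frame 3, so decoding stops there.
print(find_stop_frame(torch.tensor([-4.0, -2.0, -1.0, 2.0, 3.0])))   # 3
```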
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 2048 +audio_emb_size: 1024 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 8 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + 
audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py new file mode 120000 index 000000000..489ab4011 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py @@ -0,0 +1 @@ +../../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py new file mode 100644 index 000000000..abb2cda88 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -0,0 +1,1075 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import math +import torch +import sys +from functools import partial +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import clean_padding_ +from speechbrain.utils.distributed import run_on_main +import re +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from model.Tokotron import ( # noqa: E402 + RepresentationMode, + get_silence_repr, + get_silence_token, + use_silence_padding, + feature_pad_to, +) +from evaluate import TokotronEvaluator # noqa: E402 + +logger = logging.getLogger(__name__) + +SPECIAL_TOKEN_COUNT = 1 + + +# Brain class for speech recognition training +class TokotronBrain(sb.Brain): + """Class that manages the training loop. 
See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluator = TokotronEvaluator( + hparams=hparams, + create_waveform_fn=self.create_waveform, + device=self.device, + ) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.modules.tokenizer.codec_vocoder.device = self.device + with torch.no_grad(): + wav = self.modules.tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) + clean_padding_(wav, length) + wav = wav.to(self.device) + return wav + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + tokens, tokens_length = batch.tokens + features = self.prepare_features(batch) + ( + audio_bos, + audio_bos_length, + audio_tgt, + audio_tgt_length, + spk_emb, + ) = features + + predictions = self.modules.model( + input_tokens=tokens, + input_length=tokens_length, + audio=audio_bos, + audio_length=audio_bos_length, + emb={"spk": spk_emb}, + ) + + return predictions, features + + def prepare_features(self, batch): + """Prepares Tokotron TTS features + + Arguments + --------- + batch : PaddedBatch + A batch of data + + Returns + ------- + audio_bos : torch.Tensor + Audio represnetations (discrete or continuous) with the BOS marker + audio_bos_length : torch.Tensor + Relative lengths of audio representations with the BOS marker + audio_tgt : torch.Tensor + Audio prediction targets + audio_tgt_length : torch.Tensor + Audio prediction targets - relative lengths + spk_emb : torch.Tensor + Speaker embeddings + """ + if self.hparams.spk_emb_shuffle: + wav, wav_length = batch.spk_emb_random_match + else: + wav, wav_length = batch.sig + spk_emb = self._compute_spk(wav, wav_length).squeeze(1) + + if self.representation_mode == RepresentationMode.DISCRETE: + audio_bos, audio_bos_length = batch.audio_bos + audio_tgt, audio_tgt_length = batch.audio_pad + else: + wav, audio_length = batch.sig + audio = self.modules.ssl_model(wav) + audio = audio[self.hparams.ssl_model_layers, :, :, :].permute( + 1, 2, 0, 3 + ) + batch_size, _, heads, dim = audio.shape + bos = torch.zeros_like(audio[:, :1, :, :]).reshape( + batch_size, self.hparams.bos_width, heads, dim + ) + audio_bos = torch.concatenate([bos, audio], dim=1) + audio_bos_length = audio_length * audio.size(1) / audio_bos.size(1) + audio_tgt = audio + audio_tgt_length = audio_length + + return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length, spk_emb + + def _compute_spk(self, wav, wav_length): + mel_spec = self.spk_emb_model.mel_spectogram(wav.squeeze(1)) + 
spk_emb_pred = self.spk_emb_model.encode_mel_spectrogram_batch( + mel_spec, wav_length + ) + return spk_emb_pred + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + batch = batch.to(self.device) + + predictions, features = predictions + ( + audio_bos, + audio_bos_length, + audio_tgt, + audio_tgt_length, + spk_emb, + ) = features + + loss_details = self.hparams.compute_cost( + predictions=predictions, + audio=audio_tgt, + audio_length=audio_tgt_length, + input_tokens=batch.tokens.data, + input_length=batch.tokens.lengths, + ) + self.loss_metric.append( + batch.uttid, + predictions=predictions, + audio=audio_tgt, + audio_length=audio_tgt_length, + input_tokens=batch.tokens.data, + input_length=batch.tokens.lengths, + reduction="batch", + ) + return loss_details.loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + if hasattr(self.modules, "vocoder") and hasattr( + self.modules.vocoder, "model" + ): + self.modules.vocoder.model.device = self.device + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.hparams.compute_cost, batch_eval=True, + ) + if ( + self.hparams.audio_emb_pretrained + and epoch == 1 + and stage == sb.Stage.TRAIN + ): + # TODO: Clean this up + if hasattr(self.hparams.token_model, "vocabulary"): + vocabulary = self.hparams.token_model.vocabulary + elif hasattr(self.hparams.token_model, "vocabularies"): + vocabulary = torch.stack( + [ + torch.from_numpy(voc) + for voc in self.hparams.token_model.vocabularies + ] + ) + self.modules.model.init_audio_emb(vocabulary) + # Load the compression model only if compression is enables + pretrained_run_opts = {"device": self.device} + self.spk_emb_model = self.hparams.spk_emb_model( + run_opts=pretrained_run_opts + ) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) + # If speaker embedding shuffling is enabled, re-initialize them for the + # epoch + if self.hparams.spk_emb_shuffle: + stage_key = stage.name.lower() + self.resample_fn[stage_key](epoch=epoch) + + # Reset the learning rate - if supported. 
This is useful when fine-tuning + # a model pre-trained on another dataset + if ( + stage == sb.Stage.TRAIN + and self.hparams.reset_annealing_epoch is not None + and epoch is not None + and epoch == self.hparams.reset_annealing_epoch + ): + self.hparams.lr_annealing.n_steps = 0 + + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + return epoch % self.hparams.eval_interval == 0 + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. + self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + checkpoint = self.checkpointer.recover_if_possible() + if not checkpoint: + self.check_init() + self._ckpt_recovered = True + + def check_init(self): + init_from = getattr(self.hparams, "init_from", None) + if init_from is not None: + logger.info( + "Initializing with pre-trained weights from %s", init_from + ) + init_from_path = Path(init_from) + model_path = init_from_path / "model.ckpt" + with open(model_path, "rb") as model_file: + model_state_dict = torch.load( + model_file, map_location=self.device + ) + tgt_state_dict = self.modules.model.state_dict() + ignore_keys = [] + for k, v in model_state_dict.items(): + if ( + k in tgt_state_dict + and tgt_state_dict[k].shape != v.shape + ): + logger.warning("Ignoring shape mismatch for %s", k) + ignore_keys.append(k) + for k in ignore_keys: + del model_state_dict[k] + self.modules.model.load_state_dict( + model_state_dict, strict=False + ) + logger.info( + "Successfully initialized with pre-trained weights from %s", + init_from, + ) + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. 
+ stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + self.evaluator.evaluate_batch(batch) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_end() + eval_summary = self.evaluator.compute_summary() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. + self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + ) + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. 
+ silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + representation_mode = RepresentationMode( + hparams.get("representation_mode", RepresentationMode.DISCRETE) + ) + + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_ref_pipeline(wav): + """The audio loading pipeline for references + + Arguments + --------- + wav : strƒnum_ + The file path + + Returns + ------- + sig : torch.Tensor + The waveform + """ + sig = sb.dataio.dataio.read_audio(wav) + return sig + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label.upper() + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + use_silence_padding = hparams.get("use_silence_padding", True) + if "token_model_layers" in hparams: + audio_tokens_per_step = len(hparams["token_model_layers"]) + else: + audio_tokens_per_step = hparams["audio_tokens_per_step"] + layer_idx = None + if "speech_model_layers" in hparams: + layer_idx = get_selected_layer_indexes(hparams) + if use_silence_padding: + if representation_mode == RepresentationMode.DISCRETE: + silence_padding = get_silence_token( + hparams["tokenizer"], + num_codebooks=( + hparams["speech_model_layers"] + if "speech_model_layers" in hparams + else audio_tokens_per_step + ), + ) + else: + silence_padding = get_silence_repr(hparams["ssl_model"],) + else: + silence_padding = ( + torch.ones(audio_tokens_per_step, dtype=torch.int64) + * hparams["eos_index"] + ) + + silence_padding = silence_padding.cpu() + if layer_idx: + silence_padding = silence_padding[layer_idx] + else: + silence_padding = silence_padding[:audio_tokens_per_step] + silence_padding_len = int(math.ceil(hparams["silence_padding"])) + bos_width = hparams.get("bos_width", 1) + audio_bos_prefix = ( + torch.ones(bos_width, audio_tokens_per_step) * hparams["bos_index"] + ) + if representation_mode == RepresentationMode.CONTINUOUS: + audio_bos_prefix = audio_bos_prefix.unsqueeze(-1).repeat( + 1, 1, hparams["audio_dim"] + ) + + tokens_loader = hparams.get("tokens_loader") + if layer_idx is not None: + tokens_loader_kwargs = {"num_codebooks": layer_idx} + else: + tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} + + @sb.utils.data_pipeline.takes("uttid") + @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") + def audio_pipeline(id): + audio = tokens_loader.tokens_by_uttid(id, **tokens_loader_kwargs) + audio_pad = feature_pad_to( + audio, len(audio) + silence_padding_len, silence_padding + ) + yield audio_pad + audio_bos = torch.cat([audio_bos_prefix, audio_pad], dim=0) + yield audio_bos + + def spk_emb_random_match(uttid, dataset, spk_sample): + # Sample a speaker-matched embedding + selected_idx = spk_sample[uttid] + + # Retrieve the embedding 
value from the dataset + with dataset.output_keys_as(["sig"]): + spk_emb = dataset[selected_idx]["sig"] + return spk_emb + + dynamic_items = [ + text_pipeline, + tokens_pipeline, + audio_ref_pipeline, + audio_pipeline, + ] + output_keys = [ + "uttid", + "tokens", + "audio_pad", + "audio_bos", + "sig", + "spk_emb_random_match", + ] + + init_sequence_encoder(hparams) + + resample_fn = {} + for dataset in data_info: + dataset_output_keys = ( + output_keys + if dataset == "train" + else output_keys + ["label_norm_eval"] + ) + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dynamic_items, + output_keys=dataset_output_keys, + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + if hparams["spk_emb_shuffle"]: + spk_idx, spk_samplers = group_by_speaker(dynamic_dataset, hparams) + spk_sample = {} + spk_emb_random_match_pipeline = partial( + spk_emb_random_match, + spk_sample=spk_sample, + dataset=dynamic_dataset.filtered_sorted(), + ) + dynamic_dataset.add_dynamic_item( + func=spk_emb_random_match_pipeline, + takes=["uttid"], + provides=["spk_emb_random_match"], + ) + resample_fn[dataset] = partial( + resample_spk, + spk_idx=spk_idx, + sample=spk_sample, + dataset=dynamic_dataset, + spk_samplers=spk_samplers, + ) + resample_fn[dataset](epoch=0) + + sort_datasets(datasets, hparams) + # Exclude samples without phonemes + if hparams["input"] == "phonemes": + for key in datasets: + datasets[key] = datasets[key].filtered_sorted( + key_test={"phn": lambda value: value} + ) + datasets["sample"] = select_sample(hparams, datasets) + return datasets, silence_padding, resample_fn + + +def sort_datasets(datasets, hparams): + """Sorts datasets according to hyperparameters + + Arguments + --------- + datasets : dict + a key -> value dictionary of datasets (the keys are "train", "valid" and "test") + hparams : dict + a dictionary of hyperparameters + """ + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. 
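+    # The sort key is the "length" field of the JSON manifest; "random" keeps
+    # the manifest order and lets the train dataloader shuffle instead.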
+ if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + +def select_sample(hparams, datasets): + """Selects a sample of files for sample generation, freezing the sample if + requested to persist across multiple experiments + + Arguments + --------- + hparams : dict + experiment hyperparameters + datasets : dict + a dictionary of datasets + + Returns + ------- + dataset : speechbrain.dataio.dataset.FilteredSortedDynamicItemDataset + the sample dataset + """ + sample_path = hparams.get("sample_path") + dataset = None + if sample_path is not None: + sample_path = Path(sample_path) + if sample_path.exists(): + with open(sample_path, "r") as sample_file: + data_ids = [line.strip() for line in sample_file] + dataset = FilteredSortedDynamicItemDataset( + datasets["valid"], data_ids + ) + + if dataset is None: + dataset = ( + datasets["valid"] + .batch_shuffle(1) + .filtered_sorted(select_n=hparams["num_audio_samples"]) + ) + if sample_path is not None: + with open(sample_path, "w") as sample_file: + for data_id in dataset.data_ids: + print(data_id, file=sample_file) + return dataset + + +def group_by_speaker(dataset, hparams): + """Groups utterance IDs in a dataset by speaker, for selection. The selection + is stable based on the seed - calling this method multiple times will always + result in the same order + + Arguments + --------- + dataset : torch.Tensor + the dataset from which to select items + hparams : dict + hyperparameters + + Returns + ------- + spk_idx : dict + a str -> int dictionary with a list of utterance indexes + for every speaker + spk_samplers : dict + a reproducible sampler for every speaker + spk_samplers_it : dict + an iterator for each sampler + """ + spk_idx = {} + spk_samplers = {} + speakers = [] + generator = torch.Generator() + generator.manual_seed(hparams["seed"]) + + # Group by speaker + with dataset.output_keys_as(["spk_id"]): + for idx, item in enumerate(dataset): + spk_id = item["spk_id"] + if spk_id not in spk_idx: + spk_idx[spk_id] = [] + spk_idx[spk_id].append(idx) + speakers.append(spk_id) + + # Create a reproducible sampler + for spk_id in speakers: + sampler = hparams["spk_sampler"](data_source=spk_idx[spk_id]) + spk_samplers[spk_id] = sampler + + return spk_idx, spk_samplers + + +def resample_spk(sample, spk_idx, spk_samplers, dataset, epoch): + """Selects new samples + + Arguments + --------- + spk_idx : dict + Data item indexes grouped by speaker + spk_samplers : dict + A sampler for each speaker + spk_samplers_it : dict + An iterator for each speaker + epoch : int + The epoch number + + Returns + ------- + sample : dict + a dictionary with uttids as keys and matching + indexes as values + """ + if epoch is None: + epoch = 0 + spk_samplers_it = {} + for spk_id, sampler in spk_samplers.items(): + sampler.set_epoch(epoch) + spk_samplers_it[spk_id] = iter(sampler) + with dataset.output_keys_as(["uttid", "spk_id"]): + for item in dataset: + spk_item_idx = next(spk_samplers_it[item["spk_id"]]) + dataset_item_idx = spk_idx[item["spk_id"]][spk_item_idx] + 
sample[item["uttid"]] = dataset_item_idx + + +def init_sequence_encoder(hparams): + """Initialize a sequence encoder + + Arguments + --------- + hparams: dict + parsed hyperparameters + prefix: str + the prefix to be prepended to hyperparameter keys, per the naming + convention + + {prefix}_label_encoder: the hparams key for the label encoder + {prefix}_list_file: the hparams key for the list file + + Returns + ------- + encoder: speechbrain.dataio.encoder.TextEncoder + an encoder instance""" + encoder = hparams["label_encoder"] + token_list_file_name = hparams["token_list_file"] + tokens = read_token_list(token_list_file_name) + encoder.add_unk() + encoder.update_from_iterable(tokens, sequence_input=False) + encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + return encoder + + +def get_selected_layer_indexes(hparams): + """Finds the indexes of selected layers + + Arguments + --------- + hparams : dict + Hyperparameters + """ + selected_layers = hparams.get("speech_model_layers") + available_layers = hparams.get("available_speech_model_layers") + if not (selected_layers and available_layers): + return None + layer_idx = [available_layers.index(layer) for layer in selected_layers] + return layer_idx + + +def read_token_list(file_name): + """Reads a simple text file with tokens (e.g. characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. 
+ + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, _, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + from libritts_prepare import prepare_libritts + + # Data preparation, to be run on only one process. 
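+    # run_on_main executes prepare_libritts on the main process only and makes
+    # the other DDP processes wait for it, so the JSON manifests are written once.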
+ if not hparams["skip_prep"]: + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": ( + hparams["test_json"] + if "test" in hparams["splits"] + else None + ), + "sample_rate": hparams["sample_rate"], + "train_split": hparams["train_split"], + "valid_split": hparams["valid_split"], + "test_split": ( + hparams["test_split"] + if "test" in hparams["splits"] + else None + ), + "seed": hparams["seed"], + "model_name": hparams["model"].__class__.__name__, + }, + ) + + # We can now directly create the datasets for training, valid, and test + (datasets, silence_padding, resample_fn) = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_pad", "audio_bos"] + + # Trainer initialization + tts_brain = TokotronBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + tts_brain.sample_data = datasets["sample"] + tts_brain.resample_fn = resample_fn + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. + tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=use_silence_padding( + hparams["train_dataloader_opts"], silence_padding, audio_keys + ), + valid_loader_kwargs=use_silence_padding( + hparams["valid_dataloader_opts"], silence_padding, audio_keys + ), + ) + + # Load best checkpoint for evaluation + if hparams["testing"]: + test_summary_file = next( + Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), + None, + ) + if test_summary_file is not None: + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = {f"{test_key_kind}_key": test_key} + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs, + ) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py new file mode 100644 index 000000000..017a5c367 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -0,0 +1,375 @@ +"""TTS evaluation tools + +Authors + * Artem Ploujnikov 2024 +""" +import json +import torch +import logging +import re +import csv +from speechbrain.utils.metric_stats import MetricStats +from types import SimpleNamespace +from pathlib import Path +from utils.data import undo_batch +from torch import nn + + +logger = logging.getLogger(__name__) + + +class SpeechEvaluationMetricStats(MetricStats): + """An aggregate metric combining multiple speech evaluators + + Arguments + --------- + hparams : dict | SimpleNamespace | object + Raw hyperparameters for evaluation + + device : str + The device on which evaluation will be performed + + """ + + def __init__(self, hparams, device="cpu"): + if isinstance(hparams, dict): + hparams = SimpleNamespace(**hparams) + self.hparams = hparams + self.device = device + modules = self.hparams.modules + self.modules = nn.ModuleDict(modules).to(self.device) + self.enabled_evaluators = set(self.hparams.evaluations.split(",")) + evaluators = 
hparams.evaluators + if evaluators: + self.evaluators = { + key: evaluator_f(run_opts={"device": device}) + for key, evaluator_f in evaluators.items() + if key in self.enabled_evaluators + } + else: + self.evaluators = {} + + if not self.evaluators: + logger.warn( + "No evaluators were defined - this run will produce samples only" + ) + + def on_evaluation_start(self, output_folder="eval"): + """Invoked at the beginning of the evaluation cycle. + + Arguments + --------- + output_folder : str | path-like + The folder to which results will be output + + """ + logger.info("Starting evaluation") + output_folder = Path(output_folder) + self.output_folder = ( + output_folder + if output_folder.is_absolute() + else self.hparams.output_folder / output_folder + ) + self.output_folder.mkdir(parents=True, exist_ok=True) + + self.files = [] + details_keys = list(self.evaluators.keys()) + self.details = {evaluator_key: [] for evaluator_key in details_keys} + self.read_reports() + self.create_reports() + self.item_ids = [] + for evaluator_key in self.enabled_evaluators: + self.evaluators[evaluator_key].on_evaluation_start() + + def on_evaluation_end(self): + """Invoked at the beginning of the evaluation cycle. The default + implementation is a no-op + """ + logger.info("Ending evaluation") + self.write_summary() + for evaluator_key in self.enabled_evaluators: + self.evaluators[evaluator_key].on_evaluation_end() + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + self.files.append(file_name) + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder / f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = { + key: handle_number(value) + for key, value in row.items() + } + self.details[evaluator_key].append(row) + + def get_tracker_file_name(self): + """Determines the file name of the tracker file""" + suffix = ( + f"_{self.hparams.eval_suffix}" if self.hparams.eval_suffix else "" + ) + file_name = f"tracker_{self.hparams.eval_dataset}{suffix}.txt" + return self.output_folder / file_name + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) + evaluator = self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + wavs_ref=bogus_wavs, + length_ref=bogus_length, + ) + + return ["uttid"] + list(result.details.keys()) + + def append(self, ids, wav, length, text, wav_ref, length_ref): + """Appends the result of a single item + + Arguments + --------- + ids : str + 
Utterance IDs + wav : torch.Tensor + Synthesized waveforms + length : torch.Tensor + Relative lengths of the synthesized waveforms + text : list + Ground truth text + wav_ref : torch.Tensor + Reference (ground truth) waveforms + length_ref : torch.Tensor + Reference lengths + """ + with torch.no_grad(): + self.item_ids.extend(ids) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=text, + wavs_ref=wav_ref, + length_ref=length_ref, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, ids, details) + self.details[evaluator_key].extend(details) + + def write_result(self, evaluator_key, ids, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + ids : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(ids, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def write_summary(self, file_name=None): + """Outputs summarized statistics + + Arguments + --------- + file_name : str | path-like + An alternative path to save the file + """ + summary = self.summarize() + if file_name is None: + file_name = self.output_folder / "summary.json" + self.files.append(file_name) + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def summarize(self, field=None): + """Computes the summarized statistics + + Arguments + --------- + field : str, optional + If specified, it will return a specific field + + Returns + ------- + result : dict | float + The summary - or the specified field from the sum + """ + result = { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], key=metric_key, + ).items() + } + for evaluator_key in self.enabled_evaluators: + result.update( + { + f"{evaluator_key}_{stat_key}": value + for stat_key, value in self.evaluators[evaluator_key] + .global_metrics() + .items() + } + ) + if field is not None: + result = result[field] + return result + + def clear(self): + """Deletes all the files that have been created""" + for file_name in self.files: + file_name.unlink() + + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + """Removes any non-ASCII characters from a dictionary + + Arguments + --------- + values : dict + A dictionary of values + + Returns + ------- + result : dict + The same dictionary - but with non-ASCII strings removed""" + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + The key of the metric for which the statistics will be computed + + Returns + ------- + statistics : dict + The desccriptive statistics computed + _mean : the arithmetic mean + _std : the standard deviation + 
_min : the minimum value + _max : the maximum value + _median : the median value + _q1 : the first quartile + _q3 : the third quartile + _iqr : the interquartile ratio + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable. Strings + that look like integers or floats will be converted to integers + or floats. + + Arguments + --------- + value : str + a string value + + Returns + ------- + result : object + The processed result""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? 
+ \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml new file mode 100644 index 000000000..f4e975175 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -0,0 +1,80 @@ +eval_dataset: test +eval_suffix: "" +eval_folder: null +eval_sample_rate: 16000 +eval_spk_sim_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_subset: null +eval_asr_beam_size: 66 +eval_asr_type: encoder_decoder +eval_asr_source: openai/whisper-small +eval_asr_metric_mode: micro +eval_spk_sim_source: microsoft/wavlm-base-sv +evaluations: utmos,asr,spk_sim +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + +# Inference Fit +inference_fit_top_k: [10, 20, 30] +inference_fit_sampling_temperature: [0.7, 0.8, 1.0, 1.2, 1.3] +inference_fit_key_metric: dwer +inference_fit_key_metric_kind: min + +eval_asr: !name:utils.eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + metric_mode: !ref + +eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_spk_sim: !name:utils.eval.SpkSimWavLM + source: !ref + savedir: !ref + model_sample_rate: !ref + +evaluators: + utmos: !ref + asr: !ref + spk_sim: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + spk_sim: + descriptive: ["score"] + +dwer_metric_key: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + macro: asr_dwer_median + micro: asr_dwer_micro + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: !ref + spk_sim: spk_sim_score_mean + +inference_fit_space: + top_k: !ref + sampling_temperature: !ref + +inference_fit_metrics: + utmos: utmos_utmos_mean + dwer: !ref + spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml new file mode 100644 index 000000000..85020bdff --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -0,0 +1,290 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/dac + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
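The header above leaves `run_name` (and, further down, `data_folder`, `cached_data_folder` and `tokens_folder`) as `!PLACEHOLDER` entries that must be supplied at load time. Below is a minimal sketch of how these hyperparameters can be resolved with hyperpyyaml; the run name and paths are hypothetical, not part of the recipe.

```python
# Illustrative only: resolving the !PLACEHOLDER / !ref entries of train_dac.yaml.
# The run name and paths are made-up examples.
from hyperpyyaml import load_hyperpyyaml

with open("hparams/train_dac.yaml") as fin:
    hparams = load_hyperpyyaml(
        fin,
        overrides={
            "run_name": "valle_dac_baseline",
            "data_folder": "/data/LibriTTS",
            "cached_data_folder": "/data/cache",
            "tokens_folder": "/data/tokens/dac",
        },
    )

# output_folder is derived from run_name (and the seed) through !ref
print(hparams["output_folder"])
```

In practice the training script receives the same overrides from the command line (e.g. `--run_name=... --data_folder=...`), which `sb.parse_arguments` collects into an `overrides` value passed to `load_hyperpyyaml`, as the recipe's main script does.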
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +# Duration Filter +duration_min: null +duration_max: null + + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +prep_only: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 +inference_top_k: 20 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +# Sample Selector +sample_selection: default 
+sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + +freeze_lm_head: False + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 2 +flatten: False + +# Model Settings +model_type: 24khz +model_bitrate: 8kbps + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + nbest: !ref + sampling_temperature: !ref + top_k: !ref + + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..24316c3d2 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -0,0 +1,333 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/discrete_ssl +# Seed needs to be set at top of yaml, before 
objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +representation_mode: discrete +output_folder: !ref results/// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache +alignments_folder: null +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +progress_folder: !ref /progress +progress_current: !ref /current +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True + +# Model Settings +ssl_model_type: wavlm +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +use_token_offsets: True + +# Speaker Embeddings +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] + +# Duration Filter +duration_min: null +duration_max: null + + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +prep_only: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 +inference_top_k: 20 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: 
!apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1000 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 6 +flatten: False + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + nbest: !ref + sampling_temperature: !ref + top_k: !ref + + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref / + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + + +compute_cost: 
!name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + + +lr_annealing: !new:model.Tokotron.TargetedNoamScheduler + lr_initial: [!ref , !ref ] + n_warmup_steps: !ref + param_group: 0 + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + token_model_kwargs: !ref + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml new file mode 100644 index 000000000..e78119670 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -0,0 +1,289 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/encodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
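The `tokens_loader` declared just below consumes these pre-extracted tokens at training time; the recipe's `audio_pipeline` (earlier in this diff) fetches them with `tokens_by_uttid`. A rough usage sketch, with a hypothetical path and utterance ID:

```python
# Sketch only: loading pre-extracted codec tokens the way the recipe does.
from utils.tokens import TokensLoader

tokens_loader = TokensLoader(data_path="/data/tokens/encodec")  # assumed path
# Presumably returns a frames-by-codebooks tensor of token indices, restricted
# to the first `num_codebooks` codebooks (8 = audio_tokens_per_step in this config).
audio = tokens_loader.tokens_by_uttid("1089_134686_000001", num_codebooks=8)
```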
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +# Duration Filter +duration_min: null +duration_max: null + + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +prep_only: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 +inference_top_k: 20 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + +freeze_lm_head: False + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: False + +# Model Settings +model_hub: 
facebook/encodec_24khz +bandwidth: 6 + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + nbest: !ref + sampling_temperature: !ref + top_k: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: !ref + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..31b425824 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -0,0 +1,294 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/encodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
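Like the other VALL-E configurations, this file selects several values with `!apply:speechbrain.utils.hparams.choice`. The sketch below shows what the pattern amounts to, assuming `choice` simply returns the entry of `choices` that matches `value`; it mirrors the `train_split` selection defined further down.

```python
# Rough equivalent of the !apply:speechbrain.utils.hparams.choice pattern.
from speechbrain.utils.hparams import choice

data_mode = "lite"  # one of: lite, clean, full
train_split = choice(
    value=data_mode,
    choices={
        "lite": ["train-clean-100"],
        "clean": ["train-clean-100", "train-clean-360"],
        "full": ["train-clean-100", "train-clean-360", "train-other-500"],
    },
)
print(train_split)  # expected: ['train-clean-100']
```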
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +# Duration Filter +duration_min: null +duration_max: null + + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +prep_only: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 +inference_top_k: 20 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small 
+sample_selector_sample_rate: 16000 +sample_selector_debug: False + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: False + +# Model Settings +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml + +freeze_lm_head: True + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + nbest: !ref + sampling_temperature: !ref + top_k: !ref + + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml new file mode 100644 index 000000000..a0d19ce8c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -0,0 +1,287 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: 
Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/mimi + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +# Duration Filter +duration_min: null +duration_max: null + + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +prep_only: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 +inference_top_k: 20 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: 
!name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 2048 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: False + +# Model Settings +model_hub: kyutai/mimi + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 0.0 + maxlenratio: !ref + nq: !ref + nbest: !ref + sampling_temperature: !ref + top_k: !ref + + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml new file mode 100644 index 000000000..52dec45c3 --- /dev/null +++ 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -0,0 +1,286 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speech_tokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +# Duration Filter +duration_min: null +duration_max: null + + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +prep_only: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 +inference_top_k: 20 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + 
value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +flatten: False + +# Model Settings +model_hub: fnlp/SpeechTokenizer + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + nbest: !ref + sampling_temperature: !ref + top_k: !ref + + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: !ref # Only the 24kHz version supports mono audio + save_path: !ref + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + 
debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..93b8bcd09 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -0,0 +1,296 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speech_tokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
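The NoamScheduler configured at the end of the previous file pairs lr: 0.001 with lr_warmup_steps: 70000 and is stepped per optimizer update (lr_annealing_mode: step). A back-of-the-envelope sketch of the resulting curve, assuming the usual Noam normalization in which the rate peaks near lr_initial at the end of warmup and then decays as 1/sqrt(step):

```python
# Rough shape of the warmup/decay schedule implied by lr: 0.001 and
# lr_warmup_steps: 70000 (normalization is an assumption, not taken verbatim
# from the SpeechBrain source).
lr_initial = 0.001
n_warmup_steps = 70_000

def noam_lr(step: int) -> float:
    """Approximate learning rate at a given optimizer step."""
    scale = n_warmup_steps**0.5 * min(step**-0.5, step * n_warmup_steps**-1.5)
    return lr_initial * scale

for step in (1_000, 35_000, 70_000, 140_000, 280_000):
    print(f"step {step:>7}: lr ~ {noam_lr(step):.6f}")
# Ramps up to ~1e-3 at 70k steps, then falls off proportionally to 1/sqrt(step).
```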
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: False +splits: ["train", "valid", "test"] + +# Duration Filter +duration_min: null +duration_max: null + + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +prep_only: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +top_k: 1 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +ternary_d_hidden: 128 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +target_dropout: 0.2 +vocab_size: 19683 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !ref * 2 + +audio_token_shift: 19683 +audio_tokens_per_step: 4 +flatten: True +ternary_num_digits: 10 +pred_mode: ternary + +# Model Settings +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: 1 + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref 
+ target_dropout: !ref + share_emb: !ref + qk_norm: !ref + lm_head: !ref + emb: !ref + logits_to_probs: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: 1 + top_k: !ref + +lm_head: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + d_hidden: !ref + num_positions: !ref * + tokens: null + +logits_to_probs: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryLogitTokenizer + num_tokens: !ref + num_positions: !ref + tokens: !new:torch.nn.Identity + +emb: !new:speechbrain.nnet.containers.Sequential + ternary: !new:model.sq_codec.TernaryEmbedding + num_digits: !ref + flat: True + linear: !new:speechbrain.nnet.linear.Linear + input_size: !ref * + n_neurons: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !name:model.sq_codec.ternary_loss + targets_type: tokens + num_positions: !ref + tokens: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..38831c660 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -0,0 +1,289 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/wavtokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
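All of these recipes keep and select checkpoints by dwer (ckpt_key / test_key) with ckpt_key_kind / test_key_kind set to min. The sketch below spells out how those strings become SpeechBrain Checkpointer keyword arguments; it mirrors the calls made later in train.py (on_stage_end) and inference_fit.py (recover), with the actual checkpointer calls left commented because they need a live checkpointer instance.

```python
# With ckpt_key: dwer and ckpt_key_kind: min, checkpoints are ranked by the
# "dwer" value stored in their meta and only the ckpt_keep best are retained;
# test_key / test_key_kind pick which checkpoint to load for evaluation.
ckpt_key, ckpt_key_kind, ckpt_keep = "dwer", "min", 2
test_key, test_key_kind = "dwer", "min"

# End of a validation epoch (cf. on_stage_end in train.py):
save_kwargs = {f"{ckpt_key_kind}_keys": [ckpt_key]}   # {"min_keys": ["dwer"]}
# checkpointer.save_and_keep_only(
#     meta={"loss": stage_loss, "dwer": dwer}, num_to_keep=ckpt_keep, **save_kwargs
# )

# Before evaluation (cf. recover() in inference_fit.py):
recover_kwargs = {f"{test_key_kind}_key": test_key}   # {"min_key": "dwer"}
# checkpointer.recover_if_possible(**recover_kwargs)
print(save_kwargs, recover_kwargs)
```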
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +use_token_offsets: True +splits: ["train", "valid", "test"] + +# Duration Filter +duration_min: null +duration_max: null + + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +prep_only: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 +inference_top_k: 20 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small 
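Each config in this PR builds a single flat LM vocabulary: the special tokens and the text (or phoneme) inventory come first, and when use_token_offsets is True each codebook then occupies its own vocab_size-wide slice starting at audio_token_shift (for the single-codebook WavTokenizer setup below, the offsets collapse to zero). A sketch of that arithmetic with the 8-codebook, 1024-entry numbers used by the EnCodec-style configs; the assumption that get_offsets yields arange(tracks) * vocab_size is consistent with how train.py applies and then inverts the shift.

```python
# Sketch of the flat vocabulary layout implied by these configs; values are
# illustrative and the offset formula is an assumption matching train.py usage.
import torch

special_num_tokens = 5
text_num_tokens = 39
vocab_size = 1024             # codebook size
audio_tokens_per_step = 8     # number of codebooks (tracks)

audio_token_shift = text_num_tokens + special_num_tokens
offsets = torch.arange(audio_tokens_per_step) * vocab_size  # per-track offset

# codec tokens: (batch, time, tracks) with values in [0, vocab_size)
codec = torch.randint(0, vocab_size, (2, 10, audio_tokens_per_step))

# map into the shared LM vocabulary and back
lm_tokens = codec + audio_token_shift + offsets
recovered = (lm_tokens - audio_token_shift - offsets).clamp(min=0)
assert torch.equal(recovered, codec)

model_vocab_size = (
    text_num_tokens + vocab_size * audio_tokens_per_step + special_num_tokens
)
assert int(lm_tokens.max()) < model_vocab_size
```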
+sample_selector_sample_rate: 16000 +sample_selector_debug: False + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 4096 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 1 +flatten: False + +# Model Settings +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + nbest: !ref + sampling_temperature: !ref + top_k: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py new file mode 100644 index 000000000..759014220 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py @@ -0,0 +1,490 @@ +"""Inference fit grid search for VALL-E + +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e + +Authors + * Artem Ploujnikov 2024 +""" + +import json +import speechbrain as sb +import sys +import csv +import torch 
+import operator +import yaml + +from hyperpyyaml import load_hyperpyyaml +from pathlib import Path +from torch import nn +from tqdm.auto import tqdm +from types import SimpleNamespace +from speechbrain.dataio.dataio import clean_padding +from speechbrain.utils.logger import get_logger +from speechbrain.utils.data_utils import batch_pad_right, pad_right_to + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats # noqa: E402 +from train import undo_padding_tensor, get_offsets # noqa: E402 + +logger = get_logger(__name__) + + +class InferenceFit: + """A wrapper class for hyperparameter fitting + + Arguments + --------- + hparams : dict + Parsed hyperparameters + run_opts : dict + Parsed run options + """ + + def __init__(self, hparams, run_opts): + device = run_opts.get("device", "cpu") + self.hparams = SimpleNamespace(**hparams) + self.modules = nn.ModuleDict(self.hparams.modules).to(device) + self.device = device + self.space = self.hparams.inference_fit_space + self.result = None + self.evaluation_metric = SpeechEvaluationMetricStats( + self.hparams, self.device + ) + self.offsets = get_offsets( + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) + self.output_folder_rel = "eval/inference_fit" + self.output_folder = ( + Path(self.hparams.output_folder) / self.output_folder_rel + ) + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) + + def fit(self, dataset): + """Performs infernece fitting + + Arguments + --------- + dataset: speechbrain.dataio.dataset.DynamicItemDataset + A dataset instance + + Returns + ------- + result: dict + the fit result + """ + self.result = [] + self.recover() + logger.info("Parameter Space: %s", format_space(self.space)) + evaluations = self.enumerate_param_space() + for idx, params in enumerate(tqdm(evaluations, desc="Parameter space")): + if self.is_completed(params): + eval_result = self.get_result(params) + else: + eval_result = self.evaluate(dataset, params) + self.result.append({"idx": idx, **params, **eval_result}) + self.best = self.find_best() + return self.result, self.best + + def is_completed(self, params): + """Determines whether the fitting run has been completed + + Arguments + --------- + params : torch.Tensor + the parameters to evaluate + + Returns + ------- + result : bool + Whether the run has been completed + """ + folder_name = params_to_folder_name(params) + path = self.output_folder / folder_name / "summary.json" + return path.exists() + + def get_result(self, params): + """Retrieves the result for a completed run + + Arguments + --------- + params : torch.Tensor + A hyperparameter search entry + + Returns + ------- + result : dict + The result of the run + """ + params_str = format_params(params) + logger.info("Retrieving params for completed run %s", params_str) + folder_name = params_to_folder_name(params) + path = self.output_folder / folder_name / "summary.json" + with open(path) as summary_file: + summary = json.load(summary_file) + result = { + key: summary.get(value, 0.0) + for key, value in self.hparams.inference_fit_metrics.items() + } + return result + + def find_best(self): + """Finds the best run result based on the metric chosen + + Returns + ------- + result : dict + The best result + """ + best = self.result[0] + op = ( + operator.lt + if 
self.hparams.inference_fit_key_metric_kind == "min" + else operator.gt + ) + for item in self.result[1:]: + value = item[self.hparams.inference_fit_key_metric] + if op(value, best[self.hparams.inference_fit_key_metric]): + best = item + return best + + def enumerate_param_space(self): + """Enumerates the parameter space + + Returns + ------- + result : generator + The parameter space (each element is a dictionary of hyperparameters) + """ + return enumerate_space(self.space) + + def evaluate(self, dataset, params): + """Performs evaluation at a particular point + in the hyperparameter space + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + A dataset instance + params : dict + The hyperparameter dictionary + + Returns + ------- + metrics : dictionary + a key/value dictionary with the metrics computed + """ + dataloader = sb.dataio.dataloader.make_dataloader(dataset) + params_str = format_params(params) + logger.info("Starting evaluation of %s", params_str) + folder_name = params_to_folder_name(params) + self.evaluation_metric.on_evaluation_start( + f"{self.output_folder_rel}/{folder_name}" + ) + for batch in tqdm( + dataloader, desc="Evaluation run", total=len(dataset) + ): + self.evaluate_batch(batch, params) + logger.info("Finished evaluation of %s", params_str) + self.evaluation_metric.on_evaluation_end() + summary = self.evaluation_metric.summarize() + metrics = { + key: summary.get(value, 0.0) + for key, value in self.hparams.inference_fit_metrics.items() + } + return metrics + + def evaluate_batch(self, batch, params): + """Evaluates a single batch + + Arguments + --------- + batch : PaddedBatch + A single batch of data + params : dict + A set of hyperparameters to try""" + batch = batch.to(self.device) + audio_tokens, audio_length = self.inference(batch, params) + wav = self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + + def write_report(self): + """Outputs the hyperparameter fitting report""" + if self.result is None: + logger.warning("Nothing to report") + return + + report_file_name = self.output_folder / "results.csv" + report_file_name.parent.mkdir(parents=True, exist_ok=True) + with open(report_file_name, "w") as report_file: + columns = next(iter(self.result)).keys() + writer = csv.DictWriter(report_file, columns) + writer.writeheader() + for result in self.result: + writer.writerow(result) + best_file_name = self.output_folder / "best.yaml" + with open(best_file_name, "w") as best_file: + yaml.dump(self.best, best_file) + + def inference(self, batch, params): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = self.modules.model.inference + inference_results = [ + inference( + prefix=prefix_item.unsqueeze(0), + opts=self._get_inference_opts(params), + ) + for prefix_item in prefix_items + ] + inferred_tokens = [ + self._pad_inferred_sample(result) for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + audio_length = audio_length.to(self.device) + audio = (audio - 
hparams["audio_token_shift"] - self.offsets).clip(0) + return audio, audio_length + + # TODO: Duplicated in train, consider refactoring + def _pad_inferred_sample(self, result): + """Applies length padding to an inference result + + Arguments + --------- + result : list + The VALL-E Inference output + + Returns + ------- + sample : torch.Tensor + A sample, padded if needed + """ + if result[0]: + sample = result[0][0] + else: + sample = torch.zeros( + 1000, self.hparams.audio_tokens_per_step, device=self.device + ) + min_length = getattr(self.hparams, "infer_min_length", 10) + sample_length, tracks = sample.shape + if sample_length < min_length: + sample = pad_right_to(sample, (min_length, tracks),)[0] + return sample + + def create_waveform(self, audio, length): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + + Returns + ------- + wav : torch.Tensor + """ + tokenizer = self.modules.tokenizer + tokenizer.device = self.device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(self.device) + tokenizer.codec_vocoder.device = self.device + wav = tokenizer.tokens_to_sig(audio, **self.token_model_kwargs) + wav = clean_padding(wav, length) + wav = wav.to(self.device) + return wav + + def _get_inference_opts(self, params): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) + track_start = ( + self.hparams.audio_token_shift + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True + return self.hparams.inference_opts( + masks={self.hparams.bos_index: mask}, **params, device=self.device, + ) + + def recover(self): + """Recovers a checkpoint according to the settings specified""" + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + kwargs = {f"{test_key_kind}_key": test_key} + logger.info("Revovering a checkpoint") + ckpt = self.hparams.checkpointer.recover_if_possible(**kwargs) + if not ckpt: + logger.error("Checkpoint not found - cannot evaluate") + raise ValueError("No checkpoint available") + logger.info("Checkpoint recovered: %s", ckpt) + + +def enumerate_space(space, entry=None, points=None): + """Enumerates the hyperparameter space for a full + grid search + + Arguments + --------- + space : dict + A key -> value dictionary with hyperparameter names as keys + and sets of values to try as values + entry : dict + The entry being constructed + points : list + The list of points being constructed + + Returns + ------- + result : list + All configurations to try + """ + if points is None: + points = [] + if not space: + points.append(entry) + return points + if entry is None: + entry = {} + key, values = next(iter(space.items())) + rest = dict(space) + del rest[key] + for value in values: + enumerate_space(rest, {**entry, key: value}, points) + return points + + +def format_space(space): + """Formats a hyperparameter 
space for display + + Arguments + --------- + space : dict + A space definition + + Returns + ------- + result : str + A formatted space for display""" + return ", ".join( + f"{parameter}: {values}" for parameter, values in space.items() + ) + + +def format_params(params): + """Formats a set of hyperparameters (a single point in the hyperparameter + space) for display + + Arguments + --------- + params : dict + A dictionary of hyperparameter values + + Returns + ------- + result : str + A formatted hyperparameter dictionary + """ + return ", ".join(f"{key}={value}" for key, value in params.items()) + + +def params_to_folder_name(params): + """Formats a dictionary of hyperparameters as a folder name (for ease of reading) + + Arguments + --------- + params : dict + A dictionary of hyperparameter values + + Returns + ------- + result : str + The corresponding folder name""" + params_str = "-".join(f"{key}-{value}" for key, value in params.items()) + return f"eval-{params_str}" + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml_content = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if not eval_hparams_file.exists(): + eval_hparams_file = Path(__file__).parent / "hparams" / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml_content = "\n".join([yaml_content, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml( + yaml_content, overrides, overrides_must_match=True + ) + from train import dataio_prepare, select_eval_subset # noqa + + datasets, _ = dataio_prepare(hparams) + dataset = datasets["valid"] + dataset = select_eval_subset(dataset, hparams) + + inference_fit = InferenceFit(hparams, run_opts) + inference_fit.fit(dataset) + inference_fit.write_report() diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py b/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py new file mode 120000 index 000000000..489ab4011 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py @@ -0,0 +1 @@ +../../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py b/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py new file mode 100644 index 000000000..896e6e4f5 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py @@ -0,0 +1,106 @@ +"""A script to prepare annotations for tokenizers + +""" + +import json +import re + +from pathlib import Path +from speechbrain.lobes.models.g2p.dataio import build_token_char_map +from speechbrain.utils.logger import get_logger + + +logger = get_logger(__name__) +MULTI_SPACE = re.compile(r"\s{2,}") + + +def phn2txt(phn, phoneme_map): + """Encodes phonemes using a character map for use with SentencePiece + + Arguments + --------- + phn: list + a list of original phonemes (ARPABET) + phoneme_map: dict + the phoneme-to-character map + + Returns + ------- + value: str + the mapped string representation + """ + value = "".join(phoneme_map[phoneme] for phoneme in phn).strip() + value = MULTI_SPACE.sub(" ", value) + return value + + +def prepare_annotation(src, 
destination_file_name, phonemes): + """Prepares the annotation file + + Arguments + --------- + src: datasets.arrow_dataset.Dataset + the source dataset + destination_file_name: str + the path to the annotation file to be created + phonemes: list + the list of phonemes + """ + phoneme_map = build_token_char_map(phonemes) + annotation = { + key: { + "label": item["label"], + "phonemes": phn2txt(item["phn"], phoneme_map), + } + for key, item in src.items() + } + with open(destination_file_name, "w", encoding="utf-8") as dst_file: + json.dump(annotation, dst_file, indent=2) + + +DATA_SPLITS = ["train", "valid", "test"] + + +def prepare_tokenizer(splits, save_folder, input, phonemes): + """Prepares annotations for the tokenizer + + Arguments + --------- + datasets: list + the list of dataset splits + save_folder: str + the path to the folder where annotations will be saved + input : str + identifies what type of input will be used (text or phonemes) + phonemes: list + the list of phonemes + """ + save_folder = Path(save_folder) + if input == "text": + for key in splits: + src_file_name = save_folder / f"{key}.json" + destination_file_name = ( + save_folder / f"tokenizer_annotation_{key}.json" + ) + destination_file_name.symlink_to(src_file_name) + else: + for key in splits: + destination_file_name = ( + save_folder / f"tokenizer_annotation_{key}.json" + ) + if destination_file_name.exists(): + logger.info( + "Annotation file '%s' already exists", destination_file_name + ) + else: + logger.info( + "Creating tokenizer annotation '%s'", destination_file_name, + ) + data_file_name = save_folder / f"{key}.json" + with open(data_file_name) as data_file: + data = json.load(data_file) + prepare_annotation( + src=data, + destination_file_name=destination_file_name, + phonemes=phonemes, + ) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py new file mode 100644 index 000000000..6cf1c7eca --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -0,0 +1,1489 @@ +#!/usr/bin/env/python3 +"""Recipe for training VALL-E + +Based on ESPNET VALL-E + +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import torch +import sys +import shutil +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataio import ( + clean_padding, + length_to_mask, + write_audio, +) +from speechbrain.dataio.dataloader import LoopedLoader +from speechbrain.utils.data_utils import pad_right_to +from speechbrain.utils.distributed import run_on_main +from speechbrain.utils.data_utils import batch_pad_right +from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from functools import partial +from torch.utils.data import DataLoader +import re +import os +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats # noqa: E402 +from model.valle import DefaultSampleSelector # noqa: E402 + +logger = logging.getLogger(__name__) + + +# Brain class for speech recognition training +class VALLEBrain(sb.Brain): + """Class that manages the training loop. 
See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluation_metric = SpeechEvaluationMetricStats( + self.hparams, self.device + ) + + def create_waveform(self, audio, length): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + length : torch.Tensor + A 1-D tensor of relative lengths + + Returns + ------- + wav : torch.Tensor + """ + tokenizer = ( + self.modules.tokenizer.module + if hasattr(self.modules.tokenizer, "module") + else self.modules.tokenizer + ) + tokenizer.device = self.device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(self.device) + tokenizer.codec_vocoder.device = self.device + wav = tokenizer.tokens_to_sig(audio, **self.token_model_kwargs) + wav = clean_padding(wav, length) + wav = wav.to(self.device) + return wav + + def compute_forward(self, batch, stage): + """Runs the forward computation of the VALL-E TTS model + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : tuple + The (logits_ar, logits_nar, nar_track) predictions + """ + batch = batch.to(self.device) + prompt, prompt_length = batch.prompt + batch_size, prompt_max_len, num_tracks = prompt.shape + nar_track = None + if self.train_nar: + nar_track = torch.randint( + 1, num_tracks, (batch_size,), device=self.device + ) + logits_ar, logits_nar = self.modules.model( + dec_seq=batch.prompt.data, + dec_seq_lengths=batch.prompt.lengths, + prefix_len=batch.prefix_length / prompt_max_len, + nar_level_idx=nar_track, + predict_ar=self.train_ar, + predict_nar=self.train_nar, + ) + return logits_ar, logits_nar, nar_track + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. The loss + is the mean of the autoregressive and non-autoregressive negative + log-likelihood terms, depending on which of the two is being trained. + + Arguments + --------- + predictions : tuple + The (logits_ar, logits_nar, nar_track) output of `compute_forward`. + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient.
+ """ + batch = batch.to(self.device) + + logits_ar, logits_nar, nar_track = predictions + prompt, prompt_length = batch.prompt + prefix_length = batch.prefix_length + + batch_size, prompt_max_len, _ = prompt.shape + batch_idx = torch.arange(batch_size, device=prompt.device) + length_mask = length_to_mask( + prompt_length * prompt_max_len, prompt_max_len + ) + prefix_mask = length_to_mask( + prefix_length, prompt_max_len + ).logical_not() + mask = (length_mask * prefix_mask)[:, 1:] + + loss_components = [] + + if self.train_ar: + logits_ar_sm = self.hparams.log_softmax(logits_ar) + if self.hparams.flatten: + targets_ar = prompt[:, 1:] + else: + targets_ar = prompt[:, 1:, 0] + loss_ar = self.hparams.compute_cost( + logits_ar_sm, targets=targets_ar, mask=mask + ) + loss_components.append(loss_ar) + else: + logits_ar_sm, targets_ar = None, None + if self.train_nar: + logits_nar_sm = self.hparams.log_softmax(logits_nar) + targets_nar = prompt[batch_idx, 1:, nar_track] + loss_nar = self.hparams.compute_cost( + logits_nar_sm, targets=targets_nar, mask=mask, + ) + loss_components.append(loss_nar) + else: + logits_nar_sm, targets_nar = None, None + + self.loss_metric.append( + ids=batch.uttid, + logits_ar=logits_ar_sm, + targets_ar=targets_ar, + logits_nar=logits_nar_sm, + targets_nar=targets_nar, + mask=mask, + reduction="batch", + ) + + loss = torch.mean(torch.stack(loss_components)) + return loss + + def compute_loss_stats( + self, + logits_ar, + targets_ar, + logits_nar, + targets_nar, + mask, + reduction="batch", + ): + """Computes an autoregressive/non-autoregressive loss breakdown, + to be used for metrics/stats + + Arguments + --------- + logits_ar : torch.Tensor + The autoregressive predictions + targets_ar : torch.Tensor + The targets for autoregressive predictions + logits_nar : torch.Tensor + The non-autoregressive predictions + targets_nar : torch.Tensor + The targets for non-autoregressive prediction + + Returns + ------- + stats: dict + statistics + """ + stats = {} + if self.train_ar: + stats["loss_ar"] = self.hparams.compute_cost( + logits_ar, targets=targets_ar, mask=mask, reduction=reduction, + ) + if self.train_nar: + stats["loss_nar"] = self.hparams.compute_cost( + logits_nar, targets=targets_nar, mask=mask, reduction=reduction, + ) + return stats + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
+ """ + self.offsets = get_offsets( + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) + + if hasattr(hparams, "speech_model_layers"): + self.layer_idx = get_selected_layer_indexes( + hparams.available_speech_model_layers, + hparams.speech_model_layers, + ) + else: + self.layer_idx = None + + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.compute_loss_stats, batch_eval=True, + ) + self.apply_curriculum() + + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) + dataset = stage.name.lower() + self.resample_fn[dataset](epoch=epoch or 0) + self.init_sample_selector(stage) + + def init_sample_selector(self, stage): + """Initializes the sample selector""" + if stage == sb.Stage.TRAIN: + self.sample_selector = None + else: + sample_selector = getattr(self.hparams, "sample_selector", None) + if not sample_selector: + sample_selector = DefaultSampleSelector + self.sample_selector = sample_selector( + token_shift=self.hparams.audio_token_shift, offsets=self.offsets + ) + + def apply_curriculum(self): + """Applies curriculum settings, if specified, training only the autoregressive part - or + only the non-autoregressive part""" + epoch = self.hparams.epoch_counter.current + self.train_ar, self.train_nar = True, True + lm_head = ( + self.modules.model.module.lm_head + if hasattr(self.modules.model, "module") + else self.modules.model.lm_head + ) + lm_head.requires_grad_(True) + if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: + # NOTE: If there is only one track it's autoregressive + self.train_nar = False + elif ( + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ): + self.train_nar = False + elif self.hparams.number_of_epochs_nar is not None and epoch <= ( + self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar + ): + self.train_ar = False + if self.hparams.freeze_lm_head: + lm_head.requires_grad_(False) + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + # NOTE: Need to get past AR-only training to be able to evaluate + can_evaluate = not ( + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ) + return can_evaluate and (epoch % self.hparams.eval_interval == 0) + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. 
+ self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + with torch.no_grad(): + audio_tokens, audio_length = self.inference(batch) + if self.hparams.flip_layers: + audio_tokens = audio_tokens.flip(2) + wav = self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.save_samples( + batch=batch, wav=wav, length=audio_length, stage=stage + ) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + eval_summary_stats = {} + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_evaluating: + self.evaluation_metric.on_evaluation_end() + self.save_eval(stage) + eval_summary = self.evaluation_metric.summarize() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr, **eval_summary_stats}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } + # Save the current checkpoint and delete previous checkpoints. 
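+            # ckpt_key_kind ("min" or "max") and ckpt_key select which metric governs checkpoint retention.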
+ self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs, + ) + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + + def inference(self, batch): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = ( + self.modules.model.module.inference + if hasattr(self.modules.model, "module") + else self.modules.model.inference + ) + logger.info("Running inference") + inference_results = [ + inference( + prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() + ) + for prefix_item in prefix_items + ] + logger.info("Running selection") + inference_results = [ + self.sample_selector.select(tokens, scores, label) + for (tokens, scores), label in zip( + inference_results, batch.label_norm_eval + ) + ] + inferred_tokens = [ + self._pad_inferred_sample(result) for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + audio_length = audio_length.to(self.device) + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) + return audio, audio_length + + def _pad_inferred_sample(self, result): + """Applies length padding to an inference result + + Arguments + --------- + result : list + The VALL-E Inference output + + Returns + ------- + sample : torch.Tensor + A sample, padded if needed + """ + if result is not None: + sample = result + else: + sample = torch.zeros( + 1000, self.hparams.audio_tokens_per_step, device=self.device + ) + min_length = getattr(self.hparams, "infer_min_length", 10) + sample_length, tracks = sample.shape + if sample_length < min_length: + sample = pad_right_to(sample, (min_length, tracks),)[0] + return sample + + def _get_inference_opts(self): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) + track_start = ( + self.hparams.audio_token_shift + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True + return self.hparams.inference_opts( + masks={self.hparams.bos_index: mask}, device=self.device, + ) + + def save_samples(self, batch, wav, length, stage): + """Saves audio samples + + Arguments + --------- + batch : PaddedBatch + An audio batch + wav : torch.Tensor + Generated audio + length : torch.Tensor + Relative lengths + stage : speechbrain.Stage + The training stage + """ + output_folder = self._get_eval_output_folder(stage) + samples = undo_padding_tensor(wav, length) + for uttid, sample in zip(batch.uttid, samples): + file_name = output_folder / f"pred_{uttid}.wav" + write_audio(file_name, sample.cpu(), self.hparams.model_sample_rate) + + def 
save_eval(self, stage): + """Saves evaluation results + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + """ + output_folder = self._get_eval_output_folder(stage) + for src_file_name in self.evaluation_metric.files: + dest_file_name = output_folder / src_file_name.name + shutil.copyfile(src_file_name, dest_file_name) + self.evaluation_metric.clear() + + def _get_eval_output_folder(self, stage): + epoch = self.hparams.epoch_counter.current + eval_folder_name = None + if stage == sb.Stage.TEST and self.hparams.eval_folder: + eval_folder_name = self.hparams.eval_folder + if not eval_folder_name: + eval_folder_name = stage.name.lower() + if self.hparams.eval_suffix: + eval_folder_name += self.hparams.eval_suffix + output_folder = ( + Path(self.hparams.output_folder) / "eval" / eval_folder_name + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(exist_ok=True, parents=True) + return output_folder + + def fit_batch(self, batch): + """Fit one batch, using the default implementation with per-step + annealing + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for training. Default implementation assumes + this batch has two elements: inputs and targets. + + Returns + ------- + detached loss + """ + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + def fit( + self, + epoch_counter, + train_set, + valid_set=None, + progressbar=None, + train_loader_kwargs={}, + valid_loader_kwargs={}, + ): + """Iterate epochs and datasets to improve objective. + + Relies on the existence of multiple functions that can (or should) be + overridden. The following methods are used and expected to have a + certain behavior: + + * ``fit_batch()`` + * ``evaluate_batch()`` + * ``update_average()`` + + If the initialization was done with distributed_count > 0 and the + distributed_backend is ddp, this will generally handle multiprocess + logic, like splitting the training data into subsets for each device and + only saving a checkpoint on the main process. + + Arguments + --------- + epoch_counter : iterable + Each call should return an integer indicating the epoch count. + train_set : Dataset, DataLoader + A set of data to use for training. If a Dataset is given, a + DataLoader is automatically created. If a DataLoader is given, it is + used directly. + valid_set : Dataset, DataLoader + A set of data to use for validation. If a Dataset is given, a + DataLoader is automatically created. If a DataLoader is given, it is + used directly. + progressbar : bool + Whether to display the progress of each epoch in a progressbar. + train_loader_kwargs : dict + Kwargs passed to `make_dataloader()` for making the train_loader + (if train_set is a Dataset, not DataLoader). + E.G. batch_size, num_workers. + DataLoader kwargs are all valid. + valid_loader_kwargs : dict + Kwargs passed to `make_dataloader()` for making the valid_loader + (if valid_set is a Dataset, not DataLoader). + E.g., batch_size, num_workers. + DataLoader kwargs are all valid. + + Returns + ------- + None + """ + if self.test_only: + logger.info( + "Test only mode, skipping training and validation stages." 
+ ) + return + if not ( + isinstance(train_set, DataLoader) + or isinstance(train_set, LoopedLoader) + ): + train_set = self.make_dataloader( + train_set, stage=sb.Stage.TRAIN, **train_loader_kwargs + ) + self.on_fit_start() + epoch = self.hparams.epoch_counter.current + if epoch < self.hparams.number_of_epochs: + valid_set = sample_dataset( + dataset=valid_set, + count=self.hparams.valid_inter_data_count, + seed=self.hparams.seed, + ) + + valid_set = self.make_dataloader( + valid_set, + stage=sb.Stage.VALID, + ckpt_prefix=None, + **valid_loader_kwargs, + ) + + if progressbar is None: + progressbar = not self.noprogressbar + + # Only show progressbar if requested and main_process + enable = progressbar and sb.utils.distributed.if_main_process() + + # Iterate epochs + for epoch in epoch_counter: + self._fit_train(train_set=train_set, epoch=epoch, enable=enable) + self._fit_valid(valid_set=valid_set, epoch=epoch, enable=enable) + + # Debug mode only runs a few epochs + if ( + self.debug + and epoch == self.debug_epochs + or self._optimizer_step_limit_exceeded + ): + break + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. + silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + offsets = get_offsets( + hparams["vocab_size"], hparams["audio_tokens_per_step"] + ).unsqueeze(0) + if not hparams["use_token_offsets"]: + offsets = torch.zeros_like(offsets) + if hparams["flip_layers"]: + offsets = offsets.flip(-1) + + tokens_loader = hparams.get("tokens_loader") + spk_prompt_length = hparams["spk_prompt_length"] + + layer_idx = None + if "speech_model_layers" in hparams: + layer_idx = get_selected_layer_indexes( + hparams["available_speech_model_layers"], + hparams["speech_model_layers"], + ) + + if layer_idx is not None: + num_codebooks = layer_idx + else: + num_codebooks = hparams["audio_tokens_per_step"] + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label_norm + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + def spk_prompt(uttid, spk_sample): + # Sample a speaker-matched embedding + selected_uttid = spk_sample[uttid] + audio = tokens_loader.tokens_by_uttid( + selected_uttid, num_codebooks=num_codebooks + ) + if audio.size(0) > 
spk_prompt_length: + offset = torch.randint(0, audio.size(0), (1,)).item() + else: + offset = 0 + # Retrieve the embedding value from the dataset + audio_spk_prompt, _ = pad_right_to( + audio[offset : offset + spk_prompt_length], + (spk_prompt_length, audio.size(1)), + ) + return audio_spk_prompt + + @sb.utils.data_pipeline.takes("uttid", "tokens", "spk_prompt") + @sb.utils.data_pipeline.provides( + "audio", "prefix", "prompt", "prefix_length", "length" + ) + def prompt_pipeline(id, tokens, spk_prompt): + audio = tokens_loader.tokens_by_uttid(id, num_codebooks=num_codebooks) + if hparams["flip_layers"]: + audio = audio.flip(-1) + yield audio + num_tracks = audio.size(1) + prefix = torch.cat( + [ + torch.ones(1, num_tracks) * hparams["bos_index"], + tokens.unsqueeze(-1).expand(len(tokens), num_tracks), + torch.ones(1, num_tracks) * hparams["eot_index"], + spk_prompt + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eop_index"], + ] + ) + yield prefix + prompt = torch.cat( + [ + prefix, + torch.ones(1, num_tracks) * hparams["bos_index"], + audio + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eos_index"], + ] + ).int() + yield prompt + yield len(prefix) + yield len(prompt) + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def sig_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + return sig + + dynamic_items = [text_pipeline, tokens_pipeline, sig_pipeline] + + init_sequence_encoder(hparams) + use_spk_emb = hparams.get("use_spk_emb", False) + prepared_features = ["audio_tokens"] + output_keys = [ + "uttid", + "tokens", + "label_norm", + "audio", + "prompt", + "prefix_length", + "length", + ] + if use_spk_emb: + prepared_features.append("spk_emb") + output_keys.append("spk_emb") + + resample_fn = {} + for dataset in data_info: + dataset_dynamic_items = list(dynamic_items) + dataset_output_keys = list(output_keys) + if dataset != "train": + dataset_output_keys += ["sig", "label_norm_eval", "prefix"] + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dataset_dynamic_items, + output_keys=dataset_output_keys, + ) + spk_idx, spk_samplers = group_by_speaker(dynamic_dataset, hparams) + spk_sample = {} + spk_prompt_pipeline = partial(spk_prompt, spk_sample=spk_sample,) + dynamic_dataset.add_dynamic_item( + func=spk_prompt_pipeline, takes=["uttid"], provides=["spk_prompt"], + ) + dynamic_dataset.add_dynamic_item(prompt_pipeline) + resample_fn[dataset] = partial( + resample_spk, + spk_idx=spk_idx, + sample=spk_sample, + dataset=dynamic_dataset, + spk_samplers=spk_samplers, + ) + resample_fn[dataset](epoch=0) + dataset = filter_alignments(dataset, hparams) + dataset = filter_duration(dataset, hparams) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + + sort_datasets(datasets, hparams) + + return datasets, resample_fn + + +def filter_duration(dataset, hparams): + """Filters the dataset by sample duration + + Arguments + --------- + dataset: speechbrain.dataio.dataset.DynamicItemDataset + A dataset + hparams: dict + Hyperparameters + + Returns + ------- + result : speechbrain.dataio.dataset.DynamicItemDataset + A filtered dataset + """ + duration_min = hparams.get("duration_min") + duration_max = hparams.get("duration_max") + if duration_min or duration_max: + key_min_value = None + key_max_value = None + if duration_min: + key_min_value = 
{"duration": duration_min}
+        if duration_max:
+            key_max_value = {"duration": duration_max}
+        dataset = dataset.filtered_sorted(
+            key_min_value=key_min_value, key_max_value=key_max_value,
+        )
+    return dataset
+
+
+def filter_alignments(dataset, hparams):
+    """Filters the dataset by the presence of alignments if
+    phonemes are selected as a source
+
+    Arguments
+    ---------
+    dataset: speechbrain.dataio.dataset.DynamicItemDataset
+        A dataset
+    hparams: dict
+        Hyperparameters
+
+    Returns
+    -------
+    result : speechbrain.dataio.dataset.DynamicItemDataset
+        A filtered dataset
+    """
+    if hparams["input"] == "phonemes":
+        dataset = dataset.filtered_sorted(
+            key_test={"has_alignments": lambda value: value}
+        )
+    return dataset
+
+
+def sort_datasets(datasets, hparams):
+    """Sorts datasets according to hyperparameters
+
+    Arguments
+    ---------
+    datasets : dict
+        a key -> value dictionary of datasets (the keys are "train", "valid" and "test")
+    hparams : dict
+        a dictionary of hyperparameters
+    """
+    # Sorting training data in ascending order makes the code much
+    # faster because it minimizes zero-padding. In most cases, this
+    # does not harm performance.
+    if hparams["sorting"] == "ascending":
+        datasets["train"] = datasets["train"].filtered_sorted(sort_key="length")
+        hparams["train_dataloader_opts"]["shuffle"] = False
+
+    elif hparams["sorting"] == "descending":
+        datasets["train"] = datasets["train"].filtered_sorted(
+            sort_key="length", reverse=True
+        )
+        hparams["train_dataloader_opts"]["shuffle"] = False
+
+    elif hparams["sorting"] == "random":
+        if not hparams["overfit_test"]:
+            hparams["train_dataloader_opts"]["shuffle"] = True
+    else:
+        raise NotImplementedError(
+            "sorting must be random, ascending or descending"
+        )
+
+
+def sample_dataset(dataset, count, seed):
+    """Selects a sample of the specified dataset in a
+    stable manner, returning the same sample on each call
+
+    Arguments
+    ---------
+    dataset : speechbrain.dataio.dataset.DynamicItemDataset
+        A dataset
+    count : int
+        The number of items to select
+    seed : int
+        The seed to be used
+
+    Returns
+    -------
+    dataset : speechbrain.dataio.dataset.DynamicItemDataset
+        A fixed random subset of the original dataset
+    """
+    if len(dataset) < count:
+        return dataset
+    generator = torch.Generator()
+    generator.manual_seed(seed)
+    indexes = torch.randperm(len(dataset), generator=generator).tolist()
+    data_ids = [dataset.data_ids[idx] for idx in indexes[:count]]
+    return FilteredSortedDynamicItemDataset(dataset, data_ids,)
+
+
+def get_offsets(vocab_size, tracks):
+    """Adds offsets to each track to treat the tokens as distinct
+
+    Arguments
+    ---------
+    vocab_size : int
+        The vocabulary size, for each track
+    tracks : int
+        The number of tracks
+
+    Returns
+    -------
+    offsets : torch.Tensor
+        A tensor of shape (tracks,) with one offset per track
+    """
+    return torch.arange(tracks) * vocab_size
+
+
+def apply_mem_fraction():
+    """Applies the memory fraction, based on environment variables, useful for cases where
+    multiple experiments share a large GPU"""
+    if not torch.cuda.is_available():
+        return
+    mem_fraction = os.environ.get("SB_CUDA_MEM_FRACTION")
+    if mem_fraction:
+        fraction, device = mem_fraction.split(":")
+        fraction, device = float(fraction), int(device)
+        logger.info("Using %f of GPU %d", fraction, device)
+        torch.cuda.set_per_process_memory_fraction(fraction, device)
+
+
+def group_by_speaker(dataset, hparams):
+    """Groups utterance IDs in a dataset by speaker, for selection. The selection
+    is stable based on the seed - calling this method multiple times will always
+    result in the same order
+
+    Arguments
+    ---------
+    dataset : speechbrain.dataio.dataset.DynamicItemDataset
+        the dataset from which to select items
+    hparams : dict
+        hyperparameters
+
+    Returns
+    -------
+    spk_uttid : dict
+        a str -> list mapping from each speaker ID to the utterance IDs
+        belonging to that speaker
+    spk_samplers : dict
+        a reproducible sampler for every speaker
+    """
+    spk_uttid = {}
+    spk_samplers = {}
+    speakers = []
+    generator = torch.Generator()
+    generator.manual_seed(hparams["seed"])
+
+    # Group by speaker
+    with dataset.output_keys_as(["spk_id", "uttid"]):
+        for idx, item in enumerate(dataset):
+            spk_id = item["spk_id"]
+            if spk_id not in spk_uttid:
+                spk_uttid[spk_id] = []
+            spk_uttid[spk_id].append(item["uttid"])
+            speakers.append(spk_id)
+
+    # Create a reproducible sampler
+    for spk_id in speakers:
+        sampler = hparams["spk_sampler"](data_source=spk_uttid[spk_id])
+        spk_samplers[spk_id] = sampler
+
+    return spk_uttid, spk_samplers
+
+
+def resample_spk(sample, spk_idx, spk_samplers, dataset, epoch):
+    """Selects a new speaker-matched utterance for every dataset item,
+    updating the provided sample dictionary in place
+
+    Arguments
+    ---------
+    sample : dict
+        The uttid -> selected uttid mapping to be updated
+    spk_idx : dict
+        Data item utterance IDs grouped by speaker
+    spk_samplers : dict
+        A sampler for each speaker
+    dataset : speechbrain.dataio.dataset.DynamicItemDataset
+        The dataset being resampled
+    epoch : int
+        The epoch number
+    """
+    if epoch is None:
+        epoch = 0
+    spk_samplers_it = {}
+    for spk_id, sampler in spk_samplers.items():
+        sampler.set_epoch(epoch)
+        spk_samplers_it[spk_id] = iter(sampler)
+    with dataset.output_keys_as(["uttid", "spk_id"]):
+        for item in dataset:
+            spk_item_idx = next(spk_samplers_it[item["spk_id"]])
+            dataset_item_idx = spk_idx[item["spk_id"]][spk_item_idx]
+            sample[item["uttid"]] = dataset_item_idx
+
+
+def init_sequence_encoder(hparams):
+    """Initialize a sequence encoder
+
+    Arguments
+    ---------
+    hparams: dict
+        parsed hyperparameters. The following keys are used:
+
+        label_encoder: the label encoder instance
+        token_list_file: the path to the token list file
+        special_tokens: the special tokens to register, each with a
+        corresponding {token}_index entry
+
+    Returns
+    -------
+    encoder: speechbrain.dataio.encoder.TextEncoder
+        an encoder instance"""
+    encoder = hparams["label_encoder"]
+    token_list_file_name = hparams["token_list_file"]
+    tokens = read_token_list(token_list_file_name)
+    encoder.add_unk()
+    for token in hparams["special_tokens"]:
+        token_key = token.replace("<", "").replace(">", "")
+        token_index = hparams[f"{token_key}_index"]
+        encoder.insert_label(token, token_index)
+
+    encoder.update_from_iterable(tokens, sequence_input=False)
+    encoder.expect_len(len(tokens) + hparams["special_num_tokens"])
+    return encoder
+
+
+def get_selected_layer_indexes(available_layers, selected_layers):
+    """Finds the indexes of the selected layers among the available layers
+
+    Arguments
+    ---------
+    available_layers : list
+        The available layers
+    selected_layers : list
+        The selected layers
+
+    Returns
+    -------
+    layer_idx : list
+        The layer indexes
+    """
+    if not (selected_layers and available_layers):
+        return None
+    layer_idx = [available_layers.index(layer) for layer in selected_layers]
+    return layer_idx
+
+
+def read_token_list(file_name):
+    """Reads a simple text file with tokens (e.g.
characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. + + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, dataset_valid, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) + + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +def select_eval_subset(dataset, hparams, key="eval_subset"): + """Selects a subset of the dataset provided, if specified. 
+
+    The selection is controlled by a hyperparameter named
+    eval_subset, which is expected to list the IDs of the
+    data items on which evaluation will take place, one per line
+
+    Arguments
+    ---------
+    dataset : speechbrain.dataio.dataset.DynamicItemDataset
+        A dataset
+    hparams : dict
+        A hyperparameters dictionary
+
+    Returns
+    -------
+    subset : speechbrain.dataio.dataset.DynamicItemDataset
+        The dataset, filtered down if applicable
+    """
+    eval_subset_path = hparams.get(key)
+    if eval_subset_path is not None:
+        eval_subset_path = Path(eval_subset_path)
+        if not eval_subset_path.exists():
+            raise ValueError(f"eval_subset {eval_subset_path} does not exist")
+        with open(eval_subset_path) as eval_subset_file:
+            eval_subset_ids = [line.strip() for line in eval_subset_file]
+        existing_ids = dataset.data_ids
+        eval_subset_ids = [
+            uttid for uttid in eval_subset_ids if uttid in existing_ids
+        ]
+        if not eval_subset_ids:
+            raise ValueError(
+                f"{eval_subset_path}: no items found in the dataset"
+            )
+        subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids)
+    else:
+        subset = dataset
+    return subset
+
+
+def undo_padding_tensor(batch, lengths):
+    """Produces a list of variable-length tensors given a batch of
+    sentences with their corresponding relative lengths.
+
+    Arguments
+    ---------
+    batch : torch.Tensor
+        Batch of sentences gathered in a batch.
+    lengths : torch.Tensor
+        Relative length of each sentence in the batch.
+
+    Returns
+    -------
+    as_list : list
+        A python list of the corresponding input tensors.
+
+    Example
+    -------
+    >>> batch=torch.rand([4,100])
+    >>> lengths=torch.tensor([0.5,0.6,0.7,1.0])
+    >>> snt_list=undo_padding_tensor(batch, lengths)
+    >>> len(snt_list)
+    4
+    """
+    batch_max_len = batch.shape[1]
+    as_list = []
+    for seq, seq_length in zip(batch, lengths):
+        actual_size = int(torch.round(seq_length * batch_max_len))
+        seq_true = seq.narrow(0, 0, actual_size)
+        as_list.append(seq_true)
+    return as_list
+
+
+RE_PUNCTUATION = re.compile(
+    "|".join(re.escape(char) for char in string.punctuation)
+)
+
+
+if __name__ == "__main__":
+    # Reading command line arguments
+    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
+
+    # Initialize ddp (useful only for multi-GPU DDP training)
+    sb.utils.distributed.ddp_init_group(run_opts)
+
+    # Applies the memory fraction for a shared GPU
+    apply_mem_fraction()
+
+    # Load hyperparameters file with command-line overrides
+    with open(hparams_file) as fin:
+        yaml = fin.read()
+
+    # Load evaluation hyperparameters
+    eval_hparams_file = Path(hparams_file).parent / "eval.yaml"
+    if not eval_hparams_file.exists():
+        eval_hparams_file = Path(__file__).parent / "hparams" / "eval.yaml"
+    if eval_hparams_file.exists():
+        logger.info(
+            "Using evaluation hyperparameters from %s", eval_hparams_file
+        )
+        with open(eval_hparams_file) as eval_hparams:
+            hparams_yaml = eval_hparams.read()
+        yaml = "\n".join([yaml, hparams_yaml])
+    else:
+        logger.info(
+            "%s not found - not using evaluation hyperparameters",
+            eval_hparams_file,
+        )
+    hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True)
+
+    # Create experiment directory
+    sb.create_experiment_directory(
+        experiment_directory=hparams["output_folder"],
+        hyperparams_to_save=hparams_file,
+        overrides=overrides,
+    )
+
+    from libritts_prepare import prepare_libritts
+
+    # Data preparation, to be run on only one process.
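+    # prepare_libritts writes the train/valid/test JSON manifests that dataio_prepare loads below.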
+ if not hparams["skip_prep"]: + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": ( + hparams["test_json"] + if "test" in hparams["splits"] + else None + ), + "sample_rate": hparams["sample_rate"], + "train_split": hparams["train_split"], + "valid_split": hparams["valid_split"], + "test_split": ( + hparams["test_split"] + if "test" in hparams["splits"] + else None + ), + "seed": hparams["seed"], + "alignments_folder": hparams.get("alignments_folder"), + "model_name": hparams["model"].__class__.__name__, + "max_valid_size": hparams.get("max_valid_size", 10000), + }, + ) + + if not hparams.get("prep_only"): + # We can now directly create the datasets for training, valid, and test + datasets, resample_fn = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_tokens"] + + # Trainer initialization + tts_brain = VALLEBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + tts_brain.resample_fn = resample_fn + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. + tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Load best checkpoint for evaluation + if hparams["testing"]: + test_summary_file = next( + Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), + None, + ) + if test_summary_file is not None: + logging.info( + "Test run already completed: %s", test_summary_file + ) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = {f"{test_key_kind}_key": test_key} + eval_dataset_key = hparams["eval_dataset"] + logger.info( + "Performing final evaluation on the %s dataset", + eval_dataset_key, + ) + eval_dataset = datasets[eval_dataset_key] + eval_dataset = select_eval_subset(eval_dataset, hparams) + tts_brain.evaluate( + test_set=eval_dataset, + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs, + ) diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py new file mode 100644 index 000000000..328fbe868 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -0,0 +1,90 @@ +#!/usr/bin/env/python3 +"""Recipe for extracting a discrete tokens with librispeech. 
+ +Authors + * Jarod Duret 2024 +""" + +import os +import sys +import logging +import pathlib as pl +import speechbrain as sb +from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from hyperpyyaml import load_hyperpyyaml + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech + from libritts_prepare import prepare_libritts # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "train_split": hparams["train_splits"], + "valid_split": hparams["dev_splits"], + "test_split": hparams["test_splits"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": hparams["test_json"], + "sample_rate": hparams["sample_rate"], + "skip_prep": hparams["skip_prep"], + "max_valid_size": None, + "skip_resample": hparams["skip_resample"], + }, + ) + + tokens_extractor = hparams["tokens_extractor"] + data_folder = hparams["data_folder"] + datasets = [] + for split in ["train", "valid", "test"]: + json_path = hparams[f"{split}_json"] + name = pl.Path(json_path).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=json_path, replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + merged_data = { + key: value + for dataset in datasets + for key, value in dataset.data.items() + } + merged_dataset = DynamicItemDataset(merged_data) + + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Extracting dataset tokens ...") + tokens_extractor.extract_tokens( + merged_dataset, + hparams["num_codebooks"], + (save_folder / "libritts").as_posix(), + ) + + if hparams["save_embedding"]: + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Saving embeddings ...") + tokens_extractor.save_pretrained_embeddings( + (save_folder / "embeddings").as_posix(), + vocab_size=hparams["vocab_size"], + num_codebooks=hparams["num_codebooks"], + ) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml new file mode 100644 index 000000000..836503717 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml @@ -0,0 +1,64 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref 
/test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False +skip_resample: False + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..6ae14c87c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,103 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | + +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: WavLM +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +vocab_size: 1000 +save_embedding: False +skip_resample: False + + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +num_codebooks: [1, 3, 7, 12, 18, 23] +deduplicate: [False, False, False, False, False, 
False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..188b38a6d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml @@ -0,0 +1,64 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False +skip_resample: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml new file mode 100644 index 000000000..a0542b189 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml @@ -0,0 +1,67 @@ +# 
############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False +skip_resample: False + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..dc026cc55 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml @@ -0,0 +1,59 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 32 +sample_rate: 24000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False +skip_resample: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + 
save_path: !ref + num_codebooks: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..8d3a9aa27 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,55 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 8 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False +skip_resample: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..3d9792bbb --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml @@ -0,0 +1,59 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/sqcodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# SQCodec parameters +config: config.yaml +checkpoint: ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks: 4 +save_path: 
/home/ubuntu/sq-codec/SQ-Codec +skip_resample: False + + +# SQCodec model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..bfd802740 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,61 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# WavTokenizer parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks: 1 +vocab_size: 4096 +skip_resample: False + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py b/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py new file mode 120000 index 000000000..39f1a78c2 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py @@ -0,0 +1 @@ +../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py new file mode 100644 index 000000000..dda10826d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -0,0 +1,563 @@ +""" +LibriTTS data preparation + +Authors + * Pradnya Kandarkar 2022 +""" + +import json +import os +import random + +import torch +import torchaudio +import re +from tqdm import tqdm + +from speechbrain.inference.text import GraphemeToPhoneme +from speechbrain.utils.data_utils import get_all_files +from speechbrain.utils.logger import get_logger +from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations +from pathlib import Path + +logger = get_logger(__name__) +LIBRITTS_URL_PREFIX = "https://www.openslr.org/resources/60/" + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def prepare_libritts( + data_folder, + 
save_json_train, + save_json_valid, + save_json_test, + sample_rate, + split_ratio=[80, 10, 10], + libritts_subsets=None, + train_split=None, + valid_split=None, + test_split=None, + seed=1234, + model_name=None, + max_valid_size=500, + alignments_folder=None, + skip_prep=False, + skip_resample=False, +): + """ + Prepares the json files for the LibriTTS dataset. + Downloads the dataset if it is not found in the `data_folder` as expected. + + Arguments + --------- + data_folder : str + Path to the folder where the LibriTTS dataset is stored. + save_json_train : str + Path where the train data specification file will be saved. + save_json_valid : str + Path where the validation data specification file will be saved. + save_json_test : str + Path where the test data specification file will be saved. + sample_rate : int + The sample rate to be used for the dataset + split_ratio : list + List composed of three integers that sets split ratios for train, valid, + and test sets, respectively. For instance split_ratio=[80, 10, 10] will + assign 80% of the sentences to training, 10% for validation, and 10% + for test. + libritts_subsets: list + List of librispeech subsets to use (e.g., dev-clean, train-clean-100, ...) for the experiment. + This parameter will be ignored if explicit data splits are provided. + Explicit data splits parameters: "train_split", "valid_split", "test_split" + train_split : list + List of librispeech subsets to use (e.g.,train-clean-100, train-clean-360) for the experiment training stage. + valid_split : list + List of librispeech subsets to use (e.g., dev-clean) for the experiment validation stage. + test_split : list + List of librispeech subsets to use (e.g., test-clean) for the experiment testing stage. + seed : int + Seed value + model_name : str + Model name (used to prepare additional model specific data) + alignments_path : None + The path to alignments files + skip_prep: Bool + If True, skip preparation. 
+ skip_resample: bool + If True, audio will not be resampled + + Returns + ------- + None + """ + + if skip_prep: + return + + # Setting the seed value + random.seed(seed) + + # Checks if this phase is already done (if so, skips it) + if skip(save_json_train, save_json_valid, save_json_test): + logger.info("Preparation completed in previous run, skipping.") + return + + logger.info( + f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}" + ) + + # If specific splits are provided, creates data manifest files accordingly + if train_split: + wav_list = prepare_split(data_folder, train_split) + create_json( + wav_list, + save_json_train, + sample_rate, + data_folder, + alignments_folder, + model_name, + skip_resample, + ) + if valid_split: + wav_list = prepare_split(data_folder, valid_split) + # TODO add better way to speedup evaluation + if max_valid_size is not None and len(wav_list) > max_valid_size: + wav_list = random.sample(wav_list, max_valid_size) + create_json( + wav_list, + save_json_valid, + sample_rate, + data_folder, + alignments_folder, + model_name, + skip_resample, + ) + if test_split: + wav_list = prepare_split(data_folder, test_split) + create_json( + wav_list, + save_json_test, + sample_rate, + data_folder, + alignments_folder, + model_name, + skip_resample, + ) + + if skip(save_json_train, save_json_valid, save_json_test): + logger.info("Preparation completed.") + return + + # If specific splits are not provided, and a list of subsets if provided, creates train, valid, test splits + # Creates data manifest files according to the data splits + if libritts_subsets: + wav_list = prepare_split(data_folder, libritts_subsets) + # Random split the signal list into train, valid, and test sets. + data_split = split_sets(wav_list, split_ratio) + # Creating json files + create_json( + data_split["train"], + save_json_train, + sample_rate, + alignments_folder, + model_name, + skip_resample, + ) + create_json( + data_split["valid"], + save_json_valid, + sample_rate, + alignments_folder, + model_name, + skip_resample, + ) + create_json( + data_split["test"], + save_json_test, + sample_rate, + alignments_folder, + model_name, + skip_resample, + ) + + +def prepare_split(data_folder, split_list): + """ + Processes the provided list of LibriTTS subsets and creates a list of all the .wav files present in the subsets. + Downloads the LibriTTS subsets as required. + + Arguments + --------- + data_folder : str + Path to the folder where the LibriTTS dataset is stored + split_list : list + List of librispeech subsets to process (e.g., dev-clean, train-clean-100, ...) + + Returns + ------- + wav_list : list + List of all .wav files to be processed + """ + extension = [".wav"] # The expected extension for audio files + wav_list = list() # Stores all audio file paths for the dataset + + # For every subset of the dataset, if it doesn't exist, downloads it + for subset_name in split_list: + subset_folder = os.path.join(data_folder, subset_name) + subset_archive = os.path.join(subset_folder, subset_name + ".tar.gz") + + if not check_folders(subset_folder): + logger.info( + f"No data found for {subset_name}. Checking for an archive file." + ) + if not os.path.isfile(subset_archive): + logger.info( + f"No archive file found for {subset_name}. Downloading and unpacking." 
+ ) + quit() + # Collects all files matching the provided extension + wav_list.extend(get_all_files(subset_folder, match_and=extension)) + + return wav_list + + +def create_json( + wav_list, + json_file, + sample_rate, + data_folder, + alignments_folder=None, + model_name=None, + skip_resample=False, +): + """ + Creates the json file given a list of wav files. + Arguments + --------- + wav_list : list of str + The list of wav files. + json_file : str + The path of the output json file + sample_rate : int + The sample rate to be used for the dataset + data_folder : str + The path to LibriTTS + alignments_folder : str + The path to LibriTTS alignments + model_name : str + Model name (used to prepare additional model specific data) + skip_resample : int + Skips resampling - useful when large temporary storage + is absent. + """ + + # Downloads and initializes the G2P model to compute the phonemes if data is being prepared for Tacotron2 experiments + if model_name == "Tacotron2": + logger.info( + "Computing phonemes for labels using SpeechBrain G2P. This may take a while." + ) + g2p = GraphemeToPhoneme.from_hparams( + "speechbrain/soundchoice-g2p", run_opts={"device": DEVICE} + ) + else: + g2p = None + + json_dict = {} + + # Processes all the wav files in the list + for wav_file in tqdm(wav_list): + # Reads the signal + signal, sig_sr = torchaudio.load(wav_file) + duration = signal.shape[1] / sig_sr + + # TODO add better way to filter short utterances + if duration < 1.0: + continue + + # Manipulates path to get relative path and uttid + path_parts = wav_file.split(os.path.sep) + uttid, _ = os.path.splitext(path_parts[-1]) + # relative_path = os.path.join("{data_root}", *path_parts[-4:]) + + # Gets the path for the text files and extracts the input text + normalized_text_path = os.path.join( + "/", *path_parts[:-1], uttid + ".normalized.txt" + ) + try: + with open(normalized_text_path, encoding="utf-8") as f: + normalized_text = f.read() + if normalized_text.__contains__("{"): + normalized_text = normalized_text.replace("{", "") + if normalized_text.__contains__("}"): + normalized_text = normalized_text.replace("}", "") + except FileNotFoundError: + print(f"Warning: The file {normalized_text_path} does not exist.") + continue + + # Resamples the audio file if required + if sig_sr != sample_rate and not skip_resample: + resampled_signal = torchaudio.functional.resample( + signal, sig_sr, sample_rate + ) + os.unlink(wav_file) + torchaudio.save(wav_file, resampled_signal, sample_rate=sample_rate) + + # Gets the speaker-id from the utterance-id + spk_id = uttid.split("_")[0] + + # Creates an entry for the utterance + json_dict[uttid] = { + "uttid": uttid, + "wav": wav_file, + "duration": duration, + "spk_id": spk_id, + "label": normalized_text, + "segment": True if "train" in json_file else False, + } + if alignments_folder is not None: + alignments_file_name = get_alignment_path( + data_folder, alignments_folder, wav_file + ) + alignments = parse_alignments(alignments_file_name) + json_dict[uttid].update(alignments) + + # Characters are used for Tacotron2, phonemes may be needed for other models + if model_name not in ["Tacotron2", "HiFi-GAN"] and g2p is not None: + # Computes phoneme labels using SpeechBrain G2P and keeps the punctuations + phonemes = _g2p_keep_punctuations(g2p, normalized_text) + json_dict[uttid].update({"label_phoneme": phonemes}) + + # Writes the dictionary to the json file + with open(json_file, mode="w", encoding="utf-8") as json_f: + json.dump(json_dict, json_f, indent=2) 
+ + logger.info(f"{json_file} successfully created!") + + +def get_alignment_path(data_folder, alignments_folder, file_name): + """Returns the path in the LibriSpeech-Alignments dataset + corresponding to the specified file path in LibriSpeech + + Arguments + --------- + data_folder: str + the path to LibriSpeech + alignments_folder: str + the path to LibriSpeech-Alignments + file_name: str + the file name within LibriSpeech + + Returns + ------- + file_name: str + the alignment file path + """ + file_name = Path(file_name) + data_folder = Path(data_folder) + if file_name.parts[0] == "{data_root}": + file_name_rel = file_name.relative_to("{data_root}") + else: + file_name_rel = file_name.relative_to(data_folder) + data_slice = file_name_rel.parts[0] + + textgrid_folder = file_name_rel.relative_to( + Path(data_slice) / "LibriTTS" / data_slice + ).parent.parent + textgrid_file_name = f"{file_name_rel.stem}.TextGrid" + textgrid_path = ( + Path(alignments_folder) + / data_slice + / textgrid_folder + / textgrid_file_name + ) + + return textgrid_path + + +def skip(*filenames): + """ + Detects if the data preparation has been already done. + If the preparation has been done, we can skip it. + + Arguments + --------- + *filenames : tuple + Set of filenames to check for existence. + + Returns + ------- + bool + if True, the preparation phase can be skipped. + if False, it must be done. + """ + for filename in filenames: + if isinstance(filename, list): + if any(not os.path.isfile(item) for item in filename): + return False + else: + if not os.path.isfile(filename): + return False + return True + + +def split_sets(wav_list, split_ratio): + """Randomly splits the wav list into training, validation, and test lists. + + Arguments + --------- + wav_list : list + list of all the signals in the dataset + split_ratio: list + List composed of three integers that sets split ratios for train, valid, + and test sets, respectively. For instance split_ratio=[80, 10, 10] will + assign 80% of the sentences to training, 10% for validation, and 10% + for test. + + Returns + ------- + dictionary containing train, valid, and test splits. 
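+        Train and valid sizes are rounded down; the test split receives all
+        remaining utterances.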
+ """ + # Random shuffles the list + random.shuffle(wav_list) + tot_split = sum(split_ratio) + tot_snts = len(wav_list) + data_split = {} + splits = ["train", "valid"] + + for i, split in enumerate(splits): + n_snts = int(tot_snts * split_ratio[i] / tot_split) + data_split[split] = wav_list[0:n_snts] + del wav_list[0:n_snts] + data_split["test"] = wav_list + + return data_split + + +def check_folders(*folders): + """Returns False if any passed folder does not exist.""" + for folder in folders: + if not os.path.exists(folder): + return False + return True + + +def parse_alignments(file_name): + """Parses a given LibriSpeech-Alignments TextGrid file and + converts the results to the desired format (to be used in JSON + metadata) + + Arguments + --------- + file_name : path-like + the file name of the TextGrid file + + Returns + ------- + details: dict + the metadata details + """ + try: + import textgrids + except ImportError: + logger.error( + "Parsing LibriSpeech-alignments requires the" + "praat-textgrids package" + ) + raise + if not file_name.exists(): + return { + "has_alignments": False, + "phn": [], + "phn_stress": [], + "phn_start": [], + "phn_end": [], + "phn_count": 0, + "wrd": [], + "wrd_start": [], + "wrd_end": [], + "wrd_count": 0, + "unk_count": None, + } + + text_grid = textgrids.TextGrid() + text_grid.read(file_name) + word_intervals = [ + {**word, "label": word["label"].upper()} + for word in text_grid.interval_tier_to_array("words") + ] + phn_intervals = text_grid.interval_tier_to_array("phones") + details = {} + details.update(intervals_to_dict(word_intervals, "wrd")) + phn = intervals_to_dict(phn_intervals, "phn") + phn_stress = phn["phn"] + phn_nostress = remove_stress_marks(phn_stress) + phn["phn"] = phn_nostress + phn["phn_stress"] = phn_stress + details.update(phn) + details["unk_count"] = sum(wrd == "" for wrd in details["wrd"]) + details["has_alignments"] = True + + return details + + +INTERVAL_MAP = [("label", ""), ("begin", "_start"), ("end", "_end")] +INTERVAL_EMPTY_LABELS = {"", "sil", "sp", "spn"} + + +def intervals_to_dict(intervals, prefix): + """ + Converts a parsed list of intervals from PRAAT TextGrid + to a learning-friendly array + + Arguments + --------- + intervals: list + A list of raw TextGrid intervals, as returned by + TextGrid.interval_tier_to_array + prefix: str + the prefix to add + + Returns + ------- + result: dict + A dictionary of the form + { + "{prefix}": , + "{prefix}_start": , + "{prefix}_end": , + "{prefix}_count: + } + + """ + # Remove meaningless labels + intervals_clean = [ + interval + for interval in intervals + if interval["label"] not in INTERVAL_EMPTY_LABELS + ] + result = { + f"{prefix}{suffix}": [interval[key] for interval in intervals_clean] + for key, suffix in INTERVAL_MAP + } + # This will map space labels to a single one + result[f"{prefix}_count"] = len(intervals_clean) + return result + + +RE_STRESS_MARK = re.compile(r"\d$") + + +def remove_stress_marks(phn): + """Removes stress marks from a phoneme annotation + + Arguments + --------- + phn: list + a list of phoneme annotations with or without stress marks + + Returns + ------- + result: list + a list of phoneme annotations without stress marks + """ + return [RE_STRESS_MARK.sub("", item) for item in phn] diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index c3e42bf64..1c3f78818 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -25,17 +25,31 @@ For detailed information, refer to [paper](https://arxiv.org/pdf/2406.14294): # 
Table of Contents
-- [Table of Contents](#table-of-contents)
-- [Installation](#-installation)
-- [Discrete Audio Encoder](#-Discrete-Audio-Encoder)
-- [Datasets and Recipes](#-Datasets-and-Recipes)
-- [Quickstart](#-quickstart)
- - [Running a single task](#Running-a-single-task)
- - [Running multiple tasks](#Runnin-multiple-tasks)
-- [‍Incorporating Your Audio Tokenizer](#-Incorporating-Your-Audio-Tokenizer)
-- [Results](#-results)
-- [Contact](#-contact)
-- [Citing](#-citing)
+# 📑 Table of Contents
+
+- [DASB - Discrete Audio and Speech Benchmark](#dasb---discrete-audio-and-speech-benchmark)
+- [🛠️ Installation](#-installation)
+- [🎌 Discrete Audio Encoder](#-discrete-audio-encoder)
+- [⚡ Datasets and Recipes](#-datasets-and-recipes)
+- [📖 Training Scenarios](#-training-scenarios)
+  - [On-the-Fly Token Extraction](#on-the-fly-token-extraction)
+  - [Offline Token Extraction](#offline-token-extraction)
+- [🎛️ Hyperparameter Tuning](#%EF%B8%8F-hyperparameter-tuning)
+- [📝 Incorporating Your Audio Tokenizer](#-incorporating-your-audio-tokenizer)
+- [📈 Results](#-results)
+  - [Ranking](#ranking)
+  - [Benchmarking Results for Discriminative Tasks](#benchmarking-results-for-discriminative-tasks)
+  - [Benchmarking Results for Generative Tasks](#benchmarking-results-for-generative-tasks)
+- [📧 Contact](#-contact)
+- [📖 Citing](#-citing)
+
 # 🛠️ Installation
@@ -98,51 +112,162 @@ To set up SpeechBrain-DASB, follow these steps:
 | Libri2Mix | Speech Separation | Conformer | CRDNN | [github.com/JorisCos/LibriMix](https://github.com/JorisCos/LibriMix) |
 | LJSpeech | Text-to-Speech | Shallow Transformer | Deep Transformer | [keithito.com/LJ-Speech-Dataset/](https://keithito.com/LJ-Speech-Dataset/) |
-# ▶️ Quickstart
+# 📖 Training Scenarios
-## Running a single task
+We offer two different training scenarios: **on-the-fly token extraction** and **offline token extraction**.
-If you have specific discrete model and want to benchmark it for a specific task, you need to run the following command:
- ```
- python LibriSpeech/ASR/LSTM/train_[tokenzier_name].py LibriSpeech/ASR/LSTM/hparams/train_[tokenzier_name].yaml --output_folder my-output-folder --data_folder mypath/to/LibriSpeech
- ```
+## On-the-Fly Token Extraction
+In this scenario, audio tokens are extracted dynamically during training. To enhance efficiency, we use a caching mechanism where tokens are saved in memory during the first epoch and retrieved for subsequent epochs. However, this approach has some limitations:
+- It works best when the dataset is small, the bitrate is low, and batching is sorted (not random).
+- It is unsuitable when data augmentation is required.
-## Running multiple tasks
+You can also disable the caching mechanism if needed.
-To run all tasks, make the following changes:
+Currently, the on-the-fly token extraction is applied only in the recipe located at:
+`LibriSpeech/ASR-on-the-fly`
-1. Edit the `run_discriminative_benchmark.sh` and `run_genarative_benchmark.sh` files and modify tokenizer related values for example the bitrate , number of codebooks, and etc.
-2. 
Choose a set of tasks from the provided list and, for each task, select a downstream architecture from the available options (see list below). -3. Update the variables defined in `run_benchmark.sh` with two lists of equal size. In the `ConsideredTasks` list, specify the tasks you want to run (e.g., `'LibriSpeechASR' 'LibriSpeechASR' 'IEMOCAP'`). In the `Downstreams` list, specify the corresponding downstream architecture for each task (e.g., `'BiLSTM'`, `contextnet`, `'ecapa_tdnn'`). +If you wish to adapt this strategy for your own recipe, you can copy and modify the existing recipe as needed. Here's how to run the on-the-fly recipe: + +```bash +python LibriSpeech/ASR-on-the-fly/train.py LibriSpeech/ASR-on-the-fly/hparams/LSTM/{TOKENIZER}.yaml --data_folder=path/LibriSpeech --output_folder=path/results/LibriSpeech/ASR/{TOKENIZER}/LSTM +``` + +> **Note:** On-the-fly extraction can be time-consuming, which is why we also provide an alternative approach: **offline token extraction**. + + +## Offline Token Extraction +In this scenario, all tokens are pre-extracted in a separate recipe. We recommend using the highest number of codebooks available for token extraction and then choosing the desired settings during training. + +### Token Extraction Command +To extract tokens, use the following command: + +```bash +python LibriSpeech/extraction/extract.py benchmarks/DASB/LibriSpeech/extraction/hparams/{tokenizer}.yaml --data_folder=path/LibriSpeech --num_codebooks=32 +``` + +If you wish to initialize your embedding layer with the tokenizer's embeddings while training your downstream model, set the flag `save_embedding` to `True`. For discrete SSL tokenizers, you can specify a list of layers for `--num_codebooks` instead of a single number (e.g., `--num_codebooks=[3,7,12]`). + +### Training with Pre-Extracted Tokens +Once tokens are extracted and saved, you can train a downstream model using the following command: + +```bash +bash run_experiments.sh --hparams benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml --data_folder LibriSpeech --cached_data_folder cache/ --output_folder results/LibriSpeech/ASR/encodec/LSTM --task ASR --dataset LibriSpeech --seed 1986 --nruns 2 --eval_metric WER --tokens_folder LibriSpeech/extraction-emb/speech_tokenizer/save/librispeech/ +``` + +--- + +This workflow ensures flexibility, efficiency, and reproducibility for both training scenarios. Adapt the recipes as needed for your specific requirements! + + +# 🎛️ Hyperparameter Tuning + +Efficient hyperparameter tuning is critical when introducing novel models or experimenting with diverse datasets. Our benchmark establishes a standardized protocol for hyperparameter tuning, leveraging [Orion](https://orion.readthedocs.io/en/stable/) to ensure fair and consistent model comparisons. + +--- + +## **Overview** + +Hyperparameter tuning is managed using the `./run_hparam_optimization.sh` script. This script coordinates multiple hyperparameter trials via `run_experiments.sh`. - For example, if you set `ConsideredTasks=('LibriSpeechASR' 'LibriSpeechASR' 'IEMOCAP')` and `Downstreams=('BiLSTM', 'contextnet', 'ecapa_tdnn')`, the benchmark will be executed as follows: - - LibriSpeechASR with BiLSTM as the probing head - - LibriSpeechASR with contextnet as the probing head - - IEMOCAP with ecapa_tdnn as the probing head. -3. 
Run the following command: - ``` - bash run_discriminative_benchmark.sh [tokenzier_name] - bash run_genarative_benchmark.sh [tokenzier_name] - ``` - You could also pass extra arguments as far as they are consistent across all tasks. - For generative task, make sure to set the `utmos_path` required for TTS evaluation. +## **Incorporating Orion Flags in Hparam Files** + +To enable tuning, Orion flags should be directly embedded in the YAML hparam file using comments. For example, to optimize the learning rate (`lr`) parameter within a defined range, include the following line in the YAML file: + +```yaml +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" +``` + + + +## **Workflow of the Script** + +The script operates as follows: + +1. **Scans** the YAML hparam file for Orion flags. +2. **Executes** hyperparameter tuning using the `orion-hunt` command. +3. **Saves** the best hyperparameters for reference via `torch-info`. +4. **Iterates** until encountering flags such as `@orion_step` in the YAML file. + + + +## **Running Hyperparameter Optimization** + +You can perform hyperparameter optimization using a command like this: + +```bash +bash run_hparam_optimization.sh \ + --exp_name 'ASR-encodec-LSTM_hopt' \ + --hparams LibriSpeech/ASR/hparams/LSTM/train.yaml \ + --data_folder path/LibriSpeech \ + --cached_data_folder path/cache/ \ + --output_folder results/LibriSpeech/ASR/encodec/LSTM \ + --task ASR \ + --dataset LibriSpeech \ + --seed 1986 \ + --nruns 1 \ + --nruns_eval 5 \ + --eval_metric WER \ + --exp_max_trials 50 \ + --tokens_folder results/LibriSpeech/extraction-emb/encodec/save/librispeech/ \ + --run_name encodec +``` + +For more details on the arguments and customization options, refer to `./run_hparam_optimization.sh`. + + +### **Notes** + +1. **Execution Time**: + - Hyperparameter tuning may take several hours or even days, depending on the model complexity and dataset. + +2. **GPU vs. CPU**: + - By default, models are trained on GPU. To train on CPU instead, include the `--device cpu` flag. + +3. **Monitoring Progress**: + - Use the following command to monitor optimization status: + ```bash + orion status --all + ``` + - Ensure that Orion-specific environment variables are set in your bash environment. For example: + ```bash + export ORION_DB_ADDRESS=results/LibriSpeech/ASR/encodec/LSTM/hopt/ASR-encodec-LSTM_hopt.pkl + export ORION_DB_TYPE=pickleddb + ``` + Adjust `ORION_DB_ADDRESS` according to your experiment. + +4. **Resuming Optimization**: + - You can interrupt the script at any point. It will resume from the last completed trial. + +5. **Repetition of Optimization**: + - For multiple repetitions of the same hyperparameter optimization, modify the `--exp_name` parameter. + +6. **System Requirements**: + - The script is designed for Linux-based systems. A bash script is provided instead of Python due to its ability to manage diverse training loops across various subjects and sessions. + +--- + +This protocol ensures fair model comparison across diverse tasks and datasets. All reported results are derived using this standardized hyperparameter tuning methodology, enabling consistent assessments across models. + # 📝 ‍Incorporating Your Audio Tokenizer Let's now assume you've designed an audio and speech tokenizer in PyTorch and wish to integrate it into our benchmark. You're in luck because we've made this step as simple as possible for you! Here are the steps you should follow: -1. 
Write your model's code in a Python library saved in `benchmarks/DASB/model` (e.g., `benchmarks/MOABB/models/my_model.py`). -2. Create a YAML and py file for each task you want to experiment with. Thankfully, you don't have to start from scratch. For example, if you're working with LibriSpeech/ASR/LSTM, copy `benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml` and save it in the same folder with a different name (e.g., `train_my_model.yaml` and `train_my_model.py`). +1. Write your model's code in a Python library saved in `benchmarks/DASB/model` (e.g., `benchmarks/DASB/models/my_model.py`). + +2. Add the tokenizer to `utils/tokenizer_interface.py` and ensure the `encode` and `decode` functions are consistent in functionality and output shape with the other tokenizers. + +3. Create a YAML and Python file for each task you want to experiment with. Thankfully, you don't have to start from scratch. For example, you can copy `LibriSpeech/extraction/hparams/encodec.yaml`, adapt it based on your needs, and save it in the same folder with a different name (e.g., `LibriSpeech/extraction/hparams/{YOUR_TOKENIZER_NAME}.yaml`). -3. Edit the relevant section of your `train_my_model.yaml` and `train_my_model.py`. Redefine the `codec:` to reference your custom model (e.g., `codec: !new:models.my_model.my_model`). +4. Edit the relevant sections of your `{YOUR_TOKENIZER_NAME}.yaml`. Redefine the `tokenizer:` field to reference your custom model (e.g., `tokenizer: !new:tokenizer_interface.your_tokenizer`). -4. Ensure you include the hyperparameters specific to your model. +5. Ensure you include the hyperparameters specific to your model. -5. Now, follow the instructions above to run an experiments across tasks. +6. Now, follow the instructions provided earlier to run experiments across tasks. **Note**: If you're not familiar with YAML, you can refer to our [HyperPyYAML tutorial](https://speechbrain.github.io/tutorial_basics.html) on the SpeechBrain website for guidance. 
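+As a rough illustration, a wrapper added to `utils/tokenizer_interface.py` could look like the sketch below. This is only a minimal, hypothetical example: the class name (`MyTokenizerWrapper`), the constructor arguments, and the internal `to_tokens`/`from_tokens` calls are stand-ins for your own model's API, and the exact `encode`/`decode` signatures and token shapes (assumed here to be Batch x Time x N_q integer codes) should mirror the tokenizers already defined in `utils/tokenizer_interface.py`.
+
+```python
+import torch
+
+
+class MyTokenizerWrapper(torch.nn.Module):
+    """Hypothetical wrapper exposing the encode/decode contract."""
+
+    def __init__(self, my_model, num_codebooks=4):
+        super().__init__()
+        self.model = my_model  # your pretrained tokenizer
+        self.num_codebooks = num_codebooks
+
+    @torch.no_grad()
+    def encode(self, wav, lengths=None):
+        # wav: (Batch, Samples) -> tokens: (Batch, Time, N_q), integer codes
+        return self.model.to_tokens(wav, n_q=self.num_codebooks)
+
+    @torch.no_grad()
+    def decode(self, tokens):
+        # tokens: (Batch, Time, N_q) -> wav: (Batch, Samples)
+        return self.model.from_tokens(tokens)
+```
+
+Keeping the wrapper free of task-specific logic makes it reusable by both the extraction recipes and the downstream training scripts.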
# 📈 Results diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index 4d1d241c3..dffb3cd07 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -1,7 +1,11 @@ beartype jsonlines +kaldiio librosa>=0.9.2 +omegaconf onnxruntime>=1.16.3 +orion +orion[profet] scikit-learn speechbrain>=1.0.0 speechtokenizer>=0.1.2 diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 5238beacd..833e98da4 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -20,13 +20,17 @@ PositionalEncoding as TransformerPositionalEncoding, get_lookahead_mask, ) +from speechbrain.dataio.batch import PaddedBatch +from speechbrain.utils.data_utils import batch_pad_right from speechbrain.nnet.attention import RelPosEncXL from speechbrain.nnet.embedding import Embedding from speechbrain.nnet.linear import Linear -from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss +from speechbrain.nnet.losses import ( + kldiv_loss, + mse_loss, + compute_masked_loss, +) from speechbrain.dataio.dataio import length_to_mask -from speechbrain.dataio.batch import PaddedBatch -from speechbrain.decoders.seq2seq import S2STransformerBeamSearcher from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler @@ -74,16 +78,22 @@ class EosMode(Enum): + """The method of determining end-of-sequence""" + GATE = "gate" TOKEN = "token" class DecoderMode(Enum): + """The method of determining what type of decoder to use""" + AUTOREGRESSIVE = "autoregressive" FORWARD = "forward" class RepresentationMode(Enum): + """Inidcates the type of representations to use for audio (discrete or continuous)""" + DISCRETE = "discrete" CONTINUOUS = "continuous" @@ -157,8 +167,10 @@ def __init__( show_inference_progress=True, audio_token_shift=0, multihead_input=True, + multihead_output=True, representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, + out_proj=None, ): super().__init__() self.num_tokens = num_tokens @@ -182,9 +194,11 @@ def __init__( if self.representation_mode == RepresentationMode.DISCRETE else audio_dim ) - self.out_proj = Linear( - input_size=d_model, n_neurons=self.out_dim * tokens_per_step, - ) + if out_proj is None: + out_proj = Linear( + input_size=d_model, n_neurons=self.out_dim * tokens_per_step, + ) + self.out_proj = out_proj self.gate = Linear(input_size=d_model, n_neurons=1) if audio_emb is None: if self.representation_mode == RepresentationMode.DISCRETE: @@ -222,6 +236,7 @@ def __init__( self.multihead_input = multihead_input self.d_model = d_model self.d_model_sqrt = math.sqrt(d_model) + self.multihead_output = multihead_output def decode( self, @@ -371,16 +386,17 @@ def forward( pos_embs_src, ) lin_out = self.out_proj(dec_out) - batch_size, audio_max_len, num_tokens = lin_out.shape - lin_out_heads = lin_out.reshape( - batch_size, - audio_max_len, - self.tokens_per_step, - num_tokens // self.tokens_per_step, - ) + if self.multihead_output: + batch_size, audio_max_len, num_tokens = lin_out.shape + lin_out = lin_out.reshape( + batch_size, + audio_max_len, + self.tokens_per_step, + num_tokens // self.tokens_per_step, + ) gate_out = self.gate(dec_out).squeeze(-1) return TokotronDecoderOutput( - lin_out_heads, + lin_out, gate_out, dec_self_attn, dec_attn, @@ -439,6 +455,8 @@ def 
__init__( representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, show_inference_progress=True, + transform_audio=None, + feed_audio=None, ): super().__init__() self.decoder = None @@ -451,6 +469,10 @@ def __init__( self.representation_mode = RepresentationMode(representation_mode) self.audio_dim = audio_dim self.show_inference_progress = show_inference_progress + if transform_audio is None: + transform_audio = nn.Identity() + self.transform_audio = transform_audio + self.feed_audio = feed_audio def bind(self, model): """Binds this inference implementation to a model @@ -522,6 +544,7 @@ def forward(self, enc_out, length, emb=None): steps_range = tqdm(steps_range, desc="Inference") for idx in steps_range: # One autoregressive step + audio = self.transform_audio(audio) step_out = self.decoder.forward( enc_out=enc_out, src_length=length, @@ -530,7 +553,9 @@ def forward(self, enc_out, length, emb=None): ) audio_out = step_out.out - if self.representation_mode == RepresentationMode.DISCRETE: + if self.feed_audio: + audio_out = self.feed_audio(audio_out) + elif self.representation_mode == RepresentationMode.DISCRETE: audio_out = audio_out.argmax(-1) # The model outputs predictions without BOS. Add the BOS back for the @@ -592,357 +617,6 @@ def forward(self, enc_out, length, emb=None): ) -class TokotronSearchWrapper(nn.Module): - """A wrapper class to facilitate seach-based inference. It takes care of re-interpreting - a multi-headed sequence as multiple samples, for compatibility, and for the retention - of attention tensors - - Arguments - --------- - decoder : TokotronTransformerDecoder - the Tokotron transformer decoder - """ - - def __init__(self, decoder): - super().__init__() - self.tokens_per_step = decoder.tokens_per_step - self.decoder = decoder - - def decode(self, memory, enc_states, enc_lens): - """Wraps the decode operation, will all the necessary - reshaping - - Arguments - --------- - memory : torch.Tensor - Characters predicted so far - enc_states : torch.Tensor - Encoder states - enc_lens : torch.Tensor - Encoder state lengths - """ - batch_size = enc_states.size(0) // self.tokens_per_step - _, mem_len = memory.shape - memory = memory.reshape( - self.tokens_per_step, batch_size, mem_len - ).permute(1, 2, 0) - dec_out, dec_self_attn, dec_attn = self.decoder.decode( - enc_out=enc_states[:batch_size], - src_length=enc_lens[:batch_size], - tgt=memory, - ) - self.dec_self_attn = dec_self_attn - self.dec_attn = dec_attn - return dec_out, dec_attn - - -class TokotronTransformerBeamSearcher(S2STransformerBeamSearcher): - """A slight modification of S2STransformerBeamSearcher that uses an - explicit number of tokens instead of trying to infer it from the - weights of the linear layer. This is needed because Tokotron is - multi-header and the final output layer outputs multiple output states - - Arguments - --------- - num_tokens : int - The number of audio tokens available - """ - - def __init__(self, num_tokens, *args, **kwargs): - super().__init__(*args, **kwargs) - self.num_tokens = num_tokens - - def set_n_out(self): - """Set the number of output tokens.""" - return self.num_tokens - - -class SearchLinearWrapper(nn.Module): - """A wrapper for the final linear layer of the Transformer. The goal is to - make it compatible with the SpeechBrain Beam Search implementation, which is - single-headed, by expanding multiple heads along the batch dimensions. 
- - Arguments - --------- - lin : torch.Tensor - A linear layer with an output feature dimensions of - (tokens_per_step x num_tokens) - tokens_per_step : int - the numer of tokens the model outputs for each - time step - """ - - def __init__(self, lin, tokens_per_step): - super().__init__() - self.lin = lin - self.tokens_per_step = tokens_per_step - - def forward(self, x): - """Performs a forward pass with all the required reshape operations - - Arguments - --------- - x : torch.Tensor - The decoder output - - Returns - ------- - result : torch.Tensor - The layer output, reshaped along the batch dimension - """ - x = self.lin(x) - batch_size, max_len, out_dim = x.shape - num_tokens = x.size(-1) // self.tokens_per_step - x = ( - # batch x tokens x length - x.transpose(2, 1) - # batch x heads x tokens x length - .view(batch_size, self.tokens_per_step, num_tokens, max_len) - # heads x batch x tokens x length - .transpose(0, 1) - # heads * batch x tokens x length - .reshape(self.tokens_per_step * batch_size, num_tokens, max_len) - # heads * batch x length x tokens - .transpose(1, 2) - ) - return x - - -class TokotronSearchInference(nn.Module): - """A beam search-based inference implementation - - All keyword arguments will be passed on to the underlying - beam search - """ - - def __init__(self, audio_token_shift=1, **kwargs): - super().__init__() - self.search_kwargs = kwargs - self.audio_token_shift = audio_token_shift - self.decoder, self.search, self.tokens_per_step = None, None, None - - def bind(self, model=None): - """Binds this inference implementation to a model - - Arguments - --------- - model : TokotronTransformerModel - The transformer model - """ - decoder = model.decoder - self.tokens_per_step = decoder.tokens_per_step - self.decoder = TokotronSearchWrapper(decoder) - self.search = TokotronTransformerBeamSearcher( - modules=[ - self.decoder, - SearchLinearWrapper(decoder.out_proj, self.tokens_per_step), - ], - num_tokens=decoder.num_tokens + self.audio_token_shift, - **self.search_kwargs, - ) - - def decode(self, enc_out, length): - """"Decodes the encoder representation using Beam Search - - Arguments - --------- - enc_out : torch.Tensor - Encoder output - length : torch.Tensor - Encoder output lengths - - Returns - ------- - output : TokotronDecoderInfernceOutput - The inference output - """ - with torch.no_grad(): - device = enc_out.device - # The search does not support multiple heads. "Trick" it by expanding encoded - # representations along the batch dimension so that the beam searcher - # treats it as if they were separate, independent samples. 
- batch_size, max_len, enc_dim = enc_out.shape - enc_out_search = ( - enc_out.unsqueeze(0) - .expand(self.tokens_per_step, batch_size, max_len, enc_dim) - .reshape(self.tokens_per_step * batch_size, max_len, enc_dim) - ) - length_search = ( - length.unsqueeze(0) - .expand(self.tokens_per_step, batch_size) - .reshape(self.tokens_per_step * batch_size) - ) - hyps, audio_length, scores, log_probs = self.search( - enc_out_search, length_search - ) - tokens_batch = PaddedBatch( - [ - {"hyps": torch.tensor(item, device=enc_out.device)} - for item in hyps - ] - ).to(device) - - audio_tokens, length = tokens_batch.hyps - _, audio_max_len = audio_tokens.shape - audio_tokens = audio_tokens.reshape( - self.tokens_per_step, batch_size, audio_max_len - ).permute(1, 2, 0) - length = ( - length.reshape(self.tokens_per_step, batch_size).min(dim=0) - ).values - audio_tokens = audio_tokens - self.audio_token_shift - - return TokotronDecoderInfernceOutput( - audio_tokens=audio_tokens, - length=length, - dec_self_attn=self.decoder.dec_self_attn, - dec_attn=self.decoder.dec_attn, - alignments=get_alignments(self.decoder.dec_attn), - p_eos=None, - ) - - -class TokotronForwardInference(nn.Module): - """A beam search-based inference implementation - - All keyword arguments will be passed on to the underlying - beam search - - Arguments - --------- - scale_factor : float - The scaling factor for encoder representations - gate_threshold : float - The threshold for gate activation - min_length : int - The minimum length for generating sequences, in tokens - """ - - def __init__( - self, - scale_factor=5.0, - gate_threshold=0.5, - min_length=16, - eos_mode=EosMode.GATE, - eos_index=0, - representation_mode=RepresentationMode.DISCRETE, - ): - super().__init__() - self.scale_factor = scale_factor - self.gate_threshold = gate_threshold - self.min_length = min_length - self.decoder = None - self.gate = None - self.eos_mode = EosMode(eos_mode) - self.eos_index = eos_index - self.representation_mode = RepresentationMode(representation_mode) - - def bind(self, model=None): - """Binds this inference implementation to a model - - Arguments - --------- - model : TokotronTransformerModel - The transformer model - """ - self.decoder = model.decoder - - def decode(self, enc_out, length): - """"Decodes the encoder representation using Beam Search - - Arguments - --------- - enc_out : torch.Tensor - Encoder output - length : torch.Tensor - Encoder output lengths - - Returns - ------- - output : TokotronDecoderInfernceOutput - The inference output - """ - with torch.no_grad(): - max_len = enc_out.size(1) - src_key_padding_mask = length_to_mask( - length * max_len, max_len, - ).logical_not() - tgt = scale(enc_out, self.scale_factor) - dec_out = self.decoder( - enc_out=enc_out, - tgt=tgt, - tgt_length=length, - src_length=length, - src_key_padding_mask=src_key_padding_mask, - pos_embs_src=None, - ) - if self.eos_mode == EosMode.GATE: - p_eos, eos = self.get_length_gate(dec_out) - else: - p_eos, eos = self.get_length_token(dec_out) - - infer_length_abs = eos.max(dim=1).indices - infer_length_abs_nonzero = infer_length_abs[infer_length_abs > 0] - if len(infer_length_abs_nonzero) > 0: - infer_length_max = infer_length_abs_nonzero.max() - else: - infer_length_max = 0 - if infer_length_max == 0: - infer_length_max = p_eos.size(1) - infer_length_abs = torch.where( - infer_length_abs == 0, infer_length_max, infer_length_abs - ) - infer_length_abs = infer_length_abs.clip(min=self.min_length) - infer_length = infer_length_abs / 
infer_length_max - - audio = dec_out.out[:, :infer_length_max].argmax(-1) - if self.representation_mode == RepresentationMode.DISCRETE: - audio = audio.argmax(-1) - return TokotronDecoderInfernceOutput( - audio=audio, - length=infer_length, - dec_self_attn=dec_out.dec_self_attn, - dec_attn=dec_out.dec_attn, - alignments=get_alignments(dec_out.dec_attn), - p_eos=p_eos, - ) - - def get_length_gate(self, dec_out): - """Infers lengths using the gate module - - Arguments - --------- - dec_out : TokotronDecoderOutput - The decoder output - - Returns - ------- - p_eos : torch.Tensor - EOS probabilities (as estimated by the gate) - eos : torch.Tensor - a Boolean tensor where positions indicate whether - the gate has activated - """ - p_eos = dec_out.gate_out.sigmoid() - eos = p_eos > self.gate_threshold - return p_eos, eos - - def get_length_token(self, dec_out): - """Infers lengths using an EOS token - - Arguments - --------- - dec_out : TokotronDecoderOutput - The decoder output - eos : torch.Tensor - A Boolean tensor indicating whether EOS has been reached - """ - p_seq = dec_out.out[:, :, 0].softmax(dim=-1) - p_eos = p_seq[:, :, self.eos_index].softmax(-1) - eos = p_seq.argmax(dim=-1) == self.eos_index - return p_eos, eos - - class TokotronTransformerModel(nn.Module): """An end-to-end Tokotron model receiving characters or phonemes as inputs and outputting audio tokens @@ -1052,11 +726,13 @@ def __init__( eos_mode=EosMode.GATE, inference=None, audio_token_shift=0, - decoder_mode=DecoderMode.AUTOREGRESSIVE, scale_factor=5.0, representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, emb=None, + audio_emb=None, + out_proj=None, + multihead_input=True, ): super().__init__() self.in_emb = Embedding( @@ -1075,11 +751,6 @@ def __init__( activation=activation, normalize_before=True, ) - self.decoder_mode = DecoderMode(decoder_mode) - audio_emb = None - if self.decoder_mode == DecoderMode.FORWARD: - audio_emb = nn.Identity() - audio_emb_size = d_model self.decoder = TokotronTransformerDecoder( num_tokens=audio_num_tokens + self.audio_token_shift, tokens_per_step=audio_tokens_per_step, @@ -1099,9 +770,11 @@ def __init__( gate_threshold=gate_threshold, gate_offset=gate_offset, audio_token_shift=audio_token_shift, - multihead_input=self.decoder_mode == DecoderMode.AUTOREGRESSIVE, + multihead_input=multihead_input, + multihead_output=out_proj is None, representation_mode=representation_mode, audio_dim=audio_dim, + out_proj=out_proj, ) self.bos_idx = bos_idx self.attention_type = attention_type @@ -1255,17 +928,11 @@ def forward( src_key_padding_mask=src_key_padding_mask, pos_embs=pos_embs_encoder, ) - if self.decoder_mode == DecoderMode.AUTOREGRESSIVE: - tgt = audio - tgt_length = audio_length - else: - tgt = scale(enc_out, self.scale_factor) - tgt_length = input_length enc_out = self.add_emb(enc_out, emb) dec_out = self.decoder( enc_out=enc_out, - tgt=tgt, - tgt_length=tgt_length, + tgt=audio, + tgt_length=audio_length, src_length=input_length, src_key_padding_mask=src_key_padding_mask, pos_embs_src=pos_embs_encoder, @@ -1569,6 +1236,7 @@ def __init__( representation_mode=RepresentationMode.DISCRETE, audio_clip_min=-10.0, audio_clip_max=10.0, + multihead_output=True, ): super().__init__() self.guided_attention_weight = guided_attention_weight @@ -1597,6 +1265,7 @@ def __init__( self.register_buffer("audio_eos", audio_eos) self.audio_clip_min = audio_clip_min self.audio_clip_max = audio_clip_max + self.multihead_output = multihead_output def forward( self, @@ -1629,9 +1298,14 @@ def forward( out = 
out.log_softmax(dim=-1) batch_size, out_len, heads, tok_dim = out.shape max_len = out_len - 1 - out_reshaped = ( - out.transpose(1, 2).reshape(batch_size * heads, out_len, tok_dim) - )[:, :max_len] + if self.multihead_output: + out_reshaped = ( + out.transpose(1, 2).reshape( + batch_size * heads, out_len, tok_dim + ) + )[:, :max_len] + else: + out_reshaped = out if self.eos_mode == EosMode.TOKEN: # NOTE: Shift only the tokens, but not EOS padding_lengths = torch.ones(batch_size, device=audio.device) @@ -1645,7 +1319,10 @@ def forward( ) tok_len = audio.size(1) - if self.representation_mode == RepresentationMode.DISCRETE: + if not self.multihead_output: + audio_reshaped = audio + lengths_reshaped = audio_length + elif self.representation_mode == RepresentationMode.DISCRETE: audio_reshaped = audio.transpose(1, 2).reshape( batch_size * heads, max_len ) @@ -1664,18 +1341,21 @@ def forward( ) audio_reshaped = audio_reshaped[:, :max_len] - lengths_reshaped = ( - audio_length.unsqueeze(-1) - .expand(batch_size, heads) - .reshape(batch_size * heads) - ) + if self.multihead_output: + lengths_reshaped = ( + audio_length.unsqueeze(-1) + .expand(batch_size, heads) + .reshape(batch_size * heads) + ) + else: + lengths_reshaped = audio_length seq_loss = self.seq_cost( out_reshaped[:, :tok_len], audio_reshaped, length=lengths_reshaped, reduction=reduction, ) - if reduction == "batch": + if reduction == "batch" and self.multihead_output: seq_loss = seq_loss.reshape(batch_size, heads).mean(-1) lengths_abs = audio_length * out_len @@ -2143,10 +1823,30 @@ def __call__(self, opt): class PositionalEncoding(TransformerPositionalEncoding): + """A wrapper for the positional encoding that does not try + to be loaded from state dictionaries""" + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def load_state_dict(self, state_dict, strict=True, assign=False): + """Copy parameters and buffers from :attr:`state_dict` into this module and its descendants. + + Arguments + --------- + state_dict : dict + A dict containing parameters and persistent buffers. 
+ strict : (bool, optional) + Whether to strictly enforce that the keys + assign (bool, optional): whether to assign items in the state + dictionary to their corresponding keys in the module + + Returns + ------- + ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields: + * **missing_keys** is a list of str containing the missing keys + * **unexpected_keys** is a list of str containing the unexpected keys + """ pass @@ -2229,178 +1929,198 @@ def all_weights(self): return torch.stack([emb.weight for emb in self.emb]) -class DACFeatureExtractor(nn.Module): - """An adapter for feature extraction +def get_silence_token( + model, + sample_length=100000, + unsqueeze=False, + device=None, + num_codebooks=None, +): + """Attempts to find out the silence tokens for a given model, + if applicable Arguments --------- - dac : DAC - a DAC model - """ + model : nn.Module + A discrete token model, taking (wav, lengths) as arguments + sample_length : int + The length of the sample + unsqueeze: bool + Whether to add an extra dimension to the audio (needed for DAC) + device : str | torch.Device + The device to use + num_codebooks : int | list + The number of codebooks or the codebooks to use - def __init__(self, dac, n_quantizers): - super().__init__() - self.dac = dac - self.dac.eval() - self.n_quantizers = n_quantizers - - def encode(self, inputs, length): - """Encodes a raw audio sample using DAC + Returns + ------- + silence_tokens : torch.Tensor + The token(s) corresponding to silence - Arguments - --------- - inputs : torch.Tensor - A (Batch x Samples) or (Batch x Channel x Samples) - tensor of audio - length : torch.Tensor - A tensor of relative lengths + silece_emb : torch.Tensor + The embedding(s) corresponding to silence - Returns - ------- - tokens : torch.Tensor - A (Batch x Tokens x Heads) tensor of audio tokens - emb : torch.Tensor - Raw vector embeddings from the model's - quantizers + """ + if device is None: + device = next(model.parameters()).device + + audio = torch.zeros(1, sample_length, device=device) + if unsqueeze: + audio = audio.unsqueeze(1) + length = torch.ones(1, device=device) + model_training = model.training + model.eval() + tokens = model.sig_to_tokens(audio, length, num_codebooks=num_codebooks) + if model_training: + model.train() + tokens = tokens.squeeze(0) + if unsqueeze: + tokens = tokens.squeeze(0) + silence_tokens = tokens.mode(0).values + return silence_tokens + + +def get_silence_repr(model, sample_length=100000, device=None): + """Gets continuous silence - """ - if inputs.dim() < 3: - inputs = inputs.unsqueeze(1) - emb, codes, _, _, _ = self.dac.encode( - inputs, n_quantizers=self.n_quantizers - ) - emb.transpose_(1, 2) - codes.transpose_(1, 2) - max_len = emb.size(1) - mask = length_to_mask( - length * max_len, max_len, device=inputs.device - ).unsqueeze(-1) - return codes * mask, emb * mask + Arguments + --------- + model : nn.Module + A discrete token model, taking (wav, lengths) as arguments + sample_length : int + The length of the sample + device : str | torch.Device + The device to use - def forward(self, inputs, length): - """Encodes a raw audio sample using DAC + Returns + ------- + silence : torch.Tensor + A silecnce tensor + """ + audio = torch.zeros(1, sample_length, device=device) + length = torch.ones(1, device=device) + audio_repr = model(audio, length) + silence = audio_repr.mean(dim=1)[0] + return silence - Arguments - --------- - inputs : torch.Tensor - A (Batch x Samples) or (Batch x Channel x Samples) - tensor of audio - length : 
torch.Tensor - A tensor of relative lengths - Returns - ------- - tokens : torch.Tensor - A (Batch x Tokens x Heads) tensor of audio tokens - emb : torch.Tensor - Raw vector embeddings from the model's - quantizers +def feature_pad_to(tensor, length, padding=None): + """Pads feature dimensions to the specified length with the specified padding, + assuming a (Batch x Length x Features..) tensor - """ - return self.encode(inputs, length) + Arguments + --------- + tensor : torch.Tensor + The tensor to be padded - def embeddings(self, tokens): - """Converts token indexes to vector embeddings + length : int + The length to which the tensor will be padded - Arguments - --------- - tokens : torch.Tensor - a (Batch x Length x Heads) tensor of token indexes + padding : torch.Tensor, optional + The padding tensor - if omitted, zero padding + will be used - Returns - ------- - emb : torch.Tensor - a (Batch x Length x Heads x Embedding) tensor - of raw vector embeddings from the model's - quantizer codebooks - """ - emb, _, _ = self.dac.quantizer.from_codes(tokens.transpose(1, 2).int()) - return emb.transpose(1, 2) + Returns + ------- + result : torch.Tensor + The padded tensor + """ + if padding is None: + padding = torch.zeros(tensor.shape[1:]) + padding = padding[None, ...].expand( + (length - tensor.size(0),) + tensor.shape[1:] + ) + return torch.cat([tensor, padding], dim=0) -class SpeechTokenizerFeatureExtractor(nn.Module): - """This lobe enables the integration of HuggingFace and SpeechBrain - pretrained SpeechTokenizer. +def batch_feature_pad(tensors, padding=None): + """Similar to batch_pad_right but pads with the specified padding, which + can be a vector or a tensor - Please, install speechtokenizer: - pip install speechtokenizer + Arguments + --------- + tensors : list + The list of tensors to be padded + padding : torch.Tensor + The padding tensor - Source paper: https://arxiv.org/abs/2308.16692 + Returns + ------- + result : torch.Tensor + the padded tensor + """ + lengths_abs = torch.tensor( + [len(item) for item in tensors], device=tensors[0].device + ) + max_length = lengths_abs.max() + data = torch.stack( + [feature_pad_to(item, max_length, padding) for item in tensors] + ) + lengths = lengths_abs / max_length + return data, lengths - The model can be used as a fixed Discrete feature extractor or can be finetuned. It - will download automatically the model from HuggingFace or use a local path. +def token_collate_fn(examples, silence_token, token_keys): + """A customized collation function for audio tokens where + the specified silence token will be used as padding - instead of + zeros Arguments --------- - speech_tokenizer : speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - The speech tokenizer interface - codebooks : int, optional - The number of codebooks to use - if omitted, - """ + examples : list + A list of examples - def __init__(self, speech_tokenizer, codebooks=None): - super().__init__() - self.speech_tokenizer = speech_tokenizer - self.codebooks = codebooks + silence_token : torch.Tensor + The token(s) representing silence - def forward(self, wav, wav_lens=None): - """Takes an input waveform and return its corresponding wav2vec encoding. + token_keys : list + The list of keys to which special padding will be applied - Arguments - --------- - wav : torch.Tensor (signal) - A batch of audio signals to transform to features. - wav_lens : torch.Tensor - The relative length of the wav given in SpeechBrain format. 
+ Returns + ------- + result : speechbrain.dataio.batch.PaddedBatch + A padded batch + """ + token_tensor_ids = {id(examples[0][key]) for key in token_keys} + return PaddedBatch( + examples, + padding_func=_silence_padding, + padding_kwargs={ + "silence_token": silence_token, + "token_tensor_ids": token_tensor_ids, + }, + ) - Returns - ------- - tokens : torch.Tensor - A tensor of audio tokens - Shape: (N_q x Batch x Time) by default - (Batch x Time x N_q) if shape == compat - """ - return self.encode(wav, wav_lens) +def _silence_padding(values, silence_token, token_tensor_ids): + return ( + batch_feature_pad(values, silence_token) + if id(values[0]) in token_tensor_ids + else batch_pad_right(values) + ) - def encode(self, wav, wav_lens=None): - """Takes an input waveform and return its corresponding wav2vec encoding. - Arguments - --------- - wav : torch.Tensor (signal) - A batch of audio signals to transform to features. - wav_lens : torch.Tensor - The relative length of the wav given in SpeechBrain format. +def use_silence_padding(dataloader_opts, silence_token, token_keys): + """Overrides the collation function to add silence padding to + audio token features - Returns - ------- - tokens : torch.Tensor - A (Batch x Seq, N_q) tensor of audio tokens - - """ - # Extract discrete codes from SpeechTokenizer - codes = self.speech_tokenizer.encode( - wav.unsqueeze(1), wav_lens - ) # codes: (n_q, B, T) - if self.codebooks is not None: - codes = codes[: self.codebooks] - codes = codes.permute(1, 2, 0) - return codes - - def decode(self, codes): - """Takes an input waveform and return its corresponding wav2vec encoding. - - Arguments - --------- - tokens : torch.Tensor - A (N_q, Batch x Seq) tensor of audio tokens + Arguments + --------- + dataloder_opts : dict + Dataloader options + silence_token : torch.Tensor + The tensor to be used as silence padding + token_keys : torch.Tensor + The keys to apply silence padding to - Returns - ------- - wav : torch.Tensor (signal) - A batch of reconstructed audio signals. 
- """ - codes = codes.permute(2, 0, 1) - return self.speech_tokenizer.decode(codes) + Returns + ------- + dataloader_opts : dict + Updated data loader options + """ + return { + **dataloader_opts, + "collate_fn": partial( + token_collate_fn, silence_token=silence_token, token_keys=token_keys + ), + } diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index b6e11a0d2..31110cb58 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -1,5 +1,15 @@ +import math +import re +import speechbrain as sb import torch +from speechbrain.nnet.linear import Linear +from model.sq_codec import tokens_to_ternary +from speechbrain.utils.logger import get_logger + + +logger = get_logger(__name__) + class AttentionMLP(torch.nn.Module): def __init__(self, input_dim, hidden_dim): @@ -57,22 +67,31 @@ def __init__( num_codebooks, vocab_size, emb_dim, - pad_index=0, init=False, freeze=False, + hidden_dim=None, ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size - self.num_codebooks = num_codebooks + self.num_codebooks = ( + len(num_codebooks) + if isinstance(num_codebooks, list) + else num_codebooks + ) self.freeze = freeze self.embedding = torch.nn.Embedding( - num_codebooks * vocab_size, emb_dim + self.num_codebooks * vocab_size, emb_dim ).requires_grad_(not self.freeze) self.init = init + # Add a linear layer to match dimensions if necessary + if hidden_dim is not None and hidden_dim != emb_dim: + self.proj_layer = torch.nn.Linear(emb_dim, hidden_dim) + else: + self.proj_layer = None + def init_embedding(self, weights): - with torch.no_grad(): - self.embedding.weight = torch.nn.Parameter(weights) + self.embedding.weight.data.copy_(weights) def forward(self, in_tokens): """Computes the embedding for discrete tokens. 
@@ -97,4 +116,228 @@ def forward(self, in_tokens): ) # Forward Pass to embedding and in_embs = self.embedding(in_tokens) + if self.proj_layer is not None: + in_embs = self.proj_layer(in_embs) return in_embs + + +class TernaryPredictionHead(torch.nn.Module): + """An alternative prediction head that predicts a fixed number of ternary digits + for each position (as used in SQ-Codec) + + Arguments + --------- + d_model : int + The model dimension + num_positions : int + the number of positions + """ + + def __init__(self, d_model, num_positions, d_hidden=512, norm=True): + super().__init__() + self.num_positions = num_positions + self.d_model = d_model + self.norm = torch.nn.LayerNorm(d_model) if norm else torch.nn.Identity() + self.lin_hidden = Linear(input_size=d_model, n_neurons=d_hidden,) + self.act = torch.nn.LeakyReLU() + self.lin_p = Linear( + input_size=d_hidden, n_neurons=num_positions * 3, bias=False + ) + + def forward(self, x, track=None): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The decoder output (Batch x Length x d_model) + + track : int + The track index (if applicable) + + Returns + ------- + p : torch.Tensor + A tensor of shape (Batch x Length x num_positions x ternary digit) + The values are logits (unnormalized probabilities) + + p[:, :, :, 0] corresponds to -1 + p[:, :, :, 1] corresponds to 0 + p[:, :, :, 2] corresponds to 1 + """ + batch_size, max_len, _ = x.shape + x = self.norm(x) + x = self.lin_hidden(x) + x = self.act(x) + p = self.lin_p(x) + p = p.reshape(batch_size, max_len, self.num_positions, 3) + return p + + +class TernaryLogitTokenizer(torch.nn.Module): + """Converts ternary logits to probabilities + + Arguments + --------- + num_positions : int + The number of ternary digits/positions + num_tokens : int + The number of tokens + chunk_size : int + The size of the chunk (to prevent OOM) + mode : str + "probability" : treats the outputs as a probability distribution + "argmax" : "hard" mode, only the top probability is used. 
Cannot be used with + top_k sampling with k > 1 + + """ + + def __init__( + self, num_positions, num_tokens=None, num_tracks=4, chunk_size=10 + ): + super().__init__() + self.num_positions = num_positions + if num_tokens is None: + num_tokens = 3 ** num_positions + self.num_tokens = num_tokens + self.num_tracks = num_tracks + self.chunk_size = chunk_size + self.register_buffer("vocab", torch.arange(num_tokens)) + self.register_buffer( + "vocab_ternary", + tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + + 1, + ) + self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) + + def forward(self, logits): + batch_size, max_len, num_positions, _ = logits.shape + logits = logits.softmax(-1) + logits = logits.reshape( + batch_size, + max_len, + self.num_tracks, + 1, + num_positions // self.num_tracks, + 3, + ) + chunks = logits.chunk( + dim=1, chunks=math.ceil(logits.size(1) / self.chunk_size) + ) + token_logits_chunks = [] + for chunk in chunks: + token_logits_raw = ( + torch.where( + self.vocab_ternary[:, None, None, :, :, None] == self.idx, + chunk, + torch.ones_like(chunk), + ) + .prod(-1) + .log() + .sum(-1) + .exp() + ) + token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) + token_logits_chunks.append( + (token_logits_raw / token_logits_raw_sum).squeeze(2) + ) + token_logits = torch.cat(token_logits_chunks, dim=1) + return token_logits + + +@sb.utils.checkpoints.register_checkpoint_hooks +class SaveableGenerator: + """A wrapper that can be used to store the state of + the random number generator in a checkpoint. It helps + with reproducibility in long-running experiments. + + Currently, this only supports CPU and Cuda devices + natively. If you need training on other architectures, + consider implementing a custom generator. + + Running it on an unsupported device not using the Torch + generator interface will simply fail to restore the + state but will not cause an error. + + Arguments + --------- + generators : list, optional + A list of generator objects. 
If not provided, + """ + + def __init__(self, generators=None): + if generators is None: + generators = {"default": torch.default_generator} + if torch.cuda.is_available(): + for idx in range(torch.cuda.device_count()): + generators[f"cuda:{idx}"] = _CudaDefaultGeneratorWrapper( + idx + ) + + self.generators = generators + + @sb.utils.checkpoints.mark_as_saver + def _save(self, path): + save_dict = { + key: generator.get_state() + for key, generator in self.generators.items() + } + torch.save(save_dict, path) + + @sb.utils.checkpoints.mark_as_loader + def _recover(self, path, end_of_epoch): + del end_of_epoch + save_dict = torch.load(path) + for key, state in save_dict.items(): + if key == "default": + torch.default_generator.set_state(state) + continue + match = re.match(r"cuda:(\d+)", key) + if match: + if not torch.cuda.is_available(): + logger.warn( + "Unable to restore RNG for %s, CUDA unavailable", key + ) + continue + idx = int(match.group(1)) + if idx > torch.cuda.device_count() - 1: + logger.warn( + "Unable to restore RNG for %s, device not found", key + ) + continue + self.generators[key].set_state(state) + + +class _CudaDefaultGeneratorWrapper: + """A generator wrapper for default generators - because torch no longer + exposes default_generators + + This class should not be used outside of SaveableGenerator + + Arguments + --------- + device : int|str + The device index or identifier""" + + def __init__(self, device): + self.device = device + + def get_state(self): + """Returns the generator state + + Returns + ------- + result : torch.Tensor + The generator state + """ + return torch.cuda.get_rng_state(self.device) + + def set_state(self, new_state): + """"Sets the generator state + + Arguments + --------- + new_state : dict + The new state + """ + torch.cuda.set_rng_state(new_state, self.device) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py new file mode 100644 index 000000000..e5c1ea970 --- /dev/null +++ b/benchmarks/DASB/model/sq_codec.py @@ -0,0 +1,1566 @@ +"""This lobe enables the integration of speech codec model (SQ-Codec) with scalar quantization,. + +SQ-Codec effectively maps the complex speech signal into a finite and compact latent space, named scalar latent space. + +Repository: https://github.com/yangdongchao/SimpleSpeech +Paper: https://arxiv.org/abs/2406.02328, https://arxiv.org/abs/2408.13893 + +Authors + * Pooneh Mousavi 2024 +""" + +import logging +import os + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio +from omegaconf import OmegaConf +from torch.autograd import Function +from torch.nn.utils import remove_weight_norm, weight_norm + +from speechbrain.dataio.dataio import length_to_mask + + +class SQCodec(nn.Module): + """ + Speech codec model (SQ-Codec) with scalar quantization. It maps the complex speech signal into a finite and compact latent space. + The model consists of an encoder-decoder architecture with optional causal convolutions, downsampling, and upsampling layers. + It uses vector quantization and various convolutional blocks for processing. 
+ + Make sure that you download and extract the SQ-codec.zip in save_path from following Huggingface repo: + - HF repo: https://huggingface.co/Dongchao/UniAudio/blob/main/SQ-Codec.zip + + Repository: https://github.com/yangdongchao/SimpleSpeech + Paper: https://arxiv.org/abs/2406.02328, https://arxiv.org/abs/2408.13893 + + Arguments + --------- + save_path : str, optional + Directory where the model and configuration files are saved (default is None). + config : str, optional + Configuration filename for the model. It is extracted form zip file(default is 'config.yaml'). + checkpoint : str, optional + Model checkpoint filename. It is extracted form zip file( (default is 'ckpt_00190000.pth'). + sample_rate : int, optional + Sample rate for input audio (default is 16000). + dim_codebook : int, optional + Dimension of each codebook (default is 19683). + n_codebook : int, optional + Number of codebooks used (default is 4). + bw : float, optional + Bandwidth parameter (default is 2). + clip_length : int, optional + Maximum clip length for processing (default is 450). + + Example + ------- + >>> save_path = "savedir" + >>> config = "config.yaml" + >>> checkpoint = "ckpt_00190000.pth" + >>> model = SQCodec(save_path, config, checkpoint) + >>> audio = torch.randn(3, 16000) + >>> tokens, emb = model.encode(audio) + >>> tokens.shape + torch.Size([3, 200]) + >>> emb.shape + torch.Size([3, 36, 50]) + >>> rec = model.decode(tokens) + >>> rec.shape + torch.Size([3, 1, 16000]) + """ + + def __init__( + self, + save_path, + config, + checkpoint, + sample_rate=16000, + dim_codebook=19683, + n_codebook=4, + bw=2, + clip_length=450, + ): + super(SQCodec, self).__init__() + self.config_path = os.path.join(save_path, config) + self.ckpt_path = os.path.join(save_path, checkpoint) + if not os.path.exists(self.config_path) and not os.path.exists( + self.ckpt_path + ): + err_msg = ( + "the files %s or %s does not exist." + "(make sure that you download and extract the SQ-codec.zip in save_path from following Huggingface repo:" + " https://huggingface.co/Dongchao/UniAudio/blob/main/SQ-Codec.zip)" + % (self.ckpt_path, self.config_path) + ) + raise FileNotFoundError(err_msg) + self.clip_length = clip_length + + logging.info( + f"Using config {self.config_path} and model {self.ckpt_path}" + ) + + self.scalar_codec = self.build_codec_model(self.config_path) + self.sample_rate = sample_rate + self.dim_codebook = dim_codebook + self.n_codebook = n_codebook + self.bw = bw + self.mask_id = self.dim_codebook * self.n_codebook + + def build_codec_model(self, config): + """ + Loads and builds the scalar codec model from the given configuration. + + Parameters + ---------- + config : str + Path to the configuration file. + + Returns + ------- + ScalarModel + The built scalar codec model loaded with weights from the checkpoint. + """ + exp_model_config = OmegaConf.load(config) + scalar_codec = ScalarModel(**exp_model_config.generator.config) + device = next(iter(scalar_codec.parameters())).device + parameter_dict = torch.load( + self.ckpt_path, map_location=device, weights_only=False + ) + scalar_codec.load_state_dict(parameter_dict["codec_model"]) + return scalar_codec + + def _flatten_codebooks(self, arr, offset_size=None): + """ + Flattens a 3D array (B, N, D) to a 1D array while applying an offset to each codebook if specified. + + Parameters + ---------- + arr : numpy.ndarray + A 3D array of shape (B, N, D). + offset_size : int or None, optional + The offset size to be applied to each codebook slice (default is None). 
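+
+        For example (an illustrative sketch of the reshaping only), two
+        codebooks with three frames each, ``[a1, a2, a3]`` and
+        ``[b1, b2, b3]``, are interleaved frame by frame into a single row
+        per batch element: ``[a1, b1, a2, b2, a3, b3]``.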
+ + Returns + ------- + numpy.ndarray + A 1D array representing the flattened codebooks. + """ + assert ( + len(arr.shape) == 3 + ), "Input array must have 3 dimensions [B, N, D]" + N, B, D = arr.shape + arr = arr.copy() + # if offset_size is not None: + # for n in range(N): + # arr[n, :, :] += offset_size * n + flattened_arr = arr.transpose(1, 2, 0).reshape(B, N * D) + return flattened_arr + + def encode(self, inputs): + """ + Encodes the input audio tensor using the scalar codec and quantizes the output. + + Parameters + ---------- + inputs : torch.Tensor + Input audio tensor of shape (B, T) or (B, 1, T), where B is the batch size + and T is the length of the audio sequence. + + Returns + ------- + tuple + A tuple containing: + - torch.Tensor: The flattened and quantized encoded representation of the input. + - torch.Tensor: Quantized embedding. + """ + if inputs.dim() == 2: + inputs = inputs.unsqueeze(1) + compressed = self.scalar_codec.encode(inputs) + chunks = compressed.chunk(self.n_codebook, dim=1) + codec_ls = [] + for i, chunk in enumerate(chunks): + chunk = chunk.detach().cpu().numpy().astype(np.int32) + 1 + tmp_codec = ternary_matrix_to_decimal(chunk) + codec_ls.append(tmp_codec) + codec_ls = np.array(codec_ls) + flat_codec = self._flatten_codebooks(codec_ls, self.dim_codebook) + flat_codec = torch.from_numpy(flat_codec).to(torch.int32) + return flat_codec.to(inputs.device), compressed.to(inputs.device) + + def decode(self, codes): + """ + Decodes the quantized codes back into an audio tensor. + + Parameters + ---------- + codes : torch.Tensor + Quantized codes with shape (B, T). + + Returns + ------- + torch.Tensor + Reconstructed audio signal. + """ + assert codes.dim() == 2 + B, T = codes.shape + assert ( + T % self.n_codebook == 0 + ), "Length T must be divisible by n_codebook" + codes = codes.view(B, -1, self.n_codebook).permute(2, 0, 1) + # for i in range(self.n_codebook): + # codes[i, :, :] -= i * self.dim_codebook + emb_quant = [] + for i in range(self.n_codebook): + tmp_list = decimal_to_ternary_matrix(codes[i, :, :], D=9) - 1 + emb_quant.append(tmp_list) + emb_quant = torch.cat(emb_quant, dim=1) + out = self.scalar_codec.decode(emb_quant.float().to(codes.device)) + return out.detach().cpu().squeeze(0) + + def reconstruct(self, wav_root): + """ + Processes a given waveform file by encoding and decoding it through the scalar codec. + + Parameters + ---------- + wav_root : str + Path to the waveform file. + + Returns + ------- + torch.Tensor or None + Processed waveform tensor or None if the file is empty. + """ + wav, sr = torchaudio.load(wav_root) + if wav.numel() == 0: + return None + if sr != self.sample_rate: + wav = torchaudio.transforms.Resample(sr, self.sample_rate)(wav) + wav = wav.unsqueeze(1) + emb, emb_quant, x = self.scalar_codec.inference(wav) + return x.detach().cpu().squeeze(0) + + @property + def is_discrete(self): + """Indicates whether the codec works with discrete values.""" + return True + + @property + def codebook_length(self): + """Returns the total length of the codebook.""" + return self.dim_codebook * self.n_codebook + 1 + + def find_length(self, x): + """ + Finds the length of the tokenized version of the input tensor. + + Parameters + ---------- + x : torch.Tensor + Input tensor. + + Returns + ------- + int + The length of the tokenized input. + """ + return self.tokenize(x).shape[0] // self.n_codebook + + +class ScalarModel(nn.Module): + """ + A custom neural network model for encoding and decoding audio signals. 
+ + The model consists of an encoder-decoder architecture with optional + causal convolutions, downsampling, and upsampling layers. It uses + vector quantization and various convolutional blocks for processing. + + + Arguments + --------- + num_bands : int + Number of input bands (or channels). + sample_rate : int + Sample rate of the input signal. + causal : bool + If True, uses causal convolutions for processing. + num_samples : int + Number of samples to process for downsampling or upsampling. + downsample_factors : list of int + List of factors to downsample the input. + downsample_kernel_sizes : list of int + List of kernel sizes for downsampling layers. + upsample_factors : list of int + List of factors to upsample the input. + upsample_kernel_sizes : list of int + List of kernel sizes for upsampling layers. + latent_hidden_dim : int + Dimension of the latent representation. + default_kernel_size : int + Default kernel size for convolutional layers. + delay_kernel_size : int + Kernel size used for the delay convolutional layer. + init_channel : int + Number of initial channels for the encoder and decoder. + res_kernel_size : int + Kernel size used for the residual convolutional blocks. + + Example + ------- + >>> model = ScalarModel(num_bands=1, sample_rate=16000,causal=True,num_samples=2,downsample_factors=[2,4,4,5],downsample_kernel_sizes=[4,8,8,10],upsample_factors=[5,4,4,2],upsample_kernel_sizes=[10,8,8,4],latent_hidden_dim=36,default_kernel_size=7,delay_kernel_size=5,init_channel=48,res_kernel_size=7) # doctest: +SKIP + >>> audio = torch.randn(3, 1, 16000) + >>> quant_emb = model.encode(audio) # doctest: +SKIP + >>> quant_emb.shape + torch.Size([3, 36, 50]) + >>> rec = model.decode(quant_emb) # doctest: +SKIP + >>> rec.shap) # doctest: +SKIP + torch.Size([3, 1, 16000]) + """ + + def __init__( + self, + num_bands, + sample_rate, + causal, + num_samples, + downsample_factors, + downsample_kernel_sizes, + upsample_factors, + upsample_kernel_sizes, + latent_hidden_dim, + default_kernel_size, + delay_kernel_size, + init_channel, + res_kernel_size, + ): + super(ScalarModel, self).__init__() + self.sample_rate = sample_rate + self.encoder = [] + self.decoder = [] + self.vq = lambda x: CustomRoundingFunction.apply(x, "binary") + + # Encoder layers + self.encoder.append( + weight_norm( + Conv1d( + num_bands, + init_channel, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + ) + if num_samples > 1: + # Downsampling layer + self.encoder.append( + PreProcessor( + init_channel, + init_channel, + num_samples, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + for i, down_factor in enumerate(downsample_factors): + self.encoder.append( + ResEncoderBlock( + init_channel * np.power(2, i), + init_channel * np.power(2, i + 1), + down_factor, + downsample_kernel_sizes[i], + res_kernel_size, + causal=causal, + ) + ) + self.encoder.append( + weight_norm( + Conv1d( + init_channel * np.power(2, len(downsample_factors)), + latent_hidden_dim, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + ) + + # Decoder layers + self.decoder.append( + weight_norm( + Conv1d( + latent_hidden_dim, + init_channel * np.power(2, len(upsample_factors)), + kernel_size=delay_kernel_size, + ) + ) + ) + for i, upsample_factor in enumerate(upsample_factors): + self.decoder.append( + ResDecoderBlock( + init_channel * np.power(2, len(upsample_factors) - i), + init_channel * np.power(2, len(upsample_factors) - i - 1), + upsample_factor, + upsample_kernel_sizes[i], + res_kernel_size, + causal=causal, 
+ ) + ) + if num_samples > 1: + self.decoder.append( + PostProcessor( + init_channel, + init_channel, + num_samples, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + self.decoder.append( + weight_norm( + Conv1d( + init_channel, + num_bands, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + ) + + self.encoder = nn.ModuleList(self.encoder) + self.decoder = nn.ModuleList(self.decoder) + + def forward(self, x): + """ + Performs a forward pass through the encoder and decoder. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch_size, channels, length). + + Returns + ------- + torch.Tensor + Reconstructed output tensor. + """ + for i, layer in enumerate(self.encoder): + if i != len(self.encoder) - 1: + x = layer(x) + else: + x = F.tanh(layer(x)) + x = self.vq(x) # Quantization step + for i, layer in enumerate(self.decoder): + x = layer(x) + return x + + def inference(self, x): + """ + Encodes input tensor `x` and decodes the quantized embeddings. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch_size, channels, length). + + Returns + ------- + tuple + A tuple (emb, emb_quant, x), where `emb` is the latent embedding, + `emb_quant` is the quantized embedding, and `x` is the decoded output. + """ + for i, layer in enumerate(self.encoder): + if i != len(self.encoder) - 1: + x = layer(x) + else: + x = F.tanh(layer(x)) + emb = x + emb_quant = self.vq(emb) + x = emb_quant + for i, layer in enumerate(self.decoder): + x = layer(x) + return emb, emb_quant, x + + def encode(self, x): + """ + Encodes the input tensor `x` into a quantized embedding. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch_size, channels, length). + + Returns + ------- + torch.Tensor + Quantized embedding. + """ + for i, layer in enumerate(self.encoder): + if i != len(self.encoder) - 1: + x = layer(x) + else: + x = F.tanh(layer(x)) + emb = x + emb_quant = self.vq(emb) + return emb_quant + + def decode(self, emb_quant): + """ + Decodes the quantized embeddings back into a tensor. + + Parameters + ---------- + emb_quant : torch.Tensor + Quantized embedding tensor. + + Returns + ------- + torch.Tensor + Reconstructed output tensor. + """ + x = emb_quant + for i, layer in enumerate(self.decoder): + x = layer(x) + return x + + +class CustomRoundingFunction(Function): + """ + A customizable rounding function for various rounding operations, including: + - Rounding to the nearest multiple of a specified divisor. + - Rounding to the nearest integer. + - Applying the Heaviside step function. + + Arguments + --------- + mode : str + The mode of the operation. Can be 'round', 'binary', or 'heaviside'. + divisor : float, optional + The divisor for rounding. Only used in 'round' mode. + """ + + @staticmethod + def forward(ctx, input, mode="round", divisor=1.0): + """ + Forward pass for the custom rounding function. + + Arguments + --------- + ctx : context object + Context object used to store information for the backward computation. + input : torch.Tensor + The input tensor to be processed. + mode : str + The mode of the operation ('round', 'binary', 'heaviside'). + divisor : float + The divisor for rounding. Only used in 'round' mode. + + Returns + ------- + torch.Tensor + The processed tensor after applying the operation. 
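+
+        Example
+        -------
+        An illustrative sketch of the straight-through behavior of the
+        ``binary`` mode: the forward pass rounds, while the backward pass
+        returns the incoming gradient unchanged.
+
+        >>> x = torch.tensor([0.2, 0.7, 1.6], requires_grad=True)
+        >>> y = CustomRoundingFunction.apply(x, "binary")
+        >>> y.detach()
+        tensor([0., 1., 2.])
+        >>> y.sum().backward()
+        >>> x.grad
+        tensor([1., 1., 1.])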
+ """ + ctx.mode = mode + ctx.divisor = divisor + + if mode == "round": + return torch.round(divisor * input) / divisor + elif mode == "binary": + return torch.round(input) + elif mode == "heaviside": + values = torch.tensor([0.0]).type_as(input) + return torch.heaviside(input, values) + else: + raise ValueError( + f"Invalid mode '{mode}'. Supported modes: 'round', 'binary', 'heaviside'." + ) + + @staticmethod + def backward(ctx, grad_output): + """ + Backward pass for the custom rounding function. + + Arguments + --------- + ctx : context object + Context object containing information saved during the forward pass. + grad_output : torch.Tensor + The gradient of the output with respect to the loss. + + Returns + ------- + torch.Tensor + The gradient of the input with respect to the loss. + """ + # For all modes, the gradient is propagated unchanged. + return grad_output.clone(), None, None + + +class PreProcessor(nn.Module): + """ + A module for preprocessing input data through convolution and pooling operations. + It is used as an initial step before the encoder blocks in the ScalarModel, particularly when the kernel_size for average pooling operation exceeds 1. + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + num_samples : int + Number of samples for pooling. + kernel_size : int, optional + Size of the convolutional kernel (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__(self, n_in, n_out, num_samples, kernel_size=7, causal=False): + super(PreProcessor, self).__init__() + self.pooling = torch.nn.AvgPool1d(kernel_size=num_samples) + self.conv = Conv1d(n_in, n_out, kernel_size=kernel_size, causal=causal) + self.activation = nn.PReLU() + + def forward(self, x): + """ + Applies convolution, activation, and pooling to the input data. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + output = self.activation(self.conv(x)) + output = self.pooling(output) + return output + + +class PostProcessor(nn.Module): + """ + A module for postprocessing data through convolution and reshaping. + It is used as an initial step after the decoder blocks in the ScalarModel, particularly when the kernel_size for average pooling operation exceeds 1. + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + num_samples : int + Number of samples for repetition. + kernel_size : int, optional + Size of the convolutional kernel (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__(self, n_in, n_out, num_samples, kernel_size=7, causal=False): + super(PostProcessor, self).__init__() + self.num_samples = num_samples + self.conv = Conv1d(n_in, n_out, kernel_size=kernel_size, causal=causal) + self.activation = nn.PReLU() + + def forward(self, x): + """ + Applies reshaping, repetition, and convolution to the input data. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + x = torch.transpose(x, 1, 2) + B, T, C = x.size() + x = x.repeat(1, 1, self.num_samples).view(B, -1, C) + x = torch.transpose(x, 1, 2) + output = self.activation(self.conv(x)) + return output + + +class DownsampleLayer(nn.Module): + """ + A downsampling layer that applies convolution, optional pooling, and activation. 
+ + Arguments + --------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int + Size of the convolutional kernel. + stride : int, optional + Stride of the convolution (default is 1). + causal : bool, optional + If True, applies causal convolution (default is False). + activation : nn.Module, optional + Activation function (default is PReLU). + use_weight_norm : bool, optional + If True, applies weight normalization to the convolution (default is True). + pooling : bool, optional + If True, applies an average pooling operation (default is False). + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + causal: bool = False, + activation=nn.PReLU(), + use_weight_norm: bool = True, + pooling: bool = False, + ): + super(DownsampleLayer, self).__init__() + self.pooling = pooling + self.stride = stride + self.activation = activation + self.use_weight_norm = use_weight_norm + if pooling: + self.layer = Conv1d( + in_channels, out_channels, kernel_size, causal=causal + ) + self.pooling = nn.AvgPool1d(kernel_size=stride) + else: + self.layer = Conv1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + causal=causal, + ) + if use_weight_norm: + self.layer = weight_norm(self.layer) + + def forward(self, x): + """ + Applies convolution, optional pooling, and activation to the input data. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + x = self.layer(x) + x = self.activation(x) if self.activation is not None else x + if self.pooling: + x = self.pooling(x) + return x + + def remove_weight_norm(self): + """ + Removes weight normalization from the convolutional layer. + """ + if self.use_weight_norm: + remove_weight_norm(self.layer) + + +class UpsampleLayer(nn.Module): + """ + An upsampling layer that applies transposed convolution or repetition, with activation. + + Arguments + --------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int + Size of the convolutional kernel. + stride : int, optional + Stride of the transposed convolution (default is 1). + causal : bool, optional + If True, applies causal convolution (default is False). + activation : nn.Module, optional + Activation function (default is PReLU). + use_weight_norm : bool, optional + If True, applies weight normalization to the convolution (default is True). + repeat : bool, optional + If True, applies repetition instead of transposed convolution (default is False). + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + causal: bool = False, + activation=nn.PReLU(), + use_weight_norm: bool = True, + repeat: bool = False, + ): + super(UpsampleLayer, self).__init__() + self.repeat = repeat + self.stride = stride + self.activation = activation + self.use_weight_norm = use_weight_norm + if repeat: + self.layer = Conv1d( + in_channels, out_channels, kernel_size, causal=causal + ) + else: + self.layer = ConvTranspose1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + causal=causal, + ) + if use_weight_norm: + self.layer = weight_norm(self.layer) + + def forward(self, x): + """ + Applies upsampling through transposed convolution or repetition, followed by activation. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. 
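+
+        Example
+        -------
+        An illustrative shape check: with ``repeat=True`` every frame is
+        repeated ``stride`` times, so the output is ``stride`` times longer
+        than the input.
+
+        >>> up = UpsampleLayer(4, 4, kernel_size=3, stride=2, repeat=True)
+        >>> x = torch.randn(1, 4, 10)
+        >>> up(x).shape
+        torch.Size([1, 4, 20])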
+ """ + x = self.layer(x) + x = self.activation(x) if self.activation is not None else x + if self.repeat: + x = torch.transpose(x, 1, 2) + B, T, C = x.size() + x = x.repeat(1, 1, self.stride).view(B, -1, C) + x = torch.transpose(x, 1, 2) + return x + + def remove_weight_norm(self): + """ + Removes weight normalization from the convolutional layer. + """ + if self.use_weight_norm: + remove_weight_norm(self.layer) + + +class ResidualUnit(nn.Module): + """ + A residual unit with two convolutional layers and activation functions. + This module is commonly used in the encoder and decoder blocks of the ScalarModel + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + dilation : int + Dilation factor for the first convolutional layer. + res_kernel_size : int, optional + Size of the convolutional kernel for residual connections (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__(self, n_in, n_out, dilation, res_kernel_size=7, causal=False): + super(ResidualUnit, self).__init__() + self.conv1 = weight_norm( + Conv1d( + n_in, + n_out, + kernel_size=res_kernel_size, + dilation=dilation, + causal=causal, + ) + ) + self.conv2 = weight_norm( + Conv1d(n_in, n_out, kernel_size=1, causal=causal) + ) + self.activation1 = nn.PReLU() + self.activation2 = nn.PReLU() + + def forward(self, x): + """ + Applies two convolutional layers with activations and adds the input for a residual connection. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Output tensor with residual connection applied. + """ + output = self.activation1(self.conv1(x)) + output = self.activation2(self.conv2(output)) + return output + x + + +class ResEncoderBlock(nn.Module): + """ + A residual encoder block with multiple residual units and a downsampling layer. + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + stride : int + Stride for the downsampling layer. + down_kernel_size : int + Kernel size for the downsampling layer. + res_kernel_size : int, optional + Size of the convolutional kernel for residual connections (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__( + self, + n_in, + n_out, + stride, + down_kernel_size, + res_kernel_size=7, + causal=False, + ): + super(ResEncoderBlock, self).__init__() + self.convs = nn.ModuleList( + [ + ResidualUnit( + n_in, + n_out // 2, + dilation=1, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out // 2, + n_out // 2, + dilation=3, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out // 2, + n_out // 2, + dilation=5, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out // 2, + n_out // 2, + dilation=7, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out // 2, + n_out // 2, + dilation=9, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ] + ) + self.down_conv = DownsampleLayer( + n_in, n_out, down_kernel_size, stride=stride, causal=causal + ) + + def forward(self, x): + """ + Applies a series of residual units and a downsampling layer. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. 
+ """ + for conv in self.convs: + x = conv(x) + x = self.down_conv(x) + return x + + +class ResDecoderBlock(nn.Module): + """ + A residual decoder block with upsampling and multiple residual units. + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + stride : int + Stride for the upsampling layer. + up_kernel_size : int + Kernel size for the upsampling layer. + res_kernel_size : int, optional + Size of the convolutional kernel for residual connections (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__( + self, + n_in, + n_out, + stride, + up_kernel_size, + res_kernel_size=7, + causal=False, + ): + super(ResDecoderBlock, self).__init__() + self.up_conv = UpsampleLayer( + n_in, + n_out, + kernel_size=up_kernel_size, + stride=stride, + causal=causal, + activation=None, + ) + self.convs = nn.ModuleList( + [ + ResidualUnit( + n_out, + n_out, + dilation=1, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out, + n_out, + dilation=3, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out, + n_out, + dilation=5, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out, + n_out, + dilation=7, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out, + n_out, + dilation=9, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ] + ) + + def forward(self, x): + """ + Applies upsampling followed by a series of residual units. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + x = self.up_conv(x) + for conv in self.convs: + x = conv(x) + return x + + +class Conv1d(nn.Conv1d): + """ + Custom 1D convolution layer with an optional causal mode. + + This class extends PyTorch's `nn.Conv1d` and allows for causal convolutions + by automatically applying the correct amount of padding to ensure that the output + does not depend on future inputs, which is useful for sequential data processing. + + Arguments + --------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int + Size of the convolutional kernel. + stride : int, optional + Stride of the convolution (default is 1). + dilation : int, optional + Dilation factor for the convolution (default is 1). + groups : int, optional + Number of blocked connections from input channels to output channels (default is 1). + padding_mode : str, optional + Padding mode to use ('zeros', 'reflect', 'replicate', or 'circular') (default is 'zeros'). + bias : bool, optional + If True, adds a learnable bias to the output (default is True). + padding : int, optional + Explicit padding value. If not provided, it will be computed automatically. + causal : bool, optional + If True, applies causal convolution where the output depends only on the past and current inputs (default is False). + w_init_gain : str, optional + Gain value used for Xavier initialization (e.g., 'relu', 'tanh', etc.). If provided, applies Xavier uniform initialization to the convolutional weights. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + dilation: int = 1, + groups: int = 1, + padding_mode: str = "zeros", + bias: bool = True, + padding=None, + causal: bool = False, + w_init_gain=None, + ): + self.causal = causal + if padding is None: + if causal: + padding = 0 + self.left_padding = dilation * (kernel_size - 1) + else: + padding = get_padding(kernel_size, dilation) + super(Conv1d, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + padding_mode=padding_mode, + bias=bias, + ) + if w_init_gain is not None: + torch.nn.init.xavier_uniform_( + self.weight, gain=torch.nn.init.calculate_gain(w_init_gain) + ) + + def forward(self, x): + """ + Applies the forward pass of the convolutional layer. + + Arguments + --------- + x : torch.Tensor + Input tensor of shape (batch_size, channels, sequence_length). + + Returns + ------- + torch.Tensor + The output tensor after applying the convolution operation. + If `causal` is True, the input tensor is padded to ensure that + the output at each timestep only depends on the current and previous inputs. + """ + if self.causal: + x = F.pad(x.unsqueeze(2), (self.left_padding, 0, 0, 0)).squeeze(2) + + return super(Conv1d, self).forward(x) + + +class ConvTranspose1d(nn.ConvTranspose1d): + """ + Custom transposed 1D convolution layer with causal option. + + Arguments + --------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int + Size of the convolutional kernel. + stride : int, optional + Stride of the convolution (default is 1). + output_padding : int, optional + Additional size added to one side of the output (default is 0). + groups : int, optional + Number of blocked connections (default is 1). + bias : bool, optional + If True, adds a learnable bias (default is True). + dilation : int, optional + Dilation factor (default is 1). + padding : int, optional + Explicit padding value (default is None). + padding_mode : str, optional + Padding mode (default is 'zeros'). + causal : bool, optional + If True, applies causal convolution. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + output_padding: int = 0, + groups: int = 1, + bias: bool = True, + dilation: int = 1, + padding=None, + padding_mode: str = "zeros", + causal: bool = False, + ): + if padding is None: + padding = 0 if causal else (kernel_size - stride) // 2 + if causal: + assert ( + padding == 0 + ), "padding is not allowed in causal ConvTranspose1d." + assert ( + kernel_size == 2 * stride + ), "kernel_size must be equal to 2*stride is not allowed in causal ConvTranspose1d." + super(ConvTranspose1d, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + bias=bias, + dilation=dilation, + padding_mode=padding_mode, + ) + self.causal = causal + self.stride = stride + + def forward(self, x): + """ + Applies the transposed convolution operation. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Transposed convolved output tensor. 
+ """ + x = super(ConvTranspose1d, self).forward(x) + if self.causal: + x = x[:, :, : -self.stride] + return x + + +class TernaryEmbedding(nn.Module): + """A module wrapper for tokens-to-ternary conversion + + Arguments + --------- + num_digits : int + The number of ternary digits""" + + def __init__(self, num_digits, emb_size=512, flat=False): + super().__init__() + self.num_digits = num_digits + self.flat = flat + + def forward(self, tokens): + """Computes the forward pass + + Arguments + --------- + tokens : torch.Tensor + the tokens + """ + squeeze = False + if tokens.dim() < 3: + squeeze = True + tokens = tokens.unsqueeze(-1) + batch_size, max_len, tracks = tokens.shape + emb = tokens_to_ternary(tokens, D=self.num_digits).float() + positions = emb.size(-1) + if self.flat: + emb = emb.unsqueeze(-2) + else: + emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) + if squeeze: + emb = emb.squeeze(-2) + return emb + + +def decimal_to_ternary_matrix(decimals, D): + """ + Convert a tensor of decimal numbers to a D*T ternary matrix for each batch. + + Arguments + --------- + decimals : torch.Tensor + A 2D tensor of decimal numbers with shape (B, T), where B is the batch size + and T is the number of elements in each batch. + D : int + Number of ternary digits to represent each number (depth). + + Returns + ------- + torch.Tensor + A 3D tensor of shape (B, D, T) where each slice along the first dimension + corresponds to a batch, and each column is represented as a ternary number. + """ + B, T = decimals.shape + ternary_matrix = torch.zeros( + (B, D, T), dtype=torch.long, device=decimals.device + ) + for pos in range(D): + ternary_matrix[:, pos, :] = decimals % 3 # Modulo operation + decimals //= 3 # Floor division for next ternary digit + + return ternary_matrix + + +def ternary_matrix_to_decimal(matrix): + """ + Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. + + Arguments + --------- + matrix : numpy.ndarray + A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number + of ternary digits, and N is the number of ternary numbers in each batch. + + Returns + ------- + numpy.ndarray + A 2D numpy array of shape (B, N), where each value represents the decimal + equivalent of the corresponding ternary number in the input matrix. + """ + ( + B, + D, + N, + ) = ( + matrix.shape + ) # B is the batch size, D is the number of digits, N is the number of ternary numbers + powers_of_three = 3 ** np.arange(D) # [3^0, 3^1, ..., 3^(D-1)] + + # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] + powers_of_three = powers_of_three[:, np.newaxis] # Shape [D, 1] + + # Compute dot product using broadcasting: matrix * powers_of_three along D axis + decimals = np.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + + return decimals + + +def ternary_matrix_to_decimal_torch(matrix): + """ + Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. + + Arguments + --------- + matrix : numpy.ndarray + A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number + of ternary digits, and N is the number of ternary numbers in each batch. + + Returns + ------- + numpy.ndarray + A 2D numpy array of shape (B, N), where each value represents the decimal + equivalent of the corresponding ternary number in the input matrix. 
+ """ + ( + B, + D, + N, + ) = ( + matrix.shape + ) # B is the batch size, D is the number of digits, N is the number of ternary numbers + powers_of_three = 3 ** torch.arange( + D, device=matrix.device + ) # [3^0, 3^1, ..., 3^(D-1)] + + # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] + powers_of_three = powers_of_three[:, None] # Shape [D, 1] + + # Compute dot product using broadcasting: matrix * powers_of_three along D axis + decimals = torch.sum( + matrix * powers_of_three, axis=1 + ) # Sum along the D axis + + return decimals + + +def get_padding(kernel_size, dilation=1): + """ + Computes the padding size for a given kernel size and dilation. + + Arguments + --------- + kernel_size : int + Size of the convolutional kernel. + dilation : int, optional + Dilation factor for convolution (default is 1). + + Returns + ------- + int + Calculated padding size. + """ + return int((kernel_size * dilation - dilation) / 2) + + +def ternary_to_decimal(ternary, n_codebook=4): + """Converts ternary digits to their decimal equivalent + + Arguments + --------- + ternary : torch.Tensor + (Batch x Length x num_positions) - ternary digits + n_codebooks : torch.Tensor + The number of codebooks + + Returns + ------- + result: torch.Tensor + the result (Batch x Length x codebooks) + """ + chunks = ternary.chunk(n_codebook, dim=1) + codec_ls = [] + # TODO: Vectorize + for i, chunk in enumerate(chunks): + chunk = chunk + 1 + tmp_codec = ternary_matrix_to_decimal_torch(chunk) + codec_ls.append(tmp_codec) + codec_ls = torch.stack(codec_ls) + return codec_ls.permute(1, 2, 0) + + +def ternary_logits_to_tokens(logits, n_codebook=4): + """Converts ternary logits to tokens (as used for SQ-Codec) + + Arguments + --------- + logits : torch.Tensor + The logits + + Returns + ------- + tokens : torch.Tensor + Token IDs + """ + ternary_matrix = logits_to_ternary(logits) + tokens = ternary_to_decimal( + ternary_matrix.transpose(-1, -2), n_codebook=n_codebook + ) + return tokens + + +def tokens_to_ternary(tokens, D=9): + """Converts a sequence of tokens to a ternary matrix + + Arguments + --------- + tokens : torch.Tensor + A (Batch x Length x Codebooks) tensor of tokens + D : int + The number of ternary digits + + Returns + ------- + result : torch.Tensor + A (Batch x Length x Ternary Positions) tensor + with values of (-1, 0, 1)""" + has_batch = tokens.dim() > 2 + if not has_batch: + tokens = tokens.unsqueeze(0) + batch_size = tokens.size(0) + n_codebook = tokens.size(2) + tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() + ternary_matrix = torch.cat( + [decimal_to_ternary_matrix(item, D=D) - 1 for item in tokens], dim=1 + ) + ternary_matrix = ternary_matrix.transpose(1, 2) + if not has_batch: + ternary_matrix = ternary_matrix[0] + return ternary_matrix + + +def logits_to_ternary(logits): + """Converts a tensor with two logits to a ternary matrix + + Arguments + --------- + logits : torch.Tensor + The logits (Batch x Length x num_positions x 3) + + Returns + ------- + result : torch.Tensor + The corresponding ternary matrix + """ + ternary = logits.argmax(-1) - 1 + return ternary + + +def ternary_loss( + predictions, + targets, + length=None, + mask=None, + targets_type="ternary", + num_positions=9, + reduction="mean", +): + if targets.dim() < 3: + targets = targets.unsqueeze(-1) + if targets_type == "tokens": + targets = tokens_to_ternary(targets.unsqueeze(-1), D=num_positions) + batch_size, max_len, positions = targets.shape + targets_cat = targets + 1 + predictions_loss = 
predictions.permute(0, 3, 1, 2).contiguous() + loss = nn.functional.nll_loss( + predictions_loss, targets_cat, reduction="none" + ) + if length is not None: + mask = length_to_mask(length * max_len, max_len) + mask = mask.unsqueeze(-1) + if mask is not None: + loss = loss * mask + if reduction == "mean": + loss = loss.sum(2).sum(1).sum(0) / mask.sum() + elif reduction == "batch": + loss = loss.sum(2).sum(1) / mask.sum(-1).sum(-1) + return loss diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py new file mode 100644 index 000000000..98f7067a8 --- /dev/null +++ b/benchmarks/DASB/model/valle.py @@ -0,0 +1,1593 @@ +"""An adaptation of ESPNET VALL-E for SpeechBrain +Originally by Jinchuan Tian + +https://github.com/espnet/espnet + +Authors + * Artem Ploujnikov 2024 (adaptation only) +""" + +# Copyright 2024 Jinchuan Tian +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# Implementation of Vall-E: https://arxiv.org/abs/2301.02111 + +from io import StringIO +import logging +import re +import string +import torch +import inspect +import torchaudio +from typing import Tuple, Optional +from speechbrain.dataio.dataio import length_to_mask +from speechbrain.utils.metric_stats import ErrorRateStats + +from torch import Tensor +from torch import nn +from torch.nn import functional as F +from dataclasses import dataclass + +from speechbrain.nnet.losses import reduce_loss, truncate +from speechbrain.lobes.models.huggingface_transformers import Whisper +from speechbrain.decoders.seq2seq import S2SWhisperGreedySearcher +from speechbrain.utils.data_utils import batch_pad_right +from speechbrain.utils.logger import get_logger +from utils.data import undo_padding_tensor + +logger = get_logger(__name__) + + +@dataclass +class SpeechLMInferenceOptions: + """Inference options + """ + + device: str = None + search_algo: str = "topk_sampling" + nbest: int = 1 + sampling_temperature: float = 1.0 + top_k: int = 20 + maxlenratio: float = 0.0 + minlenratio: float = 0.0 + eos: int = 5 + start: int = 1 + masks: torch.Tensor = None + nq: int = None + allow_invalid: bool = True + + +class ValleLM(nn.Module): + """The Vall-E TTS model (decoder-only transformer), adopted from + ESPNET2 + + Arguments + --------- + vocab_size : int + Dimention of vocabulary. + nq : int + Number of codes for each token / frame, usually for speech codec. + share_emb : bool + If true, share the embedding and lm_head weight. + qk_norm : bool + If true, apply LayerNorm to q and k in atention. + dropout : float + dropout rate for attention layers. + target_dropout : float + a separate dropout applied to targets only (may be + useful to mitigate autorgressive prediction instability) + att_unit: int + Dimention of Transformer attention. + head : int + Number of heads in Transformer attention. + ar_layer : int + Number of layers in AR Transformer. + nar_layer : int + Number of layers in NAR Transformer. + n_ctx : int + maximum context length of AR & NAR Transformer. 
+ lm_head : torch.nn.Module, optional + an alternative LM head implementation head, an alternative + to the default Linear, useful for non-trivial codecs, + such as SQ-Codec + logits_to_probs : callable, optional + A module or a function that converts logits to token probabilities to + support top-K sampling + """ + + def __init__( + self, + vocab_size, + nq, + pad_id=0, + share_emb=True, + qk_norm=False, + dropout=0.0, + target_dropout=0.0, + att_unit=256, + head=2, + ar_layer=4, + nar_layer=4, + n_ctx=3000, + emb=None, + lm_head=None, + logits_to_probs=None, + ): + super().__init__() + if emb is None: + emb = torch.nn.Embedding(vocab_size, att_unit) + self.emb = emb + if lm_head is None: + lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) + self.lm_head = lm_head + spec = inspect.getfullargspec(lm_head.forward) + self.lm_head_multitrack = "track" in spec.args + if logits_to_probs is None: + logits_to_probs = nn.Identity() + self.logits_to_probs = logits_to_probs + if share_emb: + self.lm_head.weight = self.emb.weight + + self.ar_decoder = TransformerDecoder( + n_ctx=n_ctx, + n_state=att_unit, + n_head=head, + n_layer=ar_layer, + qk_norm=qk_norm, + dropout=dropout, + target_dropout=target_dropout, + ) + if nq > 1: + # NOTE: An NAR encoder is not needed if there is only one track + self.nar_decoder = ValleNARDecoder( + n_level=nq - 1, + n_ctx=n_ctx, + n_state=att_unit, + n_head=head, + n_layer=nar_layer, + qk_norm=qk_norm, + dropout=dropout, + ) + + self.nq = nq + self.n_ctx = n_ctx + self.pad_id = pad_id + self._initialize() + + def forward( + self, + dec_seq, + dec_seq_lengths=None, + prefix_len=None, + conti_feats=None, + nar_level_idx=1, + predict_ar=True, + predict_nar=True, + ): + """Vall-E forward for training + + Arguments + --------- + dec_seq : torch.Tensor + Batch of decoder sequences (B, T, nq). + dec_seq_lengths : torch.Tensor + Lengths of batched decoder sequences (B,). + enc_seq : torch.Tensor + Batch of encoder sequences (B, T, nq), keep + the interface, may not be used. + enc_seq_lengths : torch.Tensor + Lengths of batched encoder sequences (B,), + keep the interface, may not be used. + prefix_len : torch.Tensor + Lengths of condition part in dec_seq (B,). 
+ nar_level_idx : int + the index of the non-autoregressive level to train + predict_ar : bool + Whether to make an autoregressive prediction + predict_nar : bool + Whether to make a non-autoregressive prediction + + Returns + ------- + logits_ar : torch.Tensor + Autoregressive predictions + logits_nar : torch.Tensor + Non-autoregressive predictions + """ + + assert dec_seq.dim() == 3 + + dec_seq_emb = self.emb(dec_seq) # [B, T, nq, D] + dec_seq_emb, _ = install_continuous_features( + dec_seq_emb, None, conti_feats + ) + + # Auto-Regressive part + if predict_ar: + input_ar_emb = self.prepare_input(dec_seq_emb, prefix_len, 1)[ + :, :-1 + ] # [B, T, D] + h_ar = self.ar_decoder(input_ar_emb) + + # Non-Auto-Regressive part + if predict_nar: + input_nar_emb = self.prepare_input( + dec_seq_emb, prefix_len, nar_level_idx + )[ + :, 1: + ] # [B, T, V] + max_len = dec_seq.size(1) + mask = length_to_mask( + dec_seq_lengths * max_len - 1, max_len - 1 + ).bool() + mask = mask.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] + h_nar = self.nar_decoder( + input_nar_emb, nar_level_idx - 1, mask=mask + ) + + # Logits + logits_ar, logits_nar = None, None + if predict_ar: + logits_ar = self.apply_lm_head(h_ar, 0) + if predict_nar: + logits_nar = self.apply_lm_head(h_nar, nar_level_idx + 1) + + return logits_ar, logits_nar + + def prepare_input(self, dec_seq_emb, prefix_len, level): + """Prepares the input sequence by adding up + embeddings that are not masked + + Arguments + --------- + dec_seq_emb : torch.Tensor + The decoder sequence embedding + prefix_len : torch.Tensor + The prefix lengths + level : int | torch.Tensor + The level number or a level mask + + Returns + ------- + result : torch.Tensor + The combined embedding + """ + # NOTE(Jinchuan): have to use "expand" here but maybe lead to extra memory usage. + # This is because both prefix_mask and level_mask are broadcastable and will + # trigger user warning. + + # (1) level mask, [B, 1, nq, 1], True is to include + if isinstance(level, int): + level = torch.ones_like(dec_seq_emb[:, 0, 0, 0]) * level + level_mask = length_to_mask(level, self.nq).bool() + level_mask = ( + level_mask.unsqueeze(1).unsqueeze(3).expand(dec_seq_emb.size()) + ) + + # (2) prefix mask, [B, T, 1, 1], True is the prefix + prefix_mask = length_to_mask( + prefix_len * dec_seq_emb.size(1), dec_seq_emb.size(1) + ).bool() + prefix_mask = ( + prefix_mask.unsqueeze(2).unsqueeze(3).expand(dec_seq_emb.size()) + ) + + # (3) mask and then sum in nq-axis. 
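+        # Illustration: with nq=4 and level=2, the prefix region sums all
+        # four track embeddings (prefix_mask is True there), while the
+        # target region only sums tracks 0 and 1 - the VALL-E NAR scheme
+        # where each level is predicted from the levels below it.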
+ mask = torch.logical_or(level_mask, prefix_mask) + return dec_seq_emb.masked_fill(~mask, 0.0).sum(2) + + def _init_inference(self, prefix, opts, enc_seq, suffix): + # (1) initialization + cache = self.ar_decoder.init() + + # (2) auto-regressive prefix forward on first code layer + prefix = prefix.expand(opts.nbest, -1, -1) + if opts.search_algo == "teacher_force": + suffix = suffix.expand(opts.nbest, -1, -1) + prefix_emb = self.emb(prefix).sum(dim=2) # [B, T, D] + _ = self.ar_decoder(prefix_emb, kv_cache=cache) + + # (3) auto-regressive loop on first code layer + # (3.1) AR initialization + minlen = ( + int(prefix.size(1) * opts.minlenratio) + if opts.minlenratio > 0 + else 0 + ) + maxlen = int(prefix.size(1) * opts.maxlenratio) + if opts.search_algo == "teacher_force": + assert suffix is not None + minlen = suffix.size(1) + maxlen = suffix.size(1) + if maxlen + prefix.size(1) > self.n_ctx: + maxlen = self.n_ctx - prefix.size(1) + logging.info(f"maxlen={maxlen}, minlen={minlen}") + + generated = {"token": [], "score": []} + finish_idx = ( + torch.Tensor([-1]).expand(opts.nbest).long().to(opts.device) + ) + prev_tok = ( + torch.Tensor([opts.start]) + .tile(opts.nbest, 1) + .long() + .to(opts.device) + ) + modality_index = prev_tok.flatten() + mask = modality_index_to_mask(modality_index, opts) + tracks = prefix.size(-1) + is_flattened = opts.nq == 1 and tracks > 1 + if is_flattened: + prev_tok = prev_tok.expand(1, tracks) + mask_cache = [] + return ( + prefix_emb, + generated, + finish_idx, + cache, + modality_index, + mask, + mask_cache, + prev_tok, + minlen, + maxlen, + is_flattened, + ) + + @torch.inference_mode() + def inference( + self, prefix, opts, enc_seq=None, suffix=None, + ): + """Vall-E Inference. + + Arguments + --------- + prefix : torch.Tensor + Prefix part of dec_seq (B, T, nq). + opts : SpeechLMInferenceOptions + inference options. + enc_seq : torch.Tensor + Encoder token sequence (B, T, nq). + suffix : torch.Tensor + suffix part of dec_seq (B, T, nq), + usually the target sequence for teacher-forcing. 
+ + Returns + ------- + gen_tokens_list : list + Generated tokens + gen_scores_list : list + The scores associated with the generated tokens + """ + ( + prefix_emb, + generated, + finish_idx, + cache, + modality_index, + mask, + mask_cache, + prev_tok, + minlen, + maxlen, + is_flattened, + ) = self._init_inference(prefix, opts, enc_seq, suffix) + + modality_tokens = torch.tensor( + list(opts.masks.keys()), device=prefix.device + ) + + for step in range(maxlen): + # (3.2) AR loop + if is_flattened: + prev_tok = prev_tok.unsqueeze(1) + prev_emb = self.emb(prev_tok).squeeze(2) # [B, 1, D] + h_ar = self.ar_decoder(prev_emb, kv_cache=cache) + logits = self.logits_to_probs( + self.apply_lm_head(h_ar, 0) + ) # [B, 1, V] + if logits.dim() < 4: + logits = logits.unsqueeze(-2) + gen_tok, gen_score = logits_to_tokens( + logits, opts, mask, allow_eos=step >= minlen, nq_level=0, + ) + # [B, 1, 1] -> [B, 1] + gen_tok, gen_score = gen_tok.squeeze(1), gen_score.squeeze(1) + + generated["token"].append(gen_tok) + generated["score"].append(gen_score) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, step : step + 1, 0] + else: + prev_tok = gen_tok # [B, 1] + + # (3.3) detect modality swtich + mask_cache.append(mask.clone()) + modality_change_mask = torch.isin(prev_tok[:, 0], modality_tokens) + # Note: The ESPNET VALL-E had + # modality_change_mask = torch.logical_and( + # prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, + # ) + if torch.any(modality_change_mask): + modality_index = torch.where( + modality_change_mask, prev_tok[:, 0], modality_index, + ) + if is_flattened: + modality_index = modality_index.flatten().squeeze() + if modality_index.dim() == 0: + modality_index = modality_index.unsqueeze(0) + if modality_index.size(0) > 1: + modality_index = modality_index[0:1] + mask = modality_index_to_mask(modality_index, opts) + logging.warning( + f"Step {step}: change modality index {modality_index}" + ) + + # (3.4) detect ended hypotheses. + finish_idx = torch.where( + torch.logical_and(prev_tok[:, 0] == opts.eos, finish_idx == -1), + step, + finish_idx, + ) + + if torch.all(torch.ge(finish_idx, 0)): + break + + if step == maxlen - 1: + logging.warning( + f"Some examples cannot finish in {maxlen} steps: {finish_idx}" + f"Consider increasing the maxlenratio" + ) + + logging.info(f"Terminate at steps: {finish_idx.cpu().tolist()}") + + # (3.4) finalize auto-regressive + if opts.allow_invalid: + valid_idx = torch.arange(len(finish_idx), device=finish_idx.device) + finish_idx = torch.where(finish_idx == -1, step, finish_idx) + else: + valid_idx = finish_idx.ne(-1).nonzero(as_tuple=True)[0] + if len(valid_idx) == 0: + self.ar_decoder.reset() + logging.warning("No valid examples. 
Return None") + return [], [] + elif len(valid_idx) < prefix.size(0): + logging.info( + "Only %d of %d are valid", len(valid_idx), prefix.size(0) + ) + + finish_idx = finish_idx[valid_idx] + prefix_emb = prefix_emb[valid_idx] + if opts.search_algo == "teacher_force": + suffix = suffix[valid_idx] + gen_tokens_ar = torch.cat(generated["token"], dim=1)[ + valid_idx + ].unsqueeze( + 2 + ) # [B, T, 1] + gen_scores_ar = torch.cat(generated["score"], dim=1)[ + valid_idx + ].unsqueeze(2) + gen_tokens_ar = gen_tokens_ar[:, : finish_idx.max() + 1] # idx -> count + gen_scores_ar = gen_scores_ar[:, : finish_idx.max() + 1] + + self.ar_decoder.reset() + + # (4) non-auto-regressive loop on the remained code layers + if self.nq > 1: + gen_tokens, gen_scores = self._nar_inference( + opts, + gen_tokens_ar, + gen_scores_ar, + valid_idx, + prefix_emb, + prefix, + suffix, + finish_idx, + mask_cache, + ) + else: + gen_tokens = gen_tokens_ar + gen_scores = gen_scores_ar + + gen_tokens_list, gen_scores_list = [], [] + for b in range(len(valid_idx)): + item_finish_idx = finish_idx[b] + gen_tokens_list.append(gen_tokens[b][:item_finish_idx]) + gen_scores_list.append(gen_scores[b][:item_finish_idx]) + return gen_tokens_list, gen_scores_list + + def _nar_inference( + self, + opts, + gen_tokens_ar, + gen_scores_ar, + valid_idx, + prefix_emb, + prefix, + suffix, + finish_idx, + mask_cache, + ): + # (4.1) NAR initialization + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, 0] + else: + prev_tok = gen_tokens_ar[:, :, 0] + start_token = torch.tensor([opts.start], device=prefix.device)[ + None, None, : + ] + + start_emb = ( + self.emb(start_token).squeeze().tile(len(valid_idx), 1, 1) + ) # [B, 1, D] + prev_emb = torch.cat( + [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 + ) # [B, T, D] + + ones = torch.ones_like(valid_idx) + mask = length_to_mask(prefix.size(1) + finish_idx + 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) + generated = {"token": [], "score": []} + + mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache + vocab_mask = torch.cat(mask_cache, dim=1) + + # (4.2) NAR loop + for step in range(1, opts.nq): + h_nar = self.nar_decoder( + prev_emb, ones * step - 1, mask=mask + ) # [B, T, D] + + logits = self.apply_lm_head(h_nar, step) + logits = self.logits_to_probs(logits) + gen_tok, gen_score = logits_to_tokens( + logits.unsqueeze(2), + opts, + vocab_mask, + search_algo="greedy_search", + allow_eos=False, + nq_level=step, + ) + gen_tok, gen_score = ( + gen_tok.squeeze(2), + gen_score.squeeze(2), + ) # [B, T] + + generated["token"].append(gen_tok[:, prefix.size(1) :]) + generated["score"].append(gen_score[:, prefix.size(1) :]) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, step] + else: + prev_tok = generated["token"][-1] + prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] + prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb + + # (5) combine AR and NAR results + gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] + gen_scores_nar = torch.stack(generated["score"], dim=2) + + gen_tokens = torch.cat( + [gen_tokens_ar, gen_tokens_nar], dim=2 + ) # [B, T, nq] + gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + return gen_tokens, gen_scores + + def apply_lm_head(self, x, track): + """Applies the language model head + + Arguments + --------- + """ + + if self.lm_head_multitrack: + result = self.lm_head(x, track) + else: + result = self.lm_head(x) + return result + + def _initialize(self): + for m in 
self.modules(): + if isinstance(m, torch.nn.Linear): + torch.nn.init.normal_(m.weight, mean=0.0, std=0.02) + if m.bias is not None: + torch.nn.init.zeros_(m.bias) + elif isinstance(m, torch.nn.Embedding): + torch.nn.init.normal_(m.weight, mean=0.0, std=0.02) + + +class ResidualAttentionBlock(nn.Module): + """A VALL-E residual attention block + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of heads + cross_attention : bool + Whether to use cross-attention + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Whether to normalize queries and keys + dropout : float + The dropout probability + """ + + def __init__( + self, + n_state, + n_head, + cross_attention=False, + causal=False, + qk_norm=False, + dropout=0.0, + ): + super().__init__() + + self.attn = MultiHeadAttention( + n_state, n_head, causal=causal, qk_norm=qk_norm, dropout=dropout, + ) + self.attn_ln = LayerNorm(n_state) + self.attn_dropout = nn.Dropout(p=dropout) + + self.cross_attn = ( + MultiHeadAttention( + n_state, n_head, causal=False, qk_norm=qk_norm, dropout=dropout, + ) + if cross_attention + else None + ) + self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None + self.cross_attn_dropout = ( + nn.Dropout(p=dropout) if cross_attention else None + ) + + n_mlp = n_state * 4 + self.mlp = nn.Sequential( + Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state) + ) + self.mlp_ln = LayerNorm(n_state) + self.mlp_dropout = nn.Dropout(p=dropout) + + def forward( + self, x, xa=None, mask=None, kv_cache=None, + ): + """The forward pass implementation + + Arguments + --------- + x : torch.Tensor + the feature tensor + xa : torch.Tensor + The tensor for cross-attention + mask : torch.Tensor + The attention mask to be applied + + """ + x = x + self.attn_dropout( + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache) + ) + if self.cross_attn: + x = x + self.cross_attn_dropout( + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache) + ) + x = x + self.mlp_dropout(self.mlp(self.mlp_ln(x))) + return x + + +class TransformerDecoder(nn.Module): + """A custom transformer decoder implementation for VALL-E + + Arguments + --------- + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Whether to normalize queries and keys + dropout : float + The dropout probability + target_dropout : float + The target dropout probability + layer_class : type + The layer type to be used + """ + + def __init__( + self, + n_ctx, + n_state, + n_head, + n_layer, + causal=True, + qk_norm=False, + dropout=0.0, + target_dropout=0.0, + layer_class=ResidualAttentionBlock, + ): + + super().__init__() + + self.pos_emb = nn.Embedding(n_ctx, n_state) + + self.blocks = nn.ModuleList( + [ + layer_class( + n_state=n_state, + n_head=n_head, + cross_attention=False, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + ) + for _ in range(n_layer) + ] + ) + self.ln = LayerNorm(n_state) + self.target_dropout = nn.Dropout(target_dropout) + + self.causal = causal + self.kv_cache = None + + def forward( + self, x, mask=None, kv_cache=None, + ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + the feature tensor + mask : torch.Tensor + The attention mask to be applied + kv_cache : dict + The key/value cache (for inference) + + Returns + ------- + result : torch.Tensor + The decoder output + """ + if self.causal and mask is not None: + raise ValueError("Causal Transformer dones't allow mask") + + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = x + self.pos_emb.weight[offset : offset + x.shape[1]].unsqueeze(0) + tgt = self.target_dropout(x) + + for block in self.blocks: + x = block(x, tgt, mask=mask, kv_cache=kv_cache) + tgt = x + + x = self.ln(x) + return x + + def init(self): + """Initializes the key/value cache and the hooks to update it""" + self.kv_cache, self.hooks = install_kv_cache_hook(self, self.kv_cache) + return self.kv_cache + + def reset(self): + """Resets the key-value cache""" + for hook in self.hooks: + hook.remove() + self.kv_cache = None + + +class LayerNorm(nn.LayerNorm): + """A layer normalziation wrapper""" + + def forward(self, x): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The tensor to be normalized + + Returns + ------- + result : torch.Tensor + A normalzied tensor + """ + return super().forward(x.float()).type(x.dtype) + + +class Linear(nn.Linear): + """A linear layer wrapper that performs automatic + type conversions + """ + + def forward(self, x: Tensor) -> Tensor: + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The input data + + Returns + ------- + result : torch.Tensor + The result + """ + return F.linear( + x, + self.weight.to(x.dtype), + None if self.bias is None else self.bias.to(x.dtype), + ) + + +class ResidualAttentionBlockAdaLN(ResidualAttentionBlock): + """"The Vall-E Adaptive Residual Attention Block + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of states + n_head : int + The number of attention heads + cross_attention : bool + The number of attention heads + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + """ + + def __init__( + self, + n_state, + n_head, + cross_attention=False, + causal=False, + qk_norm=False, + dropout=0.0, + ): + super(ResidualAttentionBlockAdaLN, self).__init__( + n_state=n_state, + n_head=n_head, + cross_attention=cross_attention, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + ) + + self.attn_ln = AdaLN(n_state) + self.mlp_ln = AdaLN(n_state) + + def forward( + self, x, level, xa=None, mask=None, kv_cache=None, + ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + level : torch.Tensor + The level numbers for each batch element + xa : torch.Tensor + The sequence for cross attention + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ + x = x + self.attn_dropout( + self.attn(self.attn_ln(x, level), mask=mask, kv_cache=kv_cache) + ) + if self.cross_attn: + x = x + self.cross_attn_dropout( + self.cross_attn( + self.cross_attn_ln(x, level), xa, kv_cache=kv_cache + ) + ) + x = x + self.mlp_dropout(self.mlp(self.mlp_ln(x, level))) + return x + + +class ValleNARDecoder(TransformerDecoder): + """The VALL-E non-autoregressive decoder + + Arguments + --------- + n_level : int + The number of levels + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of attention heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + layer_class : type + The layer class to use + """ + + def __init__( + self, + n_level, + n_ctx, + n_state, + n_head, + n_layer, + causal=False, + qk_norm=False, + dropout=0.0, + layer_class=ResidualAttentionBlockAdaLN, + ): + super().__init__( + n_ctx=n_ctx, + n_state=n_state, + n_head=n_head, + n_layer=n_layer, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + layer_class=layer_class, + ) + + self.level_emb = nn.Embedding(n_level, n_state) + self.ln = AdaLN(n_state) + + def forward( + self, x, level, mask=None, kv_cache=None, + ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + level : torch.Tensor + The level numbers for each batch element + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ + if self.causal and mask is not None: + raise ValueError("mask is not allowed when causal") + + level = self.level_emb(level) + + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = x + self.pos_emb.weight[offset : offset + x.shape[1]].unsqueeze(0) + + for block in self.blocks: + x = block(x, level=level, mask=mask, kv_cache=kv_cache) + + x = self.ln(x, level) + return x + + +class MultiHeadAttention(nn.Module): + """A Multi-Head Attention implementation + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of attention heads + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + """ + + def __init__( + self, n_state, n_head, causal=False, qk_norm=False, dropout=0.0, + ): + super().__init__() + assert n_state % n_head == 0 + self.n_head = n_head + self.query = Linear(n_state, n_state) + self.key = Linear(n_state, n_state, bias=False) + self.value = Linear(n_state, n_state) + self.out = Linear(n_state, n_state) + self.causal = causal + self.dropout = dropout + + self.qk_norm = qk_norm + if qk_norm: + self.q_norm = LayerNorm(n_state // n_head) + self.k_norm = LayerNorm(n_state // n_head) + + def forward( + self, x, xa=None, mask=None, kv_cache=None, + ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + xa : torch.Tensor + The sequence for cross attention + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ + q = self.query(x) + + if kv_cache is None or xa is None or self.key not in kv_cache: + # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors; + # otherwise, perform key/value projections for self- or cross-attention as usual. + k = self.key(x if xa is None else xa) + v = self.value(x if xa is None else xa) + else: + # for cross-attention, calculate keys and values once and reuse in subsequent calls. + k = kv_cache[self.key] + v = kv_cache[self.value] + + wv = self.qkv_attention(q, k, v, mask) + + return self.out(wv) + + def qkv_attention(self, q, k, v, mask=None): + """Computes self-attention + + Arguments + --------- + q : torch.Tensor + The queries tensor + k : torch.Tensor + The keys tensor + v : torch.Tensor + The values tensor + + Returns + ------- + wv : torch.Tensor + The attention output + """ + if self.causal and mask is not None: + raise ValueError("mask is not allowed when the attention is causal") + + if self.causal and q.size(1) == k.size(1): + causal = True + else: + causal = False + + q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + + if self.qk_norm: + q = self.q_norm(q) + k = self.k_norm(k) + wv = ( + F.scaled_dot_product_attention( + q, k, v, mask, is_causal=causal, dropout_p=self.dropout + ) + .permute(0, 2, 1, 3) + .flatten(start_dim=2) + ) + + return wv + + +class AdaLN(nn.Module): + """Adaptive Layer Normalization, a Layer Norm implementation + that learns an affine transformation based on the level + embedding + + Arguemnts + --------- + n_state : int + The number of states + eps : float + The layer norm epsilon parameter""" + + def __init__(self, n_state, eps=1e-5): + super().__init__() + self.weight = nn.Linear(n_state, n_state, bias=False) + self.bias = nn.Linear(n_state, n_state, bias=False) + nn.init.constant_(self.weight.weight, 1.0) + nn.init.constant_(self.bias.weight, 0.0) + + self.n_state = n_state + self.eps = eps + + def forward(self, x, level_emb): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The tensor + level_emb : torch.Tensor + The level embedding + """ + w = self.weight(level_emb).unsqueeze(1) + b = self.bias(level_emb).unsqueeze(1) + x = nn.functional.layer_norm(x, (self.n_state,), eps=self.eps) + x = w * x + b + return x + + +def install_kv_cache_hook(model, cache): + """Sets up the key/value cache hook + + Arguments + --------- + model : torch.nn.Module + The model + cache : dict + The cache 
content + + Returns + ------- + cache : dict + The cache dictionary (new or copied) + hooks : list + The installed hooks + """ + cache = {**cache} if cache is not None else {} + hooks = [] + + def save_to_cache(module, _, output): + """Saves the output in the module cache + + Arguments + --------- + module : torch.nn.Module + A module instance + output : torch.Tensor + The module output + + Returns + ------- + result : torch.Tensor + Concatenated outputs + """ + if module not in cache: + # save as-is, for the first token or cross attention + cache[module] = output + else: + cache[module] = torch.cat([cache[module], output], dim=1).detach() + return cache[module] + + def install_hooks(layer): + """Installs the forward hooks on the attention key/value projections + + Arguments + --------- + layer : torch.nn.Module + A layer instance + """ + if isinstance(layer, MultiHeadAttention): + hooks.append(layer.key.register_forward_hook(save_to_cache)) + hooks.append(layer.value.register_forward_hook(save_to_cache)) + + model.apply(install_hooks) + return cache, hooks + + +def logits_to_tokens( + logits, opts, mask, search_algo=None, allow_eos=True, nq_level=None, +): + """ + Select the generated tokens and their scores based on logits prediction. + + Arguments + --------- + logits : torch.Tensor + predicted logits, of size [B, T, nq, V] + opts : SpeechLMInferenceOptions + search options + mask : torch.Tensor + mask to specify valid tokens, of size [B, 1, nq, V] + search_algo : str + search algorithm + allow_eos : bool + whether to allow end-of-sentence prediction + nq_level : int, optional + if not None, only compute the specified codec level nq. + + Returns + ------- + gen_token_idx : torch.Tensor + The token indexes + gen_token_score : torch.Tensor + The token scores + """ + + assert logits.dim() == 4 + search_algo = search_algo if search_algo is not None else opts.search_algo + neg_inf = torch.finfo(logits.dtype).min + + # (1) Apply mask + if nq_level is not None: + mask = mask[:, :, nq_level : nq_level + 1] + + if allow_eos: + mask = mask.clone() + mask[:, :, 0, opts.eos] = False + + logits.masked_fill_(mask, neg_inf) + + # (2) token selection + if search_algo in ["topk_sampling"]: + topk_values, topk_indices = torch.topk(logits, opts.top_k, dim=-1) + probs = torch.softmax(topk_values / opts.sampling_temperature, dim=-1) + inner_indices = torch.multinomial( + probs.flatten(end_dim=-2), num_samples=1 + ).view(probs[..., :1].size()) + gen_token_idx = torch.gather(topk_indices, -1, inner_indices).squeeze( + -1 + ) + gen_token_score = ( + torch.gather(probs, -1, inner_indices).squeeze(-1).log() + ) + + elif search_algo in ["topp_sampling"]: + probs = torch.softmax(logits / opts.sampling_temperature, dim=-1) + sorted_probs, sorted_indices = torch.sort(probs, descending=True) + accum_probs = torch.cumsum(sorted_probs, dim=-1) + clip_probs = torch.where(accum_probs <= opts.top_p, sorted_probs, 0.0) + # always keep at least one candidate no matter what value it is + if torch.any(clip_probs[..., 0] == 0.0): + clip_probs[..., 0] = sorted_probs[..., 0] + clip_probs = clip_probs / clip_probs.sum(dim=-1, keepdim=True) + inner_indices = torch.multinomial( + clip_probs.flatten(end_dim=-2), num_samples=1 + ).view(clip_probs[..., :1].size()) + gen_token_idx = torch.gather(sorted_indices, -1, inner_indices).squeeze( + -1 + ) + gen_token_score = ( + torch.gather(clip_probs, -1, inner_indices).squeeze(-1).log() + ) + + elif search_algo in ["greedy_search", "teacher_force"]: + probs = logits.softmax(dim=-1) + topk_values, topk_indices =
torch.topk(logits, 1, dim=-1) + gen_token_idx = topk_indices[:, :, :, 0] + gen_token_score = topk_values[:, :, :, 0].log() + + else: + raise NotImplementedError(f"opts.search_algo={opts.search_algo}") + + return gen_token_idx, gen_token_score + + +@torch.no_grad() +def install_continuous_features( + dec_emb: torch.Tensor, + enc_emb: Optional[torch.Tensor] = None, + conti_feats: Tuple = None, +): + if conti_feats is None: + return dec_emb, enc_emb + + assert dec_emb.size(0) == len(conti_feats) + if enc_emb is not None: + assert enc_emb.size(0) == len(conti_feats) + + for b, conti_feat in enumerate(conti_feats): + for conti_emb, start, end, part in conti_feat: + if part == "dec": + assert conti_emb.size(1) == dec_emb.size(2) + dec_emb[b, start:end] = conti_emb + else: + assert conti_emb.size(1) == enc_emb.size(2) + enc_emb[b, start:end] = conti_emb + + return dec_emb, enc_emb + + +def modality_index_to_mask( + modality_index, inference_opts, +): + """Converts a modality index to a mask + + Arguments + --------- + modality_index : int + The modality index + inference_opts : SpeechLMInferenceOptions + The inference options + + Returns + ------- + result : torch.Tensor + The result + """ + assert modality_index.dim() == 1 + modality_index = modality_index.cpu().tolist() + mask = torch.stack( + [inference_opts.masks[idx] for idx in modality_index], dim=0 + ).unsqueeze( + 1 + ) # [B, 1, nq, V] + + return mask + + +def masked_nll_loss( + log_probabilities, targets, mask, allowed_len_diff=3, reduction="mean" +): + """Similar to the standard nll_loss from SpeechBrain + but applies a custom mask + + Arguments + --------- + log_probabilities : torch.Tensor + The probabilities after log has been applied. + Format is [batch, log_p] or [batch, frames, log_p]. + targets : torch.Tensor + The targets, of shape [batch] or [batch, frames]. + mask : torch.Tensor + The mask for loss calculation + allowed_len_diff : int + Length difference that will be tolerated before raising an exception. + reduction : str + Options are 'mean', 'batch', 'batchmean', 'sum'. + See pytorch for 'mean', 'sum'. The 'batch' option returns + one loss per item in the batch, 'batchmean' returns sum / batch size. 
+ """ + log_probabilities, targets = truncate( + log_probabilities, targets, allowed_len_diff + ) + log_probabilities = log_probabilities.transpose(1, -1) + loss = torch.nn.functional.nll_loss( + input=log_probabilities, target=targets.long(), reduction="none" + ) + loss *= mask + loss = reduce_loss(loss, mask, reduction, 0.0, log_probabilities, targets) + return loss + + +class SampleSelector: + """A base class for sample selectors""" + + def select(self, tokens, scores, text): + """Performs selection + + Arguments + --------- + tokens : list + The generated tokens + + scores : list + The scores + + text : str + The label for the sample + """ + raise NotImplementedError() + + +class DefaultSampleSelector(SampleSelector): + """A default no-op sample selector that simply selects the + first sample (useful only when nbest=1)""" + + def __init__(self, **kwargs): + pass + + def select(self, tokens, scores, text): + """Performs selection + + Arguments + --------- + tokens : list + The generated tokens + + scores : list + The scores + + text : str + The label for the sample + """ + return tokens[0] + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +class WhisperASRSampleSelector(SampleSelector): + """A selector implemented using Whisper + + Arguments + --------- + tokenizer: BaseTokenizer + A tokenizer interface + source : str + The source for the Whisper model + savedir : str + The path where the Whisper model will be saved + model : Whisper + Alternatively, a pre-initialized Whisper model instance + sample_rate : int + The sample rate of the underlying Whisper model + tokenizer_sample_rate : int + The sample rate of the tokenizer provided + min_decode_ratio : float + The minimum decode ratio for ASR + max_decode_ratio : float + The maximum decode ratio for ASR + language : str + The ASR language + debug : bool + Whether debug mode is enabled. 
This will trigger + more verbose logging, including a WER report + token_model_kwargs : dict + Additional arguments for the tokenizer + decoding function + device : str | torch.Device + The target device + """ + + def __init__( + self, + tokenizer, + source=None, + savedir=None, + model=None, + sample_rate=16000, + tokenizer_sample_rate=16000, + min_decode_ratio=0.0, + max_decode_ratio=1.0, + language="english", + token_shift=0, + offsets=None, + debug=False, + token_model_kwargs=None, + device="cuda", + ): + self.tokenizer = tokenizer + self.sample_rate = sample_rate + self.tokenizer_sample_rate = tokenizer_sample_rate + # TODO: Pass the device + if model is not None: + self.model = model + else: + self.model = Whisper( + source, savedir, sample_rate, freeze=True, freeze_encoder=True, + ).to(device) + self.model.device = device + self.model.tokenizer.set_prefix_tokens(language, "transcribe", False) + self.searcher = S2SWhisperGreedySearcher( + self.model, + min_decode_ratio=min_decode_ratio, + max_decode_ratio=max_decode_ratio, + ) + self.token_shift = token_shift + self.offsets = offsets + self.debug = debug + if token_model_kwargs is None: + token_model_kwargs = {} + self.token_model_kwargs = token_model_kwargs + tokenizer.device = device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(device) + tokenizer.codec_vocoder.device = device + + def select(self, tokens, scores, text): + """Performs selection + + Arguments + --------- + tokens : list + The generated tokens + + scores : list + The scores + + text : str + The label for the sample + """ + tokens, length = batch_pad_right(tokens) + tokens_shift = tokens - self.token_shift + if self.offsets is not None: + tokens_shift = tokens_shift - self.offsets + tokens_shift = tokens_shift.clip(0) + wav = self.tokenizer.tokens_to_sig( + tokens_shift, **self.token_model_kwargs + ) + if self.sample_rate != self.tokenizer_sample_rate: + wav = torchaudio.functional.resample( + wav, + orig_freq=self.tokenizer_sample_rate, + new_freq=self.sample_rate, + ) + wav = undo_padding_tensor(wav, length) + metric = ErrorRateStats() + text = text.split(" ") + ids = range(len(wav)) + preds = [self.predict(wav_item).split(" ") for wav_item in wav] + metric.append(ids, preds, [text] * len(wav)) + sample_scores = [score["WER"] for score in metric.scores] + idx = torch.argmin(torch.tensor(sample_scores)).item() + logger.info( + "Ground truth text: %s, sample scores: %s, best: #%d", + text, + sample_scores, + idx, + ) + if self.debug: + sio = StringIO() + metric.write_stats(sio) + logger.info("%s", sio.getvalue()) + return tokens[idx] + + def predict(self, wav): + """Makes an ASR prediction + + Arguments + --------- + wav : torch.Tensor + A raw waveform + + Returns + ------- + text : str + The text predicted by the ASR + """ + if wav.dim() < 2: + wav = wav.unsqueeze(0) + wav = self.model.pad_or_trim(wav) + mels = self.model.log_mel_spectrogram(wav) + enc_out = self.model.forward_encoder(mels) + pred, _, _, _ = self.searcher( + enc_out.detach(), torch.tensor(1.0, device=wav.device) + ) + pred = self.model.tokenizer.batch_decode( + pred, skip_special_tokens=True + )[0] + pred = self.normalize(pred) + return pred + + def normalize(self, text): + """Performs text normalization (uppercase, remove whitespace, + remove punctuation) + + Arguments + --------- + text : str + Unnormalized text + + Returns + ------- + text : str + Normalized text + """ + text = text.upper() + text = text.strip() + text = RE_PUNCTUATION.sub("", text) + return text diff 
--git a/benchmarks/DASB/orion/hparams_tpe.yaml b/benchmarks/DASB/orion/hparams_tpe.yaml new file mode 100644 index 000000000..fb6a7c9b0 --- /dev/null +++ b/benchmarks/DASB/orion/hparams_tpe.yaml @@ -0,0 +1,6 @@ +experiment: + algorithms: + tpe: + seed: 1986 + n_initial_points: 20 + n_ei_candidates: 24 diff --git a/benchmarks/DASB/run_discriminative_benchmark.sh b/benchmarks/DASB/run_discriminative_benchmark.sh deleted file mode 100644 index 79383deb2..000000000 --- a/benchmarks/DASB/run_discriminative_benchmark.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# Please consult the README.md file for instructions on how to run the benchmark. - -tokenizer_name=$1 -if [[ "$tokenizer_name" == "" ]]; then - echo "Usage: run_generative_benchmark.sh " - exit 1 -fi - -output_folder='/path/to/output' -declare -a DatasetsFolders=('path/to/LibriSpeech' 'path/to/CommonVoice' 'path/to/IEMOCAP' 'path/to/SLURP' 'path/to/Google-speech-commands' 'path/to/VoiceCeleb1') -declare -a ConsideredTasks=('LibriSpeech/ASR' 'CommonVoice/ASR' 'IEMOCAP/emotion_recognition' 'SLURP/intent_classification' 'Google-speech-commands/keyword-spotting' 'VoiceCeleb1/speaker_ver') -declare -a DownStreams=('LSTM' 'LSTM' 'ecapa_tdnn' 'LSTM_linear' 'Xvector','Xvector') -declare -a Locales=('cy' 'eu') -declare -a LocalesVobSize=(100 200) - -shift -script_args="$@" - -for i in "${!ConsideredTasks[@]}"; do - task=${ConsideredTasks[i]} - downstream=${DownStreams[i]} - dataset_folder=${DatasetsFolders[i]} - recipe_extra_args="$script_args" - set -- "$recipe_extra_args" - if [[ "$task" == "CommonVoice/ASR" ]]; then - echo "${tokenizer_name}/${task}/${downstream}" - for j in "${!Locales[@]}"; do - locale=${Locales[j]} - vocab=${LocalesVobSize[j]} - python $task/$downstream/train_$tokenizer_name.py $task/$downstream/hparams/train_$tokenizer_name.yaml --output_folder $output_folder/$tokenizer_name/$task/$downstream/$locale --data_folder $dataset_folder/$locale --language $locale --output_neurons $vocab $@ - done - else - python $task/$downstream/train_$tokenizer_name.py $task/$downstream/hparams/train_$tokenizer_name.yaml --output_folder $output_folder/$tokenizer_name/$task/$downstream --data_folder $dataset_folder $@ - fi -done diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh new file mode 100755 index 000000000..5dcd6b397 --- /dev/null +++ b/benchmarks/DASB/run_experiments.sh @@ -0,0 +1,209 @@ +#!/bin/bash + +########################################################### +# Script to run downstream evaluation training, optionally with multiple seeds. +# This script loops over seeds and trains different models. +# At the end, the final performance is computed with the aggregate_results.py script that provides the average performance. 
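In short, the script trains the same recipe once per seed, writes each run to its own `run<idx>/<seed>` subfolder, and then averages the runs with `utils/aggregate_results.py`. A minimal Python sketch of that control flow is shown below; the `train.py` command-line flags are assumptions mirroring the shell invocation further down and are only illustrative.

```python
# Illustrative sketch of the seed loop implemented by run_experiments.sh.
# The exact CLI flags of train.py are assumptions; see the script itself.
import subprocess
from pathlib import Path


def run_seeds(dataset, task, hparams, data_folder, output_folder, seed, nruns):
    for i in range(nruns):
        run_dir = Path(output_folder) / f"run{i + 1}" / str(seed)
        run_dir.mkdir(parents=True, exist_ok=True)
        subprocess.run(
            [
                "python", f"{dataset}/{task}/train.py", hparams,
                f"--seed={seed}",
                f"--data_folder={data_folder}",
                f"--output_folder={run_dir}",
            ],
            check=True,
        )
        seed += 1  # a different seed for every run, as in the shell loop
```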
+# +# Usage: +# ./run_experiments.sh --hparams benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml --data_folder LibriSpeech --cached_data_folder cache/ \ +# --output_folder results/LibriSpeech/ASR/encodec/LSTM --task ASR --dataset LibriSpeech --seed 1986 --nruns 2 --eval_metric WER --tokens_folder LibriSpeech/extraction-emb/speech_tokenizer/save/librispeech/ + +# +# Authors: +# - Pooneh Mousavi (2024) +########################################################### + +# Initialize variables +hparams="" +data_folder="" +cached_data_folder="" +output_folder="" +task="" +dataset="" +seed="" +nruns="" +eval_metric="acc" +eval_set="test" +rnd_dir=False +additional_flags="" + + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --hparams hparams_path Hparam YAML file" + echo " --data_folder data_folder_path Data folder path" + echo " --cached_data_folder cache_path Cached data folder path" + echo " --output_folder output_path Output folder path" + echo " --task task downstream task" + echo " --dataset dataset dataset" + echo " --seed random_seed Seed (random if not specified)" + echo " --nruns num_runs Number of runs" + echo " --eval_metric metric Evaluation metric (e.g., acc or WER)" + echo " --eval_set dev or test Evaluation set. Default: test" + echo " --rnd_dir If True the results are stored in a subdir of the output folder with a random name (useful to store all the results of an hparam tuning). Default: False" + exit 1 +} + + +# Parse command line +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --hparams) + hparams="$2" + shift + shift + ;; + + --data_folder) + data_folder="$2" + shift + shift + ;; + + --cached_data_folder) + cached_data_folder="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --task) + task="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + + --seed) + seed="$2" + shift + shift + ;; + + --nruns) + nruns="$2" + shift + shift + ;; + + --eval_metric) + eval_metric="$2" + shift + shift + ;; + + --eval_set) + eval_set="$2" + shift + shift + ;; + + --rnd_dir) + rnd_dir="$2" + shift + shift + ;; + + + --help) + print_argument_descriptions + ;; + + -*|--*) + additional_flags+="$1 $2 " # store additional flags + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$hparams" ] ||[ -z "$data_folder" ] || [ -z "$output_folder" ] || [ -z "$nruns" ]; then + echo "ERROR: Missing required arguments! Please provide all required options." + print_argument_descriptions +fi + +# Manage Seed (optional argument) +seed="${seed:-$RANDOM}" + + +if [ "$rnd_dir" = True ]; then + if [[ ! 
-z "$ORION_TRIAL_ID" ]]; then + # Use the Orion Trial ID to ensure interrupted trials are resumed + output_folder="$output_folder/$ORION_TRIAL_ID" + else + rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) + output_folder="$output_folder/$rnd_dirname" + fi +fi + +# Make sure the output_folder is created +mkdir -p $output_folder + +# Print command line arguments and save to file +{ + echo "hparams: $hparams" + echo "data_folder: $data_folder" + echo "cached_data_folder: $cached_data_folder" + echo "output_folder: $output_folder" + echo "task: $task" + echo "dataset: $dataset" + echo "seed: $seed" + echo "nruns: $nruns" + echo "eval_metric: $eval_metric" + echo "eval_set: $eval_set" + echo "rnd_dir: $rnd_dir" + echo "additional flags: $additional_flags" +} | tee "$output_folder/flags.txt" + + +# Creating output folder +mkdir -p $output_folder +mkdir -p $data_folder +mkdir -p $cached_data_folder + +# Function to run the training experiment +run_experiment() { + +eval python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ +$additional_flags + +} + +# Run multiple training experiments (with different seeds) +for i in $(seq 0 1 $(( nruns - 1 ))); do + ((run_idx = i + 1)) + run_name=run"$run_idx" + output_folder_exp="$output_folder"/"$run_name"/$seed + + run_experiment $output_folder_exp + + + # Changing Random seed + seed=$((seed+1)) +done + + +echo 'Final Results (Performance Aggregation)' +python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt diff --git a/benchmarks/DASB/run_extraction.sh b/benchmarks/DASB/run_extraction.sh new file mode 100644 index 000000000..92cc81381 --- /dev/null +++ b/benchmarks/DASB/run_extraction.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +########################################################### +# Script to extract and save tokens from a dataset. +# +# Usage: +# ./run_extraction.sh --data_folder LibriSpeech --output_folder results/LibriSpeech/ASR/encodec/LSTM --tokenizer encodec --dataset LibriSpeech + +# Authors: +# - Pooneh Mousavi (2024) +########################################################### + +# Initialize variables +data_folder="" +output_folder="" +tokenizer="" +dataset="" +save_embedding=False +additional_flags="" + + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --data_folder data_folder_path Data folder path" + echo " --output_folder output_path Output folder path" + echo " --tokenizer tokenizer tokenizer" + echo " --dataset dataset dataset" + echo " --save_embedding save_embedding If True, the embeddings are saved.
Default: False" + exit 1 +} + + +# Parse command line +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --data_folder) + data_folder="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --tokenizer) + tokenizer="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + + --save_embedding) + save_embedding="$2" + shift + shift + ;; + + --help) + print_argument_descriptions + ;; + + -*|--*) + additional_flags+="$1 $2 " # store additional flags + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$tokenizer" ] ||[ -z "$data_folder" ] || [ -z "$output_folder" ] || [ -z "$dataset" ]; then + echo "ERROR: Missing required arguments! Please provide all required options." + print_argument_descriptions +fi + + +# Make sure the output_folder is created +mkdir -p $output_folder + +# Print command line arguments and save to file +{ + echo "data_folder: $data_folder" + echo "output_folder: $output_folder" + echo "tokenizer: $tokenizer" + echo "dataset: $dataset" + echo "save_embedding: $save_embedding" + echo "additional flags: $additional_flags" +} | tee "$output_folder/flags.txt" + + +# Creating output folder +mkdir -p $output_folder +mkdir -p $data_folder + +python $dataset/extraction/extract.py $dataset/extraction/hparams/$tokenizer.yaml --data_folder=$data_folder --output_folder=$output_folder --save_embedding=$save_embedding \ +$additional_flags diff --git a/benchmarks/DASB/run_generative_benchmark.sh b/benchmarks/DASB/run_generative_benchmark.sh deleted file mode 100644 index d5dc0d1d4..000000000 --- a/benchmarks/DASB/run_generative_benchmark.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# Please consult the README.md file for instructions on how to run the benchmark. 
- -tokenizer_name=$1 -if [[ "$tokenizer_name" == "" ]]; then - echo "Usage: run_generative_benchmark.sh " - exit 1 -fi - -output_folder='path/to/output' -librimix_path='path/to/Libri2Mix' -voicebank_path='path/to/VoiceBank' -ljspeech_path='path/to/ljspeech' -utmos_path='path/to/utmos' -tts_args="--token_list_file_text %recipe_root%/hparams/char_en.txt --utmos_model_path $utmos_path" - -declare -a DatasetsFolders=(\ - "$librimix_path" \ - "$voicebank_path" \ - "$ljspeech_path" \ - "$ljspeech_path" \ -) -declare -a ConsideredTasks=(\ - 'Libri2Mix/separation' \ - 'VoiceBank/enhancement' \ - 'LJSpeech/TTS' \ - 'LJSpeech/TTS' \ -) -declare -a DownStreams=(\ - 'conformer' \ - 'conformer' \ - 'tokotron' \ - 'tokotron' \ -) -declare -a ExtraArgs=(\ - '' \ - '' \ - "$tts_args" \ - "$tts_args --enc_num_layers 3 --dec_num_layers 6" \ -) - -declare -a OutputSuffix=(\ - '' \ - '' \ - '' \ - '-small' -) - -shift -script_args="$@" - -for i in "${!ConsideredTasks[@]}"; do - task=${ConsideredTasks[i]} - downstream=${DownStreams[i]} - dataset_folder=${DatasetsFolders[i]} - extra_args=${ExtraArgs[i]} - suffix=${OutputSuffix[i]} - recipe_root="$task/$downstream" - recipe_extra_args="$script_args ${extra_args//%recipe_root%/$recipe_root}" - set -- "$recipe_extra_args" - echo "${tokenizer_name}/${task}/${downstream}" - python $task/$downstream/train_$tokenizer_name.py \ - $task/$downstream/hparams/train_$tokenizer_name.yaml \ - --output_folder $output_folder/$tokenizer_name/$task/$downstream$suffix \ - --data_folder $dataset_folder \ - $@ -done diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh new file mode 100755 index 000000000..554ed10f0 --- /dev/null +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -0,0 +1,442 @@ +#!/bin/bash + +########################################################### +# Hyperparameter Tuning Script for DASB Downstream Models with Orion +########################################################### + +# Description: +# This script facilitates hyperparameter tuning for a given audio tokenizer, downstream model and dataset using Orion. + +# Usage: +# ./run_hparam_optimization.sh --exp_name 'ASR-encodec-LSTM_hopt' \ + # --hparams LibriSpeech/ASR/hparams/LSTM/train.yaml \ + # --data_folder path/LibriSpeech \ + # --cached_data_folder path/cache/ \ + # --output_folder results/LibriSpeech/ASR/encodec/LSTM \ + # --task ASR \ + # --dataset LibriSpeech \ + # --seed 1986 \ + # --nruns 1 \ + # --nruns_eval 5 \ + # --eval_metric WER \ + # --exp_max_trials 50 \ + # --tokens_folder results/LibriSpeech/extraction-emb/encodec/save/librispeech/ \ + # --run_name encodec
# Optimization Steps: +# The script supports multiple hyperparameter optimization steps. + +# Script Workflow: +# 1. Search for the orion flags in the specified hparam file. +# 2. Run the orion-hunt command for hyperparameter tuning. +# By default, TPE (Tree-structured Parzen Estimator) hyperparameter tuning is +# performed, as specified in the default orion config file at hparams/orion/hparams_tpe.yaml. +# 3. Save the best hyperparameters, which can be viewed using orion info. +# 4. Repeat steps 1-3 for as long as flags like @orion_step are found in the YAML file. +# +# Final Performance Evaluation: +# At the end of the optimization process, the script computes the final performance +# using the best hyperparameters on the test set. +# This is done by averaging over nruns_eval different seeds. +# +# Note: More detailed information can be found in the README.md file.
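The mechanism that drives the whole loop is a naming convention: search spaces are declared as in-line comments of the form `# @orion_step<N>: --flag~"distribution(...)"` inside the hparams YAML, and the script greps them out step by step. A rough Python equivalent of that extraction is sketched below; the YAML line shown in the comment is hypothetical, and the real script does this with grep/sed in `get_flag`.

```python
# Sketch: pull Orion search-space flags out of a hparams YAML file.
# Mirrors the grep/sed logic of get_flag() in run_hparam_optimization.sh.
import re


def get_orion_flags(yaml_path, step_id=1):
    pattern = re.compile(rf"@orion_step{step_id}:\s*(.*)")
    flags = []
    with open(yaml_path) as f:
        for line in f:
            match = pattern.search(line)
            if match:
                flags.append(match.group(1).strip())
    return " ".join(flags)


# Example (hypothetical YAML content):
#   dropout: 0.1  # @orion_step1: --dropout~"uniform(0.0, 0.5)"
# get_orion_flags("train.yaml") would then return: --dropout~"uniform(0.0, 0.5)"
```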
+ +# Authors: +# - Pooneh Mousavi 2024 +########################################################### + +# Initialize variables +exp_name="hopt" +hparams="" +data_folder="" +cached_data_folder="" +output_folder="" +task="" +dataset="" +seed=1986 +nruns="" +nruns_eval=10 +eval_metric="acc" +config_file="orion/hparams_tpe.yaml" +mne_dir="" +orion_db_address="" +orion_db_type="PickledDB" +exp_max_trials=50 +store_all=True +compress_exp=True +hparam_filter="" + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --exp_name Name Name that Orion gives to the experiment" + echo " --hparams hparam_file YAML file containing the hparams to optimize. The hyperparameters decorated with @orion_step1 or @orion_step2 in the YAML file will be used" + echo " --data_folder data_path Folder where the data are stored. If not available, they will be downloaded there." + echo " --cached_data_folder path [Optional] Folder where the data in pkl format will be cached." + echo " --output_folder output_path Output folder where the results will be stored" + echo " --task task downstream task" + echo " --dataset dataset dataset" + echo " --seed random_seed [Optional] Seed (random if not specified)" + echo " --nruns num_runs Number of runs for each hparam selection." + echo " --nruns_eval num_runs Number of runs for the final evaluation (with best hparams) on the test set" + echo " --eval_metric metric [Optional] Evaluation metric description. Default: acc" + echo " --config_file config_file [Optional] Orion config file. Default: hparams/orion/hparams_tpe.yaml" + echo " --mne_dir mne_dir [Optional] MNE directory. It needs to be different from your home directory (see notes on MNE in README.md)" + echo " --orion_db_address [Optional] Path of the database where orion will store hparams and performance" + echo " --orion_db_type db_type [Optional] Type of the database that orion will use. Default: PickledDB" + echo " --exp_max_trials int [Optional] Maximum number of hparam trials for each optimization step. Default: 50" + echo " --store_all Bool [Optional] When set to True, the output folders of all hparam trials will be stored in randomly named folders. Default: False" + echo " --compress_exp Bool [Optional] When set to True, this option compresses the output folders of all hyperparameter trials into a single tar.gz file. This is particularly useful when store_all is set to True, as it helps prevent the accumulation of a large number of files.
Default: False" + exit 1 +} + +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + + --exp_name) + exp_name="$2" + shift + shift + ;; + + --hparams) + hparams="$2" + shift + shift + ;; + + --data_folder) + data_folder="$2" + shift + shift + ;; + + --cached_data_folder) + cached_data_folder="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --task) + task="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + + --seed) + seed="$2" + shift + shift + ;; + + --nruns) + nruns="$2" + shift + shift + ;; + + --nruns_eval) + nruns_eval="$2" + shift + shift + ;; + + --eval_metric) + eval_metric="$2" + shift + shift + ;; + + --config_file) + config_file="$2" + shift + shift + ;; + + --mne_dir) + mne_dir="$2" + shift + shift + ;; + + --orion_db_address) + orion_db_address="$2" + shift + shift + ;; + + --orion_db_type) + orion_db_type="$2" + shift + shift + ;; + + --exp_max_trials) + exp_max_trials="$2" + shift + shift + ;; + + --store_all) + store_all="$2" + shift + shift + ;; + + --compress_exp) + compress_exp="$2" + shift + shift + ;; + + --hparam_filter) + hparam_filter="$2" + shift + shift + ;; + + --help) + print_argument_descriptions + ;; + + -*|--*) + name=$1 + value=$2 + if [[ "$name" =~ ^--eval_run_ ]]; then + name=$(echo $name | sed s/^--eval_run_/--/) + eval_run_additional_flags+="$name $value " + else + if [[ ! "$eval_run_additional_flags" =~ "$name " ]]; then + eval_run_additional_flags+="$name $value " + fi + additional_flags+="$name $value " # store additional flags + fi + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$output_folder" ] || [ -z "$data_folder" ] || [ -z "$hparams" ] || [ -z "$nruns" ]; then + echo "ERROR: Missing required arguments! Please provide all required options." + print_argument_descriptions +fi + +# Set mne_dir if specified +if [ "$mne_dir" ]; then + export _MNE_FAKE_HOME_DIR=$mne_dir +fi + +# Assign default value to cached_data_folder +if [ -z "$cached_data_folder" ]; then + cached_data_folder="$data_folder/cache" +fi + + +# Set orion db address if specified +if [ -z "$orion_db_address" ]; then + orion_db_address=$output_folder'/'$exp_name'.pkl' +fi +export ORION_DB_ADDRESS=$orion_db_address +export ORION_DB_TYPE=$orion_db_type + +echo "-------------------------------------" +echo "Experiment Name: $exp_name" +echo "hparams: $hparams" +echo "Output Folder: $output_folder" +echo "Data Folder: $data_folder" +echo "Cached Data Folder: $cached_data_folder" +echo "task: $task" +echo "dataset: $dataset" +echo "Hparam File: $hparams" +echo "Number of Runs: $nruns" +echo "Number of Eval Runs: $nruns_eval" +echo "Eval Metric: $eval_metric" +echo "Seed: $seed" +echo "Additional Flags: $additional_flags" +echo "Orion Config File: $config_file" +echo "Orion Database type: $orion_db_type" +echo "Orion Database file: $orion_db_address" +echo "Experiment Max Trials: $exp_max_trials" +echo "-------------------------------------" + + +# This function will extract all the optimization flags added in the yaml file +# The input is a text file (e.g, a yaml file) and a pattern (e.g, "@orion_step1:") +# The ouput are the detected flags (e.g., --dropout~"uniform(0.0, 0.5)"). +get_flag() { + local file_path="$1" + local pattern="$2" + local filter="$3" + + if [[ -z "$filter" ]]; then + filter=".*" + fi + + # Check if the file exists + if [ ! 
-f "$file_path" ]; then + echo "Error: File '$file_path' not found." + return 1 + fi + + # Use grep to find all lines containing the pattern and then extract the flags using sed + grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | grep $filter | tr -d '\n' +} + + +# Function for updatading the hparam yaml file with the best hparams found at step 1 +update_hparams() { + local best_hparams_file="$1" + local hparams_yaml_file="$2" + local output_yaml_file="$3" + + # Read the values from best_hparams.txt into an associative array + declare -A best_hparams + while IFS=": " read -r key value; do + best_hparams["$key"]=$value + done < "$best_hparams_file" + + + # Read the hparams.yaml file into a variable + local hparams_content=$(cat "$hparams_yaml_file") + + # Update values in hparams_content using values from best_hparams + for key in "${!best_hparams[@]}"; do + local pattern="^$key: .*" + local replacement="$key: ${best_hparams[$key]}" + hparams_content=$(sed "s/$pattern/$replacement/g" <<< "$hparams_content") + done + + # Write the updated content to a new YAML file + echo "$hparams_content" > "$output_yaml_file" +} + +# Function for extracting the best hparams from orion-info +function extract_best_params() { + local input_file="$1" + local best_trial_line=$(grep -n "best trial:" "$input_file" | cut -d ":" -f 1) + local params_lines=$(tail -n +$best_trial_line "$input_file" | awk '/params:/{flag=1;next}/start time:/{flag=0}flag') + local formatted_params=$(echo "$params_lines" | sed -e 's/^[[:space:]]*//' -e 's/: /: /' -e '/^$/d' -e 's#^/##') + echo "$formatted_params" +} + +# Running hparam tuning (loop over multiple steps) +step_id=1 +hparams_step=$hparams +pattern="@orion_step1:" +opt_flags=$(get_flag "$hparams_step" "$pattern" "$hparam_filter") + +# Check if the string is empty and exit with an error if it is +if [ -z "$opt_flags" ]; then + echo "Error: Optimization flags not found in '$hparams'" + echo "Please ensure that the Orion optimization flags are set in the hparam file using in-line comments like:" + echo "# @orion_step1: --dropout~\"uniform(0.0, 0.5)\"" + exit 1 # Exit with a non-zero error code +fi + + +while [ -n "$opt_flags" ]; do + # Do something + output_folder_step="$output_folder"/step"$step_id" + mkdir -p $output_folder_step + exp_name_step="$exp_name"_step"$step_id" + + echo + echo "**********************************************************************************************" + echo "Running hparam tuning (step $step_id)..." + echo "- This might take several hours!" + echo "- The best set of hparams will be save in $output_folder_step" + echo "- You can monitor the evolution of the hparam optimization with: orion status -n $exp_name" + echo "......" 
+ echo "**********************************************************************************************" + echo + # Setting up orion command + orion_hunt_command="orion hunt -n $exp_name_step -c $config_file --exp-max-trials $exp_max_trials \ + ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --cached_data_folder $cached_data_folder \ + --output_folder $output_folder_step/exp --task $task --dataset $dataset --seed $seed --nruns $nruns \ + --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all --testing False $additional_flags" + + + # Appending the optimization flags + orion_hunt_command="$orion_hunt_command $opt_flags" + + echo $orion_hunt_command &> "$output_folder_step/orion_hunt_command.txt" + + # Execute the command for hparm tuning + eval $orion_hunt_command + + # Compress the exp folder (if required) + if [ "$compress_exp" = True ] && [ ! -e "$output_folder_step/exp.tar.gz" ]; then + tar -czf "$output_folder_step/exp.tar.gz" "$output_folder_step/exp" + if [ -d "$output_folder_step/exp" ]; then + rm -rf "$output_folder_step/exp" + fi + + fi + + # Storing best haprams + orion info --name $exp_name_step &> $output_folder_step/orion-info.txt + + # Extract list of the best hparams from orion-info + # Find the line number where "best trial:" appears + best_trial_line=$(grep -n "best trial:" $output_folder_step/orion-info.txt | cut -d ":" -f 1) + + # Extract and store the best set of hparams + best_params_output=$(extract_best_params "$output_folder_step/orion-info.txt") + best_hparams_file="$output_folder_step/best_hparams.txt" + echo "$best_params_output" > $best_hparams_file + + # Store the current best yaml file + best_yaml_file="$output_folder_step/best_hparams.yaml" + update_hparams "$best_hparams_file" "$hparams_step" "$best_yaml_file" + + # Update best hparam step + hparams_step=$best_yaml_file + + # Update step variable + ((step_id++)) + + # Update search pattern + pattern="@orion_step$step_id:" + + # update optimization flags pattern + opt_flags=$(get_flag "$hparams_step" "$pattern" "$hparam_filter") +done + +echo +echo "**********************************************************************************************" +echo "Running Final Evaluation on the best hparams (test-set)..." +echo "**********************************************************************************************" +echo + +final_yaml_file="$output_folder/best_hparams.yaml" +scp $best_yaml_file $final_yaml_file + +# Running evaluation on the test set for the best models +./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ + --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ + --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ + --rnd_dir False --testing True $eval_run_additional_flags + +echo "The test performance with best hparams is available at $output_folder/best" diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py new file mode 100644 index 000000000..e11046ade --- /dev/null +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -0,0 +1,151 @@ +#!/usr/bin/python +""" +Snippet to aggregate the results over multiple runs of the same experiment. +This is useful when we run multiple experiments with different seeds and we +want to compute the average performance. The script also reports the final +metric to Orion (when needed for hyperparameter tuning). 
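The metric extraction in this script boils down to one regular expression applied to every matching line of `train_log.txt`: it captures the number that follows the metric name, including scientific notation, so the per-run values can then be averaged. A self-contained illustration with invented log lines:

```python
# Illustration of the metric extraction used by aggregate_results.py.
# The log lines below are invented; real ones come from each run's train_log.txt.
import re
import numpy as np

eval_metric = "acc"
log_lines = [
    "epoch: 20, lr: 1.00e-04 - test acc: 9.12e-01",  # run 1
    "epoch: 20, lr: 1.00e-04 - test acc: 8.97e-01",  # run 2
]

values = []
for line in log_lines:
    match = re.search(rf"{eval_metric}: (\d+\.\d+(?:e[+-]?\d+)?)", line)
    if match:
        values.append(float(match.group(1)))

print(f"avg: {np.mean(values):.4f} \u00b1 {np.std(values):.4f}")
# Orion minimizes its objective, so accuracy-like metrics are reported as 1 - mean.
```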
+ +The script searches for the result files (_results.txt) and computes the mean +and the standard deviation of the given evaluation metrics (e.g., acc or f1). +The results must have an identical format (with only different performance +numbers). + +To run this script: + + > python aggregate_results.py your_result_folder acc + +Author +------ +Pooneh Mousavi 2024 +""" + +import sys +import re +import numpy as np +from orion.client import report_objective +from speechbrain.utils.data_utils import get_all_files + + +def get_prototype(res_file, eval_metric): + """Parses a result file and adds a placeholder where the aggregated metrics + should be printed. It also returns the number of detected metrics. + + Arguments + --------- + res_file: path + Path of the result file to parse. + eval_metric: path + Metric of interest (e.g, acc or f1). + + Returns + --------- + prototype: list + List of the lines of the result file (with as placeholder). + n_metrics: int + Number of metrics to replace in the result files. + """ + prototype = [] + n_metrics = 0 + + # Open the first res file and figure out where the metrics are + with open(res_file) as file_in: + for line in file_in: + if eval_metric in line: + line = line.split(eval_metric)[0] + # The placeholder for the metric is + line = line + eval_metric + " " + n_metrics = n_metrics + 1 + prototype.append(line) + return prototype, n_metrics + + +def get_metrics(res_files, eval_metric): + """Summarizes the metrics of interest in a matrix. + + Arguments + --------- + res_files: list + List of all the result files. + eval_metric: path + Metric of interest (e.g, acc or f1). + + Returns + --------- + metrics: np.array + Matrix (n_metrics, n_files) containing the metrics of interest. + """ + + # Metric initialization + metrics = np.zeros([n_metrics, len(res_files)]) + + # Loop over files + for i in range(len(res_files)): + cnt = 0 + # Metric extraction + with open(res_files[i]) as file_in: + for line in file_in: + if eval_metric in line: + # Use regex to find the test WER value + match = re.search( + rf"{eval_metric}: (\d+\.\d+(?:e[+-]?\d+)?)", line + ) + if match: + value = match.group(1) + value = float(value) + metrics[cnt, i] = value + cnt = cnt + 1 + return metrics + + +def aggregate_metrics(prototype, metrics): + """Prints the aggregated metrics.It replaces the placeholders with + the corresponding metrics. + + Arguments + --------- + prototype: list + List of the lines of the result file (with as placeholder). + metrics: np.array + Matrix (n_metrics, n_files) containing the metrics of interest. 
+ """ + cnt = 0 + for line in prototype: + if eval_metric in line: + values_line = "[" + for i in range(len(res_files)): + values_line = values_line + "%f " % float(metrics[cnt, i]) + values_line = values_line[:-1] + values_line = values_line + "] avg: %f ± %f " % ( + float(metrics[cnt, :].mean()), + float(metrics[cnt, :].std()), + ) + line = line.replace("", values_line) + cnt = cnt + 1 + print(line) + + +if __name__ == "__main__": + output_folder = sys.argv[1] + eval_metric = sys.argv[2] + + # Getting the list of the result files in the output folder + res_files = get_all_files(output_folder, match_and=["train_log.txt"]) + + # Gettin a prototype file + prototype, n_metrics = get_prototype(res_files[0], eval_metric) + + # Extracting the metrics of interest + metrics = get_metrics(res_files, eval_metric) + + # print aggregated metrics + aggregate_metrics(prototype, metrics) + + final_metric = metrics[-1, :].mean() + + # Report final metric to Orion + # Remember: orion expects metrics to be minimized! + if eval_metric in ["acc", "f1"]: + final_metric = 1 - final_metric + elif eval_metric == "utmos": + final_metric = -final_metric + report_objective(final_metric) diff --git a/benchmarks/DASB/utils/audio_tokens.py b/benchmarks/DASB/utils/audio_tokens.py deleted file mode 100644 index 9dc4014c4..000000000 --- a/benchmarks/DASB/utils/audio_tokens.py +++ /dev/null @@ -1,193 +0,0 @@ -"""Utilities for discrete audio token models - - -Authors - * Artem Ploujnikov 2023 -""" -import torch -from speechbrain.dataio.batch import PaddedBatch -from speechbrain.utils.data_utils import batch_pad_right -from functools import partial - - -def get_silence_token( - model, - sample_length=100000, - extract_emb=True, - device=None, - model_kwargs=None, -): - """Attempts to find out the silence tokens for a given model, - if applicable - - Arguments - --------- - model : nn.Module - A discrete token model, taking (wav, lengths) as arguments - sample_length : int - The length of the sample - extract_emb : bool - Whether to extract embeddings - device : str | torch.Device - The device to use - model_kwargs : dict - Additional arguments to pass to the model - - Returns - ------- - silence_tokens : torch.Tensor - The token(s) corresponding to silence - - silece_emb : torch.Tensor - The embedding(s) corresponding to silence - - """ - if device is None: - device = next(model.parameters()).device - if model_kwargs is None: - model_kwargs = {} - - audio = torch.zeros(1, sample_length, device=device) - length = torch.ones(1, device=device) - result = model(audio, length, **model_kwargs) - tokens = result[0] - silence_tokens = tokens.squeeze(0).mode(0).values - silence_emb = None - if extract_emb: - if hasattr(model, "embeddings"): - silence_emb = model.embeddings( - silence_tokens[None, None, :] - ).squeeze() - else: - heads = tokens.shape[-1] - embs = result[1] - mode_idx = [ - (tokens[0, :, head] == silence_tokens[head]).nonzero()[0].item() - for head in range(heads) - ] - silence_emb = torch.stack( - [embs[0, idx, head] for head, idx in enumerate(mode_idx)] - ) - return silence_tokens, silence_emb - - -def feature_pad_to(tensor, length, padding=None): - """Pads feature dimensions to the specified length with the specified padding, - assuming a (Batch x Length x Features..) 
tensor - - Arguments - --------- - tensor : torch.Tensor - The tensor to be padded - - length : int - The length to which the tensor will be padded - - padding : torch.Tensor, optional - The padding tensor - if omitted, zero padding - will be used - - Returns - ------- - result : torch.Tensor - The padded tensor - """ - if padding is None: - padding = torch.zeros(tensor.shape[1:]) - padding = padding[None, ...].expand( - (length - tensor.size(0),) + tensor.shape[1:] - ) - return torch.cat([tensor, padding], dim=0) - - -def batch_feature_pad(tensors, padding=None): - """Similar to batch_pad_right but pads with the specified padding, whcih - can be a vector or a tensor - - Arguments - --------- - tensors : list - The list of tensors to be padded - padding : torch.Tensor - The padding tensor - - Returns - ------- - result : torch.Tensor - the padded tensor - """ - lengths_abs = torch.tensor( - [len(item) for item in tensors], device=tensors[0].device - ) - max_length = lengths_abs.max() - data = torch.stack( - [feature_pad_to(item, max_length, padding) for item in tensors] - ) - lengths = lengths_abs / max_length - return data, lengths - - -def token_collate_fn(examples, silence_token, token_keys): - """A customized collation function for audio tokens where - the specified silence token will be used as padding - instead of - zeros - - Arguments - --------- - examples : list - A list of examples - - silence_token : torch.Tensor - The token(s) representing silence - - token_keys : list - The list of keys to which special padding will be applied - - Returns - ------- - result : speechbrain.dataio.batch.PaddedBatch - A padded batch - """ - token_tensor_ids = {id(examples[0][key]) for key in token_keys} - return PaddedBatch( - examples, - padding_func=_silence_padding, - padding_kwargs={ - "silence_token": silence_token, - "token_tensor_ids": token_tensor_ids, - }, - ) - - -def _silence_padding(values, silence_token, token_tensor_ids): - return ( - batch_feature_pad(values, silence_token) - if id(values[0]) in token_tensor_ids - else batch_pad_right(values) - ) - - -def use_silence_padding(dataloader_opts, silence_token, token_keys): - """Overrides the collation function to add silence padding to - audio token features - - Arguments - --------- - dataloder_opts : dict - Dataloader options - silence_token : torch.Tensor - The tensor to be used as silence padding - token_keys : torch.Tensor - The keys to apply silence padding to - - Returns - ------- - dataloader_opts : dict - Updated data loader options - """ - return { - **dataloader_opts, - "collate_fn": partial( - token_collate_fn, silence_token=silence_token, token_keys=token_keys - ), - } diff --git a/benchmarks/DASB/utils/data.py b/benchmarks/DASB/utils/data.py index 6c68358f5..3ad31419a 100644 --- a/benchmarks/DASB/utils/data.py +++ b/benchmarks/DASB/utils/data.py @@ -89,3 +89,36 @@ def _undo_padding(batch, lengths): def as_dict(batch): """Converts a batch to a dictionary""" return {key: getattr(batch, key) for key in batch._PaddedBatch__keys} + + +def undo_padding_tensor(batch, lengths): + """Produces Python lists given a batch of sentences with + their corresponding relative lengths. + + Arguments + --------- + batch : torch.Tensor + Batch of sentences gathered in a batch. + lengths : torch.Tensor + Relative length of each sentence in the batch. + + Returns + ------- + as_list : list + A python list of the corresponding input tensor. 
+ + Example + ------- + >>> batch=torch.rand([4,100]) + >>> lengths=torch.tensor([0.5,0.6,0.7,1.0]) + >>> snt_list=undo_padding(batch, lengths) + >>> len(snt_list) + 4 + """ + batch_max_len = batch.shape[1] + as_list = [] + for seq, seq_length in zip(batch, lengths): + actual_size = int(torch.round(seq_length * batch_max_len)) + seq_true = seq.narrow(0, 0, actual_size) + as_list.append(seq_true) + return as_list diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index c0e14f867..1e9c7d2ed 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -7,32 +7,52 @@ """ from speechbrain.inference.interfaces import Pretrained -from speechbrain.inference.ASR import EncoderDecoderASR from speechbrain.lobes.models.huggingface_transformers import Whisper +from speechbrain.lobes.models.huggingface_transformers.wav2vec2 import Wav2Vec2 from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import length_to_mask from speechbrain.decoders.seq2seq import S2SWhisperGreedySearcher from speechbrain.dataio.batch import PaddedBatch from speechbrain.utils.metric_stats import ErrorRateStats -from speechbrain.utils.superpowers import run_shell +from speechbrain.utils.data_utils import pad_right_to +from speechbrain.utils.fetching import fetch from collections import namedtuple from pathlib import Path -import os +from torch import nn import torch import torchaudio import re import string import logging -import shutil -import shlex -import subprocess + logger = logging.getLogger(__name__) + +has_transformers = False +try: + from transformers import AutoModelForAudioXVector + + has_transformers = True +except ImportError: + logger.warning( + "transformers library not found - some evaluators may be disabled" + ) + + RE_PUNCTUATION = re.compile( "|".join(re.escape(char) for char in string.punctuation) ) +SAMPLE_RATE = 16000 +DEFAULT_ENCODER_HUB = "chaanks/wav2vec2-small" +DEFAULT_MODEL_URL = "https://huggingface.co/chaanks/UTMOS/resolve/main" +DEFAULT_MODEL_NAME = "utmos.ckpt" +DEFAULT_SAVE_DIR = "./pretrained_models" +DEFAULT_JUDGE_ID = 288 +DEFAULT_DOMAIN_ID = 0 + SpeechEvaluationResult = namedtuple( "SpeechEvaluationResult", ["score", "details"] ) @@ -184,6 +204,23 @@ def resample(self, audio, sample_rate=None): ) return audio + def on_evaluation_start(self): + """Invoked when evaluation starts""" + pass + + def on_evaluation_end(self): + """Invoked when evaluation ends""" + pass + + def global_metrics(self): + """Returns global metrics (not tied to a specific sample) + + Returns + ------- + metrics : dict + A dictionary of metrics""" + return {} + def _unbatchify(value): """Removes the batch dimension from the tensor. If a single @@ -217,79 +254,26 @@ def __call__(self, wavs, length): return self.mods.model(wavs, length) -class RegressionModelSpeechEvaluator(SpeechEvaluator): - """A speech evaluator that uses a regression model - that produces a quality score (e.g. 
SSL fine-tuning) - for a sample of speech +class ASRSpeechEvaluator(SpeechEvaluator): + """A superclass for ASR speech evaluators Arguments --------- - source : str - The source model path or HuggingFace hub name sample_rate : int - The audio sample rate this evaluator expects + The sample rate used by the underlying ASR system + metric_mode : str + macro = metrics are evaluated per utterance and aggregated + micro = metrics are evaluated globally """ - def __init__(self, source, sample_rate=None, *args, **kwargs): + def __init__(self, sample_rate=16000, metric_mode="macro"): super().__init__(sample_rate=sample_rate) - self.model = SpeechEvaluationRegressionModel.from_hparams( - source, *args, **kwargs - ) - - def evaluate( - self, - wavs, - length, - text=None, - wavs_ref=None, - length_ref=None, - sample_rate=None, - sample_rate_ref=None, - ): - """Evaluates a batch of waveforms - - Arguments - --------- - Arguments - --------- - wavs: torch.Tensor - the waveforms to evaluate - - length: torch.Tensor - relative lengths (a 1-D tensor) - - text : list, optional - Ground truth text - - wavs_ref : torch.Tensor - the reference waveforms - - length_ref : torch.Tensor - the reference waveform lengths - - sample_rate : int, optional - The sample rate of the audio. If not provided, - the audio is assumed to be at the same sample - rate as the model + self.metric_mode = metric_mode + self.metrics = {} - sample_rate_ref : int, optional - The sample rate of the reference samples - - Returns - ------- - result : SpeechEvaluationResult - an aggregated speech evaluation result with a score - for each item - """ - wavs = self.resample(wavs, sample_rate) - scores = self.model(wavs, length) - while scores.dim() > 1 and scores.size(-1) == 1: - scores = scores.squeeze(-1) - return SpeechEvaluationResult(score=scores, details={"score": scores}) - - -class ASRSpeechEvaluator(SpeechEvaluator): - """A superclass for ASR speech evaluators""" + def on_evaluation_start(self): + """Invoked when evaluation starts""" + self.metrics = {} def evaluate( self, @@ -344,6 +328,7 @@ def evaluate( length=length_ref, text=text, sample_rate=sample_rate_ref, + metric_key="ref", ) details.update( {f"{key}_ref": value for key, value in details_ref.items()} @@ -378,126 +363,76 @@ def compute_diff_rate(self, details, device): """ ids = range(1, len(details["pred"]) + 1) - wer_metric, cer_metric = init_asr_metrics() + wer_metric, cer_metric = self.get_asr_metrics("diff") pred = self._replace_blanks(details["pred"]) pred_ref = self._replace_blanks(details["pred_ref"]) + pred = [item.split(" ") for item in pred] + pred_ref = [item.split(" ") for item in pred_ref] wer_metric.append(ids, pred, pred_ref) cer_metric.append(ids, pred, pred_ref) + count = len(ids) dwer = torch.tensor( - [score["WER"] for score in wer_metric.scores], device=device + [score["WER"] for score in wer_metric.scores[-count:]], + device=device, ) dcer = torch.tensor( - [score["WER"] for score in cer_metric.scores], device=device + [score["WER"] for score in cer_metric.scores[-count:]], + device=device, ) return {"dwer": dwer, "dcer": dcer} - def _replace_blanks(self, preds): - """Replaces blanks with single spaces, preventing an exception - in the case of an unintelligible sample + def get_asr_metrics(self, kind="regular"): + """Returns the ASR metrics Arguments --------- - """ - return [" " if item == "" else item for item in preds] - - -class EncoderDecoderASRSpeechEvaluator(ASRSpeechEvaluator): - """A speech evaluator implementation based on ASR. 
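The `metric_mode` option added to `ASRSpeechEvaluator` above distinguishes two ways of aggregating error rates: `macro` computes a WER per utterance and averages those values, while `micro` pools the edit counts over all utterances before dividing, so longer utterances carry more weight. A toy numeric illustration (the counts are invented):

```python
# Toy illustration of macro vs. micro WER aggregation (numbers are invented).
# utterance 1: 2 edits over 10 reference words -> WER 20%
# utterance 2: 1 edit  over  2 reference words -> WER 50%
edits = [2, 1]
ref_words = [10, 2]

macro_wer = sum(e / n for e, n in zip(edits, ref_words)) / len(edits) * 100
micro_wer = sum(edits) / sum(ref_words) * 100

print(f"macro WER: {macro_wer:.1f}%")  # 35.0 - every utterance counts equally
print(f"micro WER: {micro_wer:.1f}%")  # 25.0 - weighted by utterance length
```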
- Computes the Word Error Rate (WER), Character Error Rate (CER) - and a few other metrics - - Arguments - --------- - sample_rate : int - The audio sample rate this evaluator expects - """ - - def __init__(self, source, sample_rate=None, *args, **kwargs): - super().__init__(sample_rate=sample_rate) - self.asr = EncoderDecoderASR.from_hparams(source, *args, **kwargs) - self.device = next(self.asr.mods.parameters()).device - - def evaluate_samples(self, wavs, length, text, sample_rate): - wavs = self.resample(wavs, sample_rate) - if text is None: - raise ValueError("This evaluator requires ground-truth text") - predicted_words, scores, log_probs = self.transcribe_batch_with_details( - wavs, length - ) - ids = range(1, len(wavs) + 1) - wer_metric, cer_metric = init_asr_metrics() - wer_metric.append(ids, predicted_words, text) - cer_metric.append(ids, predicted_words, text) - wer = torch.tensor( - [score["WER"] for score in wer_metric.scores], device=wavs.device - ) - cer = torch.tensor( - [score["WER"] for score in cer_metric.scores], device=wavs.device - ) - prob_mean = log_probs.exp().mean(dim=-1) - return { - "wer": wer, - "cer": cer, - "beam_score": scores, - "prob_mean": prob_mean, - "pred": predicted_words, - "target": text, - } - - def transcribe_batch_with_details(self, wavs, wav_lens): - """Transcribes the input audio into a sequence of words - - The waveforms should already be in the model's desired format. - You can call: - ``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)`` - to get a correctly converted signal in most cases. - - Arguments - --------- - predicted_words : list - The raw ASR predictions, fully decoded - best_scores : list - The best scores (from beam search) - best_log_probs : list - The best predicted log-probabilities (from beam search) - + kind : the kind of metrics to obtain + 'regular' - a new metric for each sample + 'micro' - a global shared metric Returns ------- - predicted_words : list - The predictions - - best_scores : torch.Tensor - The best scores (from beam search) - - best_log_probs : torch.Tensor - The best log-probabilities - + wer_metric : ErrorRateStats + the Word Error Rate (WER) metric + cer_metric : ErrorRateStats + the Character Error Rate (CER) metric """ - with torch.no_grad(): - wav_lens = wav_lens.to(self.device) - encoder_out = self.asr.encode_batch(wavs, wav_lens) - ( - hyps, - best_lens, - best_scores, - best_log_probs, - ) = self.asr.mods.decoder(encoder_out, wav_lens) - predicted_words = [ - self.asr.tokenizer.decode_ids(token_seq) for token_seq in hyps - ] - return predicted_words, best_scores, best_log_probs + if self.metric_mode == "micro": + if kind not in self.metrics: + metrics = init_asr_metrics() + self.metrics[kind] = metrics + metrics = self.metrics[kind] + else: + metrics = init_asr_metrics() + return metrics - def to(self, device): - """Transfers this module to the spcieifed device + def _replace_blanks(self, preds): + """Replaces blanks with single spaces, preventing an exception + in the case of an unintelligible sample Arguments --------- - device : str | torch.Device - the target device """ - self.asr = self.asr.to(device) - return self + return [" " if item == "" else item for item in preds] + + def global_metrics(self): + """Returns global metrics (not tied to a specific sample) + + Returns + ------- + metrics : dict + A dictionary of metrics""" + global_metrics = {} + if self.metric_mode == "micro": + wer_metric, cer_metric = self.get_asr_metrics("regular") + if wer_metric.scores: + 
global_metrics["wer_micro"] = wer_metric.summarize("WER") + global_metrics["cer_micro"] = cer_metric.summarize("WER") + dwer_metric, dcer_metric = self.get_asr_metrics("diff") + if dwer_metric.scores: + global_metrics["dwer_micro"] = dwer_metric.summarize("WER") + global_metrics["dcer_micro"] = dcer_metric.summarize("WER") + return global_metrics class WhisperASRSpeechEvaluator(ASRSpeechEvaluator): @@ -531,12 +466,13 @@ def __init__( source, savedir=None, sample_rate=22050, + metric_mode="macro", min_decode_ratio=0.0, max_decode_ratio=1.0, run_opts=None, unbatch=True, ): - super().__init__(sample_rate=sample_rate) + super().__init__(sample_rate=sample_rate, metric_mode=metric_mode) if run_opts is None: run_opts = {} if savedir is None: @@ -554,7 +490,9 @@ def __init__( self.unbatch = unbatch self.to(device) - def evaluate_samples(self, wavs, length, text, sample_rate): + def evaluate_samples( + self, wavs, length, text, sample_rate, metric_key="regular" + ): """Evaluates a batch of samples Arguments @@ -567,6 +505,8 @@ def evaluate_samples(self, wavs, length, text, sample_rate): Text labels corresponding to the waveforms sample_rate : int The sample rate of the waveforms + metric_key : str + The key for metrics Returns ------- @@ -582,24 +522,25 @@ def evaluate_samples(self, wavs, length, text, sample_rate): torch.ones(1, device=wavs.device), text[idx : idx + 1], sample_rate, + metric_key, ) for idx in range(batch_size) ] result = { + "pred": [result["pred"][0] for result in results], + "target": text, "wer": torch.stack( [result["wer"] for result in results] ).squeeze(-1), "cer": torch.stack( [result["cer"] for result in results] ).squeeze(-1), - "pred": [result["pred"][0] for result in results], - "target": text, } return result else: return self._evaluate_samples(wavs, length, text, sample_rate) - def _evaluate_samples(self, wavs, length, text, sample_rate): + def _evaluate_samples(self, wavs, length, text, sample_rate, metric_key): """Evaluates a batch of samples. This function is meant to be used internally. evaluate_samples will call it multiple times if unbatch is enabled. 
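# Illustrative sketch - a minimal way to drive the refactored Whisper evaluator
# with the new metric_mode option. It assumes benchmarks/DASB/utils/eval.py is
# importable as `eval`, and that evaluate() returns a SpeechEvaluationResult
# whose details carry per-utterance "wer"/"cer" entries as elsewhere in this
# module; the waveforms below are dummies.
import torch
from eval import WhisperASRSpeechEvaluator

evaluator = WhisperASRSpeechEvaluator(
    source="openai/whisper-small",
    savedir="pretrained_models/whisper-small",
    sample_rate=16000,
    metric_mode="micro",  # "macro" (the default) keeps per-utterance scores only
)
evaluator.on_evaluation_start()  # resets the shared metric accumulators
wavs = torch.randn(2, 16000)  # two dummy one-second waveforms
length = torch.tensor([1.0, 0.8])  # relative lengths
result = evaluator.evaluate(wavs, length, text=["hello world", "good morning"])
print(result.details["wer"])  # per-utterance WER
print(evaluator.global_metrics())  # corpus-level wer_micro / cer_micro
evaluator.on_evaluation_end()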
@@ -614,6 +555,8 @@ def _evaluate_samples(self, wavs, length, text, sample_rate): Text labels corresponding to the waveforms sample_rate : int The sample rate of the waveforms + metric_key : bool + Whether to compute the metrics Returns ------- @@ -632,21 +575,27 @@ def _evaluate_samples(self, wavs, length, text, sample_rate): ) predicted_words = [self.normalize(text) for text in predicted_words] ids = range(1, len(wavs) + 1) - wer_metric, cer_metric = init_asr_metrics() - wer_metric.append(ids, predicted_words, text) - cer_metric.append(ids, predicted_words, text) + wer_metric, cer_metric = self.get_asr_metrics(metric_key) + predicted_words_split = [item.split(" ") for item in predicted_words] + text_split = [item.split(" ") for item in text] + wer_metric.append(ids, predicted_words_split, text_split) + cer_metric.append(ids, predicted_words_split, text_split) + count = len(ids) wer = torch.tensor( - [score["WER"] for score in wer_metric.scores], device=wavs.device + [score["WER"] for score in wer_metric.scores[-count:]], + device=wavs.device, ) cer = torch.tensor( - [score["WER"] for score in cer_metric.scores], device=wavs.device + [score["WER"] for score in cer_metric.scores[-count:]], + device=wavs.device, ) - return { + result = { "wer": wer, "cer": cer, "pred": predicted_words, "target": text, } + return result def normalize(self, text): """Performs text normalization (uppercase, remove whitespace, @@ -743,171 +692,320 @@ def evaluate_files(self, file_names, text=None, file_names_ref=None): raise NotImplementedError() -UTMOS_REPO = "https://huggingface.co/spaces/sarulab-speech/UTMOS-demo" +class UTMOSModel(nn.Module): + """The UTMOS model wrapper + + Arguments + --------- + source : str + The WavLM source + save_path : str | path-like + The path where the model will be saved + features_dim : int, optional + The features dimension + num_domains : int, optional + The number of domains + domain_dim : int, optional + The dimension of each domain + num_judges : int, optional + The number of "judges" + judge_dim : int, optional + The dimension of each judge + decoder_hidden_size : int, optional + The size of the decoder hidden state + multiplier : float, optional + The number that the raw model output is multiplied by + to compute the score + offset : float, optional + The number that (raw output * multiplier) will be added + to in order to get the score + """ + + def __init__( + self, + source, + save_path, + features_dim=768, + num_domains=3, + domain_dim=128, + num_judges=3000, + judge_dim=128, + decoder_hidden_size=512, + multiplier=2.0, + offset=3.0, + ): + super().__init__() + + self.ssl_encoder = Wav2Vec2( + source, + save_path, + freeze=True, + output_norm=False, + freeze_feature_extractor=True, + output_all_hiddens=False, + ) + + self.domain_embedding = nn.Embedding(num_domains, domain_dim) + self.judge_embedding = nn.Embedding(num_judges, judge_dim) + + self.decoder = nn.LSTM( + input_size=features_dim + domain_dim + judge_dim, + hidden_size=decoder_hidden_size, + num_layers=1, + batch_first=True, + bidirectional=True, + ) + + self.classifier = nn.Sequential( + nn.Linear(decoder_hidden_size * 2, 2048), + torch.nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(2048, 1), + ) + self.multiplier = multiplier + self.offset = offset + def forward(self, wav, domain_id=None, judge_id=None): + """Computes the forward pass -class UTMOSSpeechEvaluator(BulkSpeechEvaluator): - """An evaluation wrapper for UTMOS + Arguments + --------- + wav : torch.Tensor + The raw waveforms + domain_id : 
torch.Tensor + The domain identifiers + judge_id : torch.Tensor + The judge identifier + + Returns + ------- + result : torch.Tensor + The predicted rating(s) + """ + + if domain_id is None: + domain_id = torch.zeros( + len(wav), dtype=torch.int, device=wav.device + ) + if judge_id is None: + judge_id = ( + torch.ones(len(wav), dtype=torch.int, device=wav.device) + * DEFAULT_JUDGE_ID + ) + + ssl_features = self.ssl_encoder(wav) + domain_emb = self.domain_embedding(domain_id) + judge_emb = self.judge_embedding(judge_id) + + domain_emb = domain_emb.unsqueeze(1).expand( + -1, ssl_features.size(1), -1 + ) + judge_emb = judge_emb.unsqueeze(1).expand(-1, ssl_features.size(1), -1) + concatenated_feature = torch.cat( + [ssl_features, domain_emb, judge_emb], dim=2 + ) + + decoder_output, _ = self.decoder(concatenated_feature) + pred = self.classifier(decoder_output) + + return pred.mean(dim=1).squeeze(1) * self.multiplier + self.offset + + +class UTMOSSpeechEvaluator(SpeechEvaluator): + """The UTMOS speech evaluator wrapper Github: https://github.com/sarulab-speech/UTMOS22 HuggingFace: https://huggingface.co/spaces/sarulab-speech/UTMOS-demo + Arguments --------- - model_path : str | path-like - The path where the HuggingFace repository was extracted - output_folder : str | path-like - The folder where results will be output - ckpt_path : str | path-like - The path to the checkpoint to be used - script : str | path-like - The path to the evaluation script, defaults to the bundled - predict.py - python : str | path-like, optional - The path to the Python interpreter to be used, defaults to - "python". Depending on the environment, it might need to be - changed (e.g. to "python3" or an absolute path to the interpreter) - use_python : bool - Whether to launch the script using python. This flag will need to be - set to False in environments where running UTMOS requires a wrapper shell - script (e.g. to initialize a different Python virtual environment from - the one in which SpeechBrain is running) - tmp_folder : str | path-like, optional - The temporary folder where files will be copied for evaluation. If - omitted, it will be set to output_folder. This can be useful on - compute environments that provide fast local storage (e.g. 
certain - compute clusters) - repo : str - The repor + source : str, optional + The WavLM source + save_path : str | path-like, optional + The path where the model will be saved + model_name : str + The name of the model hub + model_url : str + The model URL (if applicable) + domain_id : int + The domain ID of the underlying model + judge_id : int + The judge ID to use (given UTMOS was trained as an ensemble + of judges) + run_opts: dict, optional + The run options + sample_rate : int + The sample rate of the underlying model """ def __init__( self, - model_path, - output_folder, - ckpt_path, - script="predict.py", - python="python", - use_python=True, - batch_size=8, - tmp_folder=None, - repo=UTMOS_REPO, + source=None, + save_path=None, + model_name=None, + model_url=None, + domain_id=None, + judge_id=None, + run_opts=None, + sample_rate=16000, ): - self.output_folder = Path(output_folder) - rand = torch.randint(1, 999999999, (1,)).item() - if tmp_folder is None: - tmp_folder = self.output_folder - else: - tmp_folder = Path(tmp_folder) - self.eval_path = (tmp_folder / f"eval_{rand}").absolute() - self.model_path = Path(model_path).absolute() - script = self.model_path / script - self.script = script - self.ckpt_path = Path(ckpt_path).absolute() - self.batch_size = batch_size - self.python = python - self.use_python = use_python - self.repo = repo - self.install() - - def install(self): - if self.model_path.exists(): - logger.info("UTMOS is already installed in %s", self.model_path) - return - logger.info( - "Attempting to install UTMOS from %s to %s", - self.repo, - self.model_path, - ) - cmd = shlex.join( - [ - "git", - "-C", - str(self.model_path.parent), - "clone", - self.repo, - str(self.model_path.name), - ] - ) - output, err, return_code = run_shell(cmd) - if return_code != 0: - raise CommandError(cmd, output, err, return_code) - logger.info("Repository clone successful, performing an LFS fetch") - cwd = Path.cwd() - try: - os.chdir(self.model_path) - cmd = shlex.join(["git", "lfs", "fetch"]) - output, err, return_code = run_shell(cmd) - if return_code != 0: - raise CommandError(cmd, output, err, return_code) - finally: - os.chdir(cwd) - if not self.ckpt_path.exists(): - raise ValueError("ckpt_path {ckpt_path} does not exist") - - def evaluate_files(self, file_names, text, file_names_ref=None): - """Evaluates multiple files + super().__init__(sample_rate=sample_rate) + self.model = UTMOSModel(source=source, save_path=save_path,) + if run_opts is not None: + device = run_opts.get("device") + if device: + self.model = self.model.to(device) + fetch(model_name, model_url, save_path) + model_path = Path(save_path) / model_name + state_dict = torch.load(model_path) + self.model.load_state_dict(state_dict) + self.model.eval() + + self.domain_id = domain_id + self.judge_id = judge_id + + def evaluate( + self, + wavs, + length, + text=None, + wavs_ref=None, + length_ref=None, + sample_rate=None, + sample_rate_ref=None, + ): + """Evaluates a batch of waveforms using UTMOS Arguments --------- - file_names : list - A list of files - - text : list - File transcripts (not required for all evaluators) - Not used in this evaluator - - file_names_ref : list, optional - A list of reference files / ground truths (if applicable) - Not used in this evaluator + wavs: torch.Tensor + the waveforms to evaluate + length: torch.Tensor + relative lengths (a 1-D tensor) + text : list, optional + Ground truth text. Ignored for UTMOS. + wavs_ref : torch.Tensor + the reference waveforms. Ignored for UTMOS. 
+ length_ref : torch.Tensor + the reference waveform lengths. Ignored for UTMOS. + sample_rate : int, optional + The sample rate of the audio. If not provided, + the audio is assumed to be at the same sample + rate as the model + sample_rate_ref : int, optional + The sample rate of the reference samples. Ignored for UTMOS. Returns ------- result : SpeechEvaluationResult - a consolidated evaluation result + an aggregated speech evaluation result with a score + for each item """ - current_path = os.getcwd() - try: - self.eval_path.mkdir(parents=True, exist_ok=True) - logger.info("Copying the files to '%s'", self.eval_path) - for file_name in file_names: - target_file_name = self.eval_path / Path(file_name).name - shutil.copy(file_name, target_file_name) - - logger.info("Running evaluation") - result_path = self.eval_path / "result.txt" - os.chdir(self.model_path) - cmd = [ - str(self.script), - "--mode", - "predict_dir", - "--bs", - str(self.batch_size), - "--inp_dir", - str(self.eval_path), - "--out_path", - str(result_path), - "--ckpt_path", - str(self.ckpt_path), - ] - if self.use_python: - cmd = [self.python] + cmd - - output = subprocess.check_output(cmd) - logger.info("Evaluation finished, output: %s", output) - file_names = [path.name for path in self.eval_path.glob("*.wav")] - with open(result_path) as result_path: - scores = [float(line.strip()) for line in result_path] - score_map = dict(zip(file_names, scores)) - scores_ordered = [ - score_map[Path(file_name).name] for file_name in file_names - ] - return SpeechEvaluationResult( - scores_ordered, {"utmos": scores_ordered} + wavs = self.resample(wavs, sample_rate=sample_rate) + domain_id, judge_id = None, None + if self.domain_id is not None: + domain_id = ( + torch.ones(len(wavs), device=wavs.device) * self.domain_id ) - finally: - os.chdir(current_path) - shutil.rmtree(self.eval_path) + if self.judge_id is not None: + judge_id = torch.ones(len(wavs), device=wavs.device) * self.judge_id + + scores = self.model(wav=wavs, domain_id=domain_id, judge_id=judge_id) + return SpeechEvaluationResult(score=scores, details={"utmos": scores}) + + +class SpkSimWavLM(SpeechEvaluator): + """A speaker similarity evaluator based on WavLM / XVector + + Arguments + --------- + source : str + The model hub to use + savedir : str + The path where the model will be saved + model_sample_rate : int, optional + The sample rate to which all samples will be resampled + before being processed + """ + + def __init__( + self, + source, + savedir, + model_sample_rate=16000, + run_opts=None, + *args, + **kwargs, + ): + if not has_transformers: + raise ValueError( + "Unable to use the SpkSimWavLM evaluator because the " + "transformers library is not enabled" + ) + if run_opts is None: + run_opts = {} + device = run_opts.get("device") + self.model = AutoModelForAudioXVector.from_pretrained( + source, cache_dir=savedir, *args, **kwargs + ) + if device is not None: + self.model = self.model.to(device) + + self.model.eval() + self.model_sample_rate = model_sample_rate + self.device = next(self.model.parameters()).device + + def evaluate( + self, + wavs, + length, + text=None, + wavs_ref=None, + length_ref=None, + sample_rate=None, + sample_rate_ref=None, + ): + # Resample + if sample_rate is not None: + wavs = torchaudio.functional.resample( + wavs, orig_freq=sample_rate, new_freq=self.model_sample_rate + ) + if sample_rate_ref is not None: + wavs_ref = torchaudio.functional.resample( + wavs_ref, + orig_freq=sample_rate_ref, + new_freq=self.model_sample_rate, + ) + 
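+        # The hypothesis and reference waveforms are padded to a common length
+        # and stacked into a single batch so that the x-vector model below is
+        # run in one forward pass; the relative lengths are first converted to
+        # absolute sample counts to build the attention mask, and the speaker
+        # similarity score is the cosine similarity between the two halves of
+        # the resulting embeddings.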
+ # Concatenate + batch_size, wavs_max_len = wavs.shape + _, wavs_ref_max_len = wavs_ref.shape + length_abs = length * wavs_max_len + length_ref_abs = length_ref * wavs_ref_max_len + max_len = max(wavs_max_len, wavs_ref_max_len) + wavs, _ = pad_right_to(wavs, (batch_size, max_len)) + wavs_ref, _ = pad_right_to(wavs_ref, (batch_size, max_len)) + audio = torch.cat([wavs, wavs_ref]) + + length_cat_abs = torch.cat([length_abs, length_ref_abs]) + # Attention mask + attention_mask = length_to_mask( + length_cat_abs.int() + ).long() # 0 for masked tokens + # Forward + with torch.inference_mode(): + embs = self.model( + input_values=audio, + attention_mask=attention_mask, + output_attentions=False, + ).embeddings + hyp_embs, ref_embs = embs.split([len(wavs), len(wavs_ref)]) + scores = torch.nn.functional.cosine_similarity( + hyp_embs, ref_embs, dim=-1 + ) + + return SpeechEvaluationResult(scores, {"score": scores}) def vocoder_to_device(vocoder, device): diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py new file mode 100644 index 000000000..bc2a43966 --- /dev/null +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -0,0 +1,638 @@ +""" +Unified interface for tokenizers, standardizing the output shape of encode and decode functions. + +This class reshapes the outputs of various tokenizers to ensure consistency, simplifying integration with recipes and workflows. + +Authors +--------- +* Pooneh Mousavi, 2024 +""" + +import importlib +import sys +import os +import torch +import re +from abc import ABC, abstractmethod +from pathlib import Path +from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec +from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import ( + DiscreteSSL, +) +from speechbrain.lobes.models.discrete.dac import DAC +from speechbrain.lobes.models.discrete.speechtokenizer import SpeechTokenizer +from speechbrain.lobes.models.discrete.wavtokenizer import WavTokenizer +from speechbrain.lobes.models.huggingface_transformers.mimi import Mimi +from speechbrain.utils.superpowers import run_shell +from speechbrain.utils.fetching import fetch +from torch import nn +import logging +import shlex +import yaml + +logger = logging.getLogger(__name__) + +base_dir = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..") +) # noqa: E402 +sys.path.append(base_dir) # noqa: E402 + +from model.sq_codec import SQCodec # noqa: E402 + + +class BaseTokenizer(ABC): + """ + Abstract base class for tokenizers that encode signals into discrete tokens + and decode tokens back into signals. + + This class defines the essential methods that any tokenizer must implement, + including encoding, decoding, and retrieving pretrained embeddings. + + Naming Convenstion + ------------------ + B : int + Batch size. + T : int + Sequence length in the time domain. + N : int + Sequence length in the token domain. + C : int + Vocabulary size, assuming each codebook has the same number of tokens. + K : int + Number of codebooks. + """ + + def __init__(self): + """ + Initialize the BaseTokenizer. + + This is a base constructor that other tokenizers can extend. + """ + super().__init__() + + @abstractmethod + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + """ + Encode a signal into discrete tokens. + + Arguments + --------- + signal : torch.Tensor + Input signal with shape [B, T]. + lengths : torch.Tensor + Lengths of each sequence in the batch, with shape [B]. 
+ num_codebooks : int, optional + Number of codebooks to use for encoding. If None, all codebooks are used (default: None). + If specified as an int, the tokens will be truncated to include only the first `num_codebooks` codebooks. If specified as a list, + the tokens will include only the codebooks at the specified indices. + **kwargs : dict + Additional arguments for the tokenizer. + + Returns + ------- + tokens : torch.Tensor + Discretized tokens with shape [B, N, K]. + """ + pass + + @abstractmethod + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + """ + Decode discrete tokens back into a signal. + + Arguments + --------- + tokens : torch.Tensor + Input tokens with shape [B, N, K]. + **kwargs : dict + Additional arguments for the tokenizer. + + Returns + ------- + signal : torch.Tensor + Reconstructed signal with shape [B, T]. + """ + pass + + @abstractmethod + @torch.no_grad() + def get_pretrained_embeddings(self, vocab_size, num_codebooks, **kwargs): + """ + Retrieve pretrained embeddings for the tokenizer. + + Arguments + --------- + vocab_size : int + Number of tokens in each codebook. + num_codebooks : int + Number of codebooks. + **kwargs : dict + Additional arguments for embedding retrieval. + + Returns + ------- + embeddings : torch.Tensor + Pretrained embedding weights with shape [K * C, H], where H is the embedding dimension. + """ + pass + + +class EncodecTokenizer(Encodec, BaseTokenizer): + """This is a wrapper for the Encodec implemented in the SpeechBrain main repository. + + Source paper: + https://arxiv.org/abs/2210.13438 + Example + ------- + >>> model_hub = "facebook/encodec_24khz" + >>> save_path = "savedir" + >>> model = EncodecTokenizer(model_hub, save_path) + >>> emb=model.get_pretrained_embeddings() + >>> emb.shape + torch.Size([2048, 128]) + >>> audio = torch.randn(4, 1000) + >>> length = torch.tensor([1.0, .5, .75, 1.0]) + >>> tokens= model.sig_to_tokens(audio, length) + >>> tokens.shape + torch.Size([4, 4, 2]) + >>> rec = model.tokens_to_sig(tokens, lenght=length) + >>> rec.shape + torch.Size([4, 1280] + """ + + def __init__(self, *args, **kwargs): + Encodec.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens, _ = self.encode(signal, lengths) + if num_codebooks: + if tokens.shape[-1] < num_codebooks: + raise ValueError( + f"Model only outputs {tokens.shape[-1]} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[..., :num_codebooks] + return tokens + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + signal = self.decode(tokens)[:, 0] + return signal + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + embeddings = self.vocabulary + return embeddings.reshape(-1, embeddings.shape[-1]) + + +class DACTokenizer(DAC, BaseTokenizer): + """This is a wrapper for the DAC implemented in the SpeechBrain main repository. 
+ + Source paper: + http://arxiv.org/abs/2306.06546 + Example + ------- + >>> model = DACTokenizer(load_pretrained=True, model_type="24KHz", model_bitrate="8kbps", tag="latest") + >>> audio = torch.randn(4, 16000) + >>> emb=model.get_pretrained_embeddings(vocab_size=1024, num_codebooks=8) + >>> emb.shape + torch.Size([8192, 1024]) + >>> tokens= model.sig_to_tokens(audio) + >>> tokens.shape + torch.Size([4, 50, 32]) + >>> rec = model.tokens_to_sig(tokens) + >>> rec.shape + torch.Size([4, 15992]) + """ + + def __init__(self, *args, **kwargs): + DAC.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens, _ = self(signal[:, None], n_quantizers=num_codebooks) + return tokens.movedim(-1, -2) + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + quantized_feats, _, _ = self.quantizer.from_codes( + tokens.movedim(-1, -2) + ) + return self.decode(quantized_feats)[:, 0] + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + toks = torch.arange(vocab_size).to(next(self.parameters()).device) + toks = toks[:, None, None].expand(-1, num_codebooks, -1).clone() + self.eval() + z_q, z_p, _ = self.quantizer.from_codes(toks) + z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) + z_qs = [ + self.quantizer.quantizers[i].out_proj(z_p_i) + for i, z_p_i in enumerate(z_ps) + ] + return torch.cat(z_qs)[:, :, 0] + + +class SpeechTokenizerWrapper(SpeechTokenizer, BaseTokenizer): + """This is a wrapper for the SpeechTokenizer implemented in the SpeechBrain main repository. + + Source paper: + https://arxiv.org/abs/2308.16692 + Example + ------- + >>> audio = torch.rand([10, 600]) + >>> model_hub = "fnlp/SpeechTokenizer" + >>> save_path = "savedir" + >>> model = SpeechTokenizerWrapper(model_hub, save_path) + >>> emb=model.get_pretrained_embeddings(vocab_size=1024, num_codebooks=8) + >>> emb.shape + torch.Size([8192, 1024]) + >>> tokens= model.sig_to_tokens(audio) + >>> tokens.shape + torch.Size([10, 2, 8]) + >>> rec = model.tokens_to_sig(tokens) + >>> rec.shape + torch.Size([10, 640]) + """ + + def __init__(self, *args, **kwargs): + SpeechTokenizer.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens = self(signal) + if num_codebooks: + if len(tokens) < num_codebooks: + raise ValueError( + f"Model only outputs {len(tokens)} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[:num_codebooks] + return tokens.movedim(-3, -1) + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + return self.decode(tokens.movedim(-1, -3)) + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + toks = torch.arange(vocab_size).to(next(self.parameters()).device) + toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() + self.eval() + embs = [ + self.model.quantizer.vq.layers[i].decode(indices) + for i, indices in enumerate(toks) + ] + return torch.cat(embs)[:, :, 0] + + +class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): + """This is a wrapper for the Encodec implemented in the SpeechBrain main repository. 
+ + Source paper: + https://arxiv.org/abs/2210.13438 + Example + ------- + >>> from speechbrain.lobes.models.huggingface_transformers.wavlm import (WavLM) + >>> inputs = torch.rand([3, 2000]) + >>> model_hub = "microsoft/wavlm-large" + >>> save_path = "savedir" + >>> ssl_layer_num = [7,23] + >>> deduplicate =[False, True] + >>> bpe_tokenizers=[None, None] + >>> vocoder_repo_id = "speechbrain/hifigan-wavlm-k1000-LibriTTS" + >>> kmeans_dataset = "LibriSpeech" + >>> num_clusters = 1000 + >>> ssl_model = WavLM(model_hub, save_path,output_all_hiddens=True) + >>> model = DiscreteSSLTokenizer(save_path, ssl_model, vocoder_repo_id=vocoder_repo_id, kmeans_dataset=kmeans_dataset,num_clusters=num_clusters) + >>> emb=model.get_pretrained_embeddings(num_codebooks=ssl_layer_num) + >>> emb.shape + torch.Size([2000, 1024]) + >>> tokens= model.sig_to_tokens(inputs,num_codebooks=ssl_layer_num, deduplicates=deduplicate, bpe_tokenizers=bpe_tokenizers) + >>> tokens.shape + torch.Size([3, 6, 2]) + >>> sig = model.tokens_to_sig(tokens, SSL_layers=ssl_layer_num) + >>> sig.shape + torch.Size([3, 1920]) + """ + + def __init__(self, *args, **kwargs): + DiscreteSSL.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens, _, _ = self.encode( + signal, lengths, SSL_layers=num_codebooks, **kwargs + ) + return tokens + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + return self.decode(tokens, **kwargs).squeeze(1) + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + embs = [] + for layer_num, vocabulary in zip( + self.ssl_layer_ids, self.vocabularies, + ): + if layer_num not in num_codebooks: + continue + embs.append(torch.as_tensor(vocabulary, dtype=torch.float32)) + embs = torch.cat(embs) + return embs + + +class MimiTokenizer(Mimi, BaseTokenizer): + """This is a wrapper for the Mimi implemented in the SpeechBrain main repository. 
+ + Source paper: + https://kyutai.org/Moshi.pdf + Example + ------- + >>> model_hub = "kyutai/mimi" + >>> save_path = "savedir" + >>> model = MimiTokenizer(model_hub, save_path) + >>> emb=model.get_pretrained_embeddings() + >>> emb.shape + torch.Size([16384, 256]) + >>> audio = torch.randn(4, 48000) + >>> length = torch.tensor([1.0, .5, .75, 1.0]) + >>> tokens = model.sig_to_tokens(audio, length) + >>> tokens.shape + torch.Size([4, 25, 8]) + >>> rec = model.tokens_to_sig(tokens, length=length) + >>> rec.shape + torch.Size([4, 48000]) + """ + + def __init__(self, *args, **kwargs): + Mimi.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens, _ = self.encode(signal, lengths) + if num_codebooks: + if tokens.shape[1] < num_codebooks: + raise ValueError( + f"Model only outputs {tokens.shape[1]} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[:, :num_codebooks, :] + return tokens.movedim(-1, -2) + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + signal = self.decode(tokens.movedim(-1, -2), **kwargs)[:, 0] + return signal + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + return self.embeddings.view(-1, self.embeddings.size(-1)) + + +class WavTokenizerWrapper(WavTokenizer, BaseTokenizer): + """This is a wrapper for the WavTokenizer implemented in the SpeechBrain main repository. + + Source paper: + https://arxiv.org/abs/2408.16532 + + Example + ------- + >>> model_hub = "novateur/WavTokenizer" + >>> save_path = "savedir" + >>> config="wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml" + >>> checkpoint="WavTokenizer_small_600_24k_4096.ckpt" + >>> model = WavTokenizerWrapper(model_hub, save_path,config=config,checkpoint=checkpoint) + >>> emb=model.get_pretrained_embeddings() + >>> emb.shape + torch.Size([4096, 512]) + >>> audio = torch.randn(4, 48000) + >>> length = torch.tensor([1.0, .5, .75, 1.0]) + >>> tokens= model.sig_to_tokens(audio, length) + >>> tokens.shape + torch.Size([4, 80, 1]) + >>> rec = model.tokens_to_sig(tokens) + >>> rec.shape + torch.Size([4, 48000]) + """ + + def __init__(self, *args, **kwargs): + WavTokenizer.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens, _ = self.encode(signal) + if num_codebooks: + if tokens.shape[1] < num_codebooks: + raise ValueError( + f"Model only outputs {tokens.shape[1]} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[:, :num_codebooks, :] + + return tokens.movedim(-2, -1) + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + signal = self.decode(tokens.movedim(-1, -2)) + return signal + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + return self.embeddings + + +class SQCodecTokenizer(SQCodec, BaseTokenizer): + """This is a wrapper for the SQCoced implemented in the model folder. 
+ + Source paper: + https://arxiv.org/abs/2406.02328, https://arxiv.org/abs/2408.13893 + + + Make sure that you download and extract the SQ-codec.zip in save_path from following Huggingface repo: + - HF repo: https://huggingface.co/Dongchao/UniAudio/blob/main/SQ-Codec.zip + + Example + ------- + >>> save_path = "savedir" + >>> config = "config.yaml" + >>> checkpoint = "ckpt_00190000.pth" + >>> model = SQCodecTokenizer(save_path, config, checkpoint) + >>> audio = torch.randn(3, 48000) + >>> tokens = model.sig_to_tokens(audio) + >>> tokens.shape + torch.Size([3, 150, 4]) + >>> rec = model.tokens_to_sig(tokens) + >>> rec.shape + torch.Size([3, 48000] + """ + + def __init__(self, *args, **kwargs): + SQCodec.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens, _ = self.encode(signal) + return tokens.view(tokens.shape[0], -1, self.n_codebook) + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + signal = self.decode(tokens.reshape(tokens.shape[0], -1), **kwargs) + return signal.squeeze(1) + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + """ + This method is not implemented for SQCodec, as it uses scalar quantization + and does not have any trainable quantizer or embedding. + """ + raise ValueError( + "SQCodec does not have any trainable quantizer or embedding since it uses scalar quantization." + ) + + +DEFAULT_ESPNET_REPO = "https://github.com/espnet/espnet" + + +class ESPNetEncodecInterface(BaseTokenizer, nn.Module): + """An interface for pretrained ESPNet Encodec implementations""" + + def __init__( + self, + source, + model_ckpt, + model_config, + save_path, + sample_rate=24000, + n_codebook=32, + espnet_repo=DEFAULT_ESPNET_REPO, + espnet_commit=None, + ): + super().__init__() + self.source = source + self.model_ckpt = model_ckpt + self.model_config = model_config + self.save_path = Path(save_path) + self.sample_rate = sample_rate + self.n_codebook = n_codebook + self.espnet_repo = espnet_repo + self.espnet_commit = espnet_commit + self._load() + + def _load(self): + self._load_espnet() + ckpt_file_name = fetch( + filename=self.model_ckpt, + source=self.source, + savedir=str(self.save_path), + save_filename=str(Path(self.model_ckpt).name), + ) + config_file_name = fetch( + filename=self.model_config, + source=self.source, + savedir=str(self.save_path), + save_filename="config.yaml", + ) + with open(config_file_name) as config_file: + config = yaml.safe_load(config_file) + from espnet2.gan_codec.encodec.encodec import Encodec as ESPNetEncodec + + self.encodec = ESPNetEncodec(**config["codec_conf"]) + device = next(iter(self.encodec.parameters())).device + state_dict = torch.load(ckpt_file_name, map_location=device) + state_dict = { + re.sub("^codec.", "", key): value + for key, value in state_dict.items() + } + self.encodec.load_state_dict(state_dict) + + def _load_espnet(self): + try: + importlib.import_module("espnet2") + except ModuleNotFoundError: + self._download_espnet() + + def _download_espnet(self): + logger.info("espnet is not installed, installing") + espnet_path = self.save_path / "espnet" + if not espnet_path.exists(): + logger.info("Cloining %s into %s", self.espnet_repo, espnet_path) + cmd = shlex.join( + ["git", "clone", self.espnet_repo, str(espnet_path)] + ) + run_shell(cmd) + else: + logger.info("%s already exists", espnet_path) + if 
self.espnet_commit: + logger.info("Checking out %s", self.espnet_commit) + cmd = shlex.join( + ["git", "-C", str(espnet_path), "checkout", self.espnet_commit] + ) + run_shell(cmd) + logger.info("Installing") + cmd = shlex.join(["pip", "install", "-e", str(espnet_path)]) + run_shell(cmd) + logger.info("Installation completed") + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.encodec.eval() + if signal.dim() < 3: + signal = signal.unsqueeze(1) + tokens = self.encodec.encode(signal) + return tokens.permute(1, 2, 0)[:, :, : self.n_codebook] + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.encodec.eval() + tokens = tokens.permute(2, 0, 1) + signal = self.encodec.decode(tokens, **kwargs) + return signal.squeeze(1) + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + """ + This method is not implemented for ESPNet Encodec, as it uses scalar quantization + and does not have any trainable quantizer or embedding. + """ + raise ValueError( + "ESPNet Encodec does not have any trainable quantizer or embedding since it uses scalar quantization." + ) diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py new file mode 100644 index 000000000..03ea5049c --- /dev/null +++ b/benchmarks/DASB/utils/tokens.py @@ -0,0 +1,411 @@ +""" +Unified interface for token extraction and pretrained embeddings handling for speech tokenizers. + +Authors +--------- +* Jarod Duret, 2024 +""" + +import math +import logging +import pathlib as pl +import kaldiio +import torch +import torchaudio +import numpy as np +from tqdm.auto import tqdm +import speechbrain as sb +from speechbrain.dataio.dataloader import make_dataloader +from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.dataio.dataio import load_pkl, save_pkl + + +logger = logging.getLogger(__name__) +OPT_FILE = "opt_extract.pkl" + + +def get_device(use_cuda): + logger.info("=" * 30) + logger.info(f"USE_CUDA SET TO: {use_cuda}") + logger.info(f"CUDA AVAILABLE?: {torch.cuda.is_available()}") + logger.info("=" * 30) + use_cuda = use_cuda and torch.cuda.is_available() + return torch.device("cuda" if use_cuda else "cpu") + + +class TokensExtractor: + """ + Extracts tokens from audio data using a tokenizer and saves them to a specified format. + + Arguments + --------- + tokenizer : torch.nn.Module + The tokenizer model to use for token extraction. + sample_rate : int + The sample rate of the audio data. + src_key : str, optional + The key in the dataset that contains the audio data (default: "wav"). + id_key : str, optional + The key in the dataset that contains unique identifiers (default: "id"). + save_format : str, optional + The format to save the tokens ('numpy', 'pickle', 'soundfile_flac') (default: "numpy"). + use_cuda : bool, optional + Whether to use CUDA for computation (default: True). + dataloader_opts : dict, optional + Options for the data loader (default: None). + + Raises + ------ + ValueError + If an unsupported save_format is provided. + ValueError + If the tokenizer's sample rate does not match the provided sample_rate. 
+ """ + + def __init__( + self, + tokenizer, + sample_rate, + src_key="wav", + id_key="id", + save_format="numpy", + use_cuda=True, + dataloader_opts=None, + ): + self.id_key = id_key + self.src_key = src_key + + self.device = get_device(use_cuda) + self.tokenizer = tokenizer.to(self.device) + self.sample_rate = sample_rate + + if tokenizer.sample_rate != self.sample_rate: + raise ValueError( + f"Sample rate mismatch: {self.sample_rate} != {tokenizer.sample_rate}" + ) + + if save_format not in ["numpy", "pickle", "soundfile_flac"]: + raise ValueError(f"Unsupported save_format: {save_format}") + self.save_format = save_format + + if not dataloader_opts: + dataloader_opts = {} + self.dataloader_opts = dataloader_opts + self.pipelines = self._make_pipelines() + + def extract_tokens( + self, dataset, num_codebooks, save_path, save_name="tokens" + ): + """ + Extracts tokens from the dataset and saves them to the specified format. + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset or dict + The dataset from which to extract tokens. Can be a DynamicItemDataset or a dictionary. + num_codebooks: int + The number of codebooks to retrieve from the tokens. + save_path: str + The path where tokens will be saved. + save_name: str + The name of the .scp and .ark files. + """ + conf = { + "sample_rate": self.sample_rate, + "save_folder": save_path, + "dataset_length": len(dataset), + } + + save_path = pl.Path(save_path).absolute() + save_path.mkdir(parents=True, exist_ok=True) + + # Check if the extraction is already done (if so, skip it) + if _skip(save_path, save_name, conf): + logger.info("Skipping extraction, completed in previous run.") + return + + self.wspecifier = ( + f"ark,scp,t:{save_path}/{save_name}.ark,{save_path}/{save_name}.scp" + ) + self.writer = kaldiio.WriteHelper( + self.wspecifier, write_function="numpy" + ) + + if isinstance(dataset, dict): + dataset = DynamicItemDataset(dataset) + dataset.set_output_keys([self.src_key, self.id_key, "sig"]) + for pipeline in self.pipelines: + dataset.add_dynamic_item(pipeline) + + dataloader = make_dataloader(dataset, **self.dataloader_opts) + batch_size = self.dataloader_opts.get("batch_size", 1) + batch_count = int(math.ceil(len(dataset) / batch_size)) + for batch in tqdm(dataloader, total=batch_count): + batch = batch.to(self.device) + x, x_lengths = batch["sig"] + ids = batch[self.id_key] + batch_tokens = self.tokenizer.sig_to_tokens( + x, x_lengths, num_codebooks=num_codebooks + ) + batch_tokens = sb.utils.data_utils.undo_padding( + batch_tokens, x_lengths + ) + self.process_batch(batch_tokens, ids) + + logger.info("Extraction completed.") + + save_opt = save_path / OPT_FILE + save_pkl(conf, save_opt.as_posix()) + + def process_batch(self, batch, ids): + """ + Processes a batch of tokens and writes them to the output files. + + Arguments + --------- + batch : list + A list of tokens for each item in the batch. + ids : list + A list of unique identifiers corresponding to each item in the batch. + """ + for tokens, utt_id in zip(batch, ids): + tokens = np.array(tokens) + self.writer(utt_id, tokens) + + def _make_pipelines(self): + """ + Creates the data processing pipeline for audio data. + + The pipeline reads audio files, resamples them to the desired sample rate, and provides + the processed signal under the key "sig". + + Returns + ------- + pipeline : list + A list containing the audio processing pipeline function. 
+ """ + + @sb.utils.data_pipeline.takes(self.src_key) + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + info = torchaudio.info(wav) + sig = sb.dataio.dataio.read_audio(wav) + sig = torchaudio.transforms.Resample( + info.sample_rate, self.sample_rate, + )(sig) + return sig + + return [audio_pipeline] + + def save_pretrained_embeddings( + self, + save_path, + save_name="embeddings", + vocab_size=None, + num_codebooks=None, + ): + """ + Saves the pretrained embeddings of the tokenizer to a specified directory. + + This method retrieves the pretrained embeddings from the tokenizer, + converts them to a NumPy array, and saves them as a `.npy` file. + + Parameters + ---------- + save_path : str or pathlib.Path + The directory where the pretrained embeddings will be saved. + If the directory does not exist, it will be created. + save_name : str, optional + The base name of the saved embeddings file (default is "embeddings"). + The embeddings will be saved as `.npy` in the specified directory. + """ + save_path = pl.Path(save_path).absolute() + save_path.mkdir(parents=True, exist_ok=True) + + embeddings = self.tokenizer.get_pretrained_embeddings( + vocab_size, num_codebooks + ) + embeddings = embeddings.cpu().numpy() + np.save(save_path / save_name, embeddings) + + def __del__(self): + """ + Close the writer. + """ + self.writer.close() + + +def _skip(save_path, save_name, conf): + """ + Detects if the dataset extraction has been already done. + If the extraction has been done, we can skip it. + + Arguments + --------- + save_path : str + The path to the directory containing extracted tokens. + save_name : str + The base name of the saved tokens file. + conf : dict + Configuration to match against saved config. + + Returns + ------- + bool + if True, the preparation phase can be skipped. + if False, it must be done. + """ + skip = True + + # Checking ark,scp files + for ext in [".ark", ".scp"]: + save_file = save_path / f"{save_name}{ext}" + if not save_file.exists: + skip = False + + # Checking saved options + save_opt = save_path / OPT_FILE + if skip is True: + if save_opt.exists(): + opts_old = load_pkl(save_opt.as_posix()) + if opts_old == conf: + skip = True + else: + skip = False + else: + skip = False + return skip + + +class TokensLoader: + """ + A loader class for retrieving tokens corresponding to utterance IDs. + + Arguments + --------- + data_path: str + The path to the data directory containing the token files. + save_name: str, optional + The base name of the tokens files (default: "tokens"). + """ + + def __init__( + self, data_path, save_name="tokens", + ): + self.data_path = pl.Path(data_path) + if not self.data_path.exists(): + raise ValueError( + f"Data folder not found: {self.data_path.as_posix()}" + ) + self.tokens = self._load(data_path, save_name) + + def tokens_by_uttid(self, utt_id, num_codebooks=None): + """ + Retrieves the tokens corresponding to a given utterance ID. + + Arguments + --------- + utt_id : str + The utterance ID to retrieve tokens for. + num_codebooks : int or list, optional + The number of codebooks to retrieve from the tokens. If specified as an int, the tokens + will be truncated to include only the first `num_codebooks` codebooks. If specified as a list, + the tokens will include only the codebooks at the specified indices. If not specified, all codebooks are returned. + + Returns + ------- + result : torch.LongTensor [T, N_Q] + The tokens associated with the utterance ID, possibly truncated to `num_codebooks` codebooks. 
+ + Raises + ------ + KeyError + If the utterance ID is not found in the tokens. + ValueError + If `num_codebooks` is invalid or exceeds the number of available codebooks. + """ + if utt_id not in self.tokens: + raise KeyError(f"Utterance ID '{utt_id}' not found in tokens.") + tokens_path = self.tokens[utt_id] + tokens = kaldiio.load_mat(tokens_path) + tokens = torch.from_numpy(tokens).long() + + if num_codebooks is not None: + if isinstance(num_codebooks, int): + if num_codebooks <= 0: + raise ValueError( + f"Invalid num_codebooks value: {num_codebooks}. It must be a positive integer." + ) + if num_codebooks > tokens.size(-1): + raise ValueError( + f"Invalid number of codebooks: {num_codebooks}. " + f"Available codebooks: {tokens.size(-1)}." + ) + tokens = tokens[:, :num_codebooks] + elif isinstance(num_codebooks, list): + if not all( + isinstance(idx, int) and 0 <= idx < tokens.size(-1) + for idx in num_codebooks + ): + raise ValueError( + f"Invalid indices in num_codebooks list: {num_codebooks}. " + f"All indices must be integers within the range [0, {tokens.size(-1) - 1}]." + ) + tokens = tokens[:, num_codebooks] + else: + raise ValueError("num_codebooks must be an int or a list.") + + return tokens + + def _load(self, data_path, save_name): + """ + Loads the mapping from utterance IDs to token file paths. + + Arguments + --------- + data_path: str + The path to the data directory containing the token files. + save_name: str + The base name of the tokens files. + + Returns + ------- + utt2toks: dict + A dictionary mapping utterance IDs to their corresponding token file paths. + """ + scp_path = f"{data_path}/{save_name}.scp" + with open(scp_path, "r") as f: + utt2toks = { + line.strip().split(None, 1)[0]: line.strip().split(None, 1)[1] + for line in f + if line.strip() + } + return utt2toks + + def load_pretrained_embeddings(self, data_path, save_name="embeddings"): + """ + Loads pretrained embeddings from a specified path. + + Arguments + --------- + data_path : str + The directory where the embeddings are saved. + save_name : str, optional + The name of the embeddings file (default: "embeddings"). + + Returns + ------- + embeddings : torch.Tensor + The loaded embeddings as a PyTorch tensor. + + Raises + ------ + FileNotFoundError + If the embeddings file does not exist at the specified path. + """ + data_path = pl.Path(data_path).absolute() + if not self.data_path.exists(): + raise ValueError(f"Data folder not found: {data_path.as_posix()}") + embeddings = np.load(data_path / f"{save_name}.npy") + embeddings = torch.from_numpy(embeddings) + return embeddings diff --git a/speechbrain b/speechbrain index e602161f4..f07cfc76b 160000 --- a/speechbrain +++ b/speechbrain @@ -1 +1 @@ -Subproject commit e602161f4d305e13a26fc71b7dbe4a4cfeaa8847 +Subproject commit f07cfc76bd4b864c598a9ed5948caa3fe3176516
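# Illustrative sketch of how the new tokenizer_interface and tokens utilities
# fit together: wrap a codec behind the unified interface, dump tokens for a
# small manifest, then read them back. It assumes benchmarks/DASB/utils (and
# its model/ dependency) is on sys.path; the file paths and the two-utterance
# manifest are made up for the example.
import torch
from tokenizer_interface import EncodecTokenizer
from tokens import TokensExtractor, TokensLoader

tokenizer = EncodecTokenizer("facebook/encodec_24khz", "savedir")

extractor = TokensExtractor(
    tokenizer=tokenizer,
    sample_rate=24000,  # must match tokenizer.sample_rate
    src_key="wav",
    id_key="id",
    dataloader_opts={"batch_size": 2},
)
manifest = {
    "utt1": {"wav": "data/utt1.wav"},
    "utt2": {"wav": "data/utt2.wav"},
}
# Writes exp/tokens/tokens.ark and exp/tokens/tokens.scp via kaldiio
extractor.extract_tokens(manifest, num_codebooks=2, save_path="exp/tokens")
# Optionally dump the codec's codebook embeddings for model initialization
extractor.save_pretrained_embeddings("exp/tokens", save_name="embeddings")

loader = TokensLoader("exp/tokens")
tokens = loader.tokens_by_uttid("utt1", num_codebooks=2)  # LongTensor [T, 2]
emb = loader.load_pretrained_embeddings("exp/tokens", save_name="embeddings")
print(tokens.shape, emb.shape)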