Skip to content

Commit b2f3eaf

Browse files
feat: add librispeech dataset, add text info option to commonvoice
1 parent c258266 commit b2f3eaf

File tree

6 files changed

+94
-10
lines changed

6 files changed

+94
-10
lines changed

README.md

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,38 @@ dataset[0] # (1, 158621)
4444
dataset[1] # (1, 153757)
4545
```
4646

47+
#### Full API:
48+
```py
49+
LJSpeechDataset(
50+
root: str = "./data", # The root where the dataset will be downloaded
51+
transforms: Optional[Callable] = None, # Transforms to apply to audio files
52+
)
53+
```
54+
55+
### LibriSpeech Dataset
56+
Wrapper for the [LibriSpeech](https://www.openslr.org/12) dataset (EN only); it loads the `train.100` split of the `clean` configuration. Requires `pip install datasets`. Note that this dataset requires several GBs of storage.
57+
58+
```py
59+
from audio_data_pytorch import LibriSpeechDataset
60+
61+
dataset = LibriSpeechDataset(
62+
root="./data",
63+
)
64+
65+
dataset[0] # (1, 222336)
66+
```
67+
68+
#### Full API:
69+
```py
70+
LibriSpeechDataset(
71+
root: str = "./data", # The root where the dataset will be downloaded
72+
with_info: bool = False, # Whether to also return an info dict (keys: sample_rate, text, speaker_id)
73+
transforms: Optional[Callable] = None, # Transforms to apply to audio files
74+
)
75+
```
76+
4777
### Common Voice Dataset
48-
Multilanguage wrapper for the [Common Voice](https://commonvoice.mozilla.org/) dataset with voice-only data. Requires `pip install datasets`. Note that each language requires several GBs of storage, and that you have to confirm access for each distinct version you use e.g. [here](https://huggingface.co/datasets/mozilla-foundation/common_voice_10_0), to validate your Huggingface access token. You can provide a list of `languages` and to avoid an unbalanced dataset the values will be interleaved by downsampling the majority language to have the same number of samples as the minority language.
78+
Multilanguage wrapper for the [Common Voice](https://commonvoice.mozilla.org/) dataset. Requires `pip install datasets`. Note that each language requires several GBs of storage, and that you have to confirm access for each distinct version you use (e.g. [here](https://huggingface.co/datasets/mozilla-foundation/common_voice_10_0)) to validate your Hugging Face access token. You can provide a list of `languages`; to avoid an unbalanced dataset, the values will be interleaved by downsampling the majority language to have the same number of samples as the minority language.
4979

5080
```py
5181
from audio_data_pytorch import CommonVoiceDataset
@@ -66,7 +96,7 @@ CommonVoiceDataset(
6696
sub_version: int = 0, # Subversion: common_voice_{version}_{sub_version}
6797
root: str = "./data", # The root where the dataset will be downloaded
6898
languages: Sequence[str] = ['en'], # List of languages to include in the dataset
69-
with_sample_rate: bool = False, # Returns sample rate as second argument
99+
with_info: bool = False, # Whether to also return an info dict (keys: sample_rate, text, age, accent, gender, locale)
70100
transforms: Optional[Callable] = None, # Transforms to apply to audio files
71101
)
72102
```
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .common_voice_dataset import CommonVoiceDataset
2-
from .ljspeech_dataset import LJSpeechDataset
2+
from .libri_speech_dataset import LibriSpeechDataset
3+
from .lj_speech_dataset import LJSpeechDataset
34
from .wav_dataset import WAVDataset
45
from .youtube_dataset import YoutubeDataset

audio_data_pytorch/datasets/common_voice_dataset.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import os
2-
from typing import Callable, Optional, Sequence, Tuple, Union
2+
from typing import Callable, Dict, Optional, Sequence, Tuple, Union
33

44
import torch
55
from torch import Tensor
@@ -14,10 +14,10 @@ def __init__(
1414
sub_version: int = 0,
1515
root: str = "./data",
1616
languages: Sequence[str] = ["en"],
17-
with_sample_rate: bool = False,
17+
with_info: bool = False,
1818
transforms: Optional[Callable] = None,
1919
):
20-
self.with_sample_rate = with_sample_rate
20+
self.with_info = with_info
2121
self.transforms = transforms
2222

2323
from datasets import interleave_datasets, load_dataset
@@ -37,15 +37,24 @@ def __init__(
3737

3838
def __getitem__(
3939
self, idx: Union[Tensor, int]
40-
) -> Union[Tensor, Tuple[Tensor, Tensor]]:
40+
) -> Union[Tensor, Tuple[Tensor, Dict]]:
4141
idx = idx.tolist() if torch.is_tensor(idx) else idx # type: ignore
4242
data = self.dataset[idx]
43+
4344
waveform = torch.tensor(data["audio"]["array"]).view(1, -1)
44-
sample_rate = data["audio"]["sampling_rate"]
45+
46+
info = dict(
47+
sample_rate=data["audio"]["sampling_rate"],
48+
text=data["sentence"],
49+
age=data["age"],
50+
accent=data["accent"],
51+
gender=data["gender"],
52+
locale=data["locale"],
53+
)
4554

4655
if self.transforms:
4756
waveform = self.transforms(waveform)
48-
return (waveform, sample_rate) if self.with_sample_rate else waveform
57+
return (waveform, info) if self.with_info else waveform
4958

5059
def __len__(self) -> int:
5160
return len(self.dataset)
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import os
2+
from typing import Callable, Dict, Optional, Tuple, Union
3+
4+
import torch
5+
from torch import Tensor
6+
from torch.utils.data import Dataset
7+
8+
9+
class LibriSpeechDataset(Dataset):
    """LibriSpeech speech dataset loaded through Hugging Face ``datasets``.

    Indexing returns the audio of one example as a tensor viewed as
    ``(1, num_samples)``. When ``with_info`` is true, indexing returns a
    ``(waveform, info)`` tuple instead, where ``info`` is a dict with the
    keys ``sample_rate``, ``text`` and ``speaker_id``.

    Args:
        root: Directory under which the data is cached (inside a
            ``librispeech_dataset`` subdirectory).
        with_info: Whether to return ``(waveform, info)`` rather than only
            the waveform.
        transforms: Optional callable applied to the waveform tensor.
        config: Configuration name forwarded to ``load_dataset``.
        split: Split name forwarded to ``load_dataset``.
    """

    def __init__(
        self,
        root: str = "./data",
        with_info: bool = False,
        transforms: Optional[Callable] = None,
        *,
        config: str = "clean",
        split: str = "train.100",
    ):
        self.with_info = with_info
        self.transforms = transforms

        # Imported here rather than at module level, so `datasets` is only
        # needed once this class is actually instantiated.
        from datasets import load_dataset

        self.dataset = load_dataset(
            "librispeech_asr",
            config,
            split=split,
            cache_dir=os.path.join(root, "librispeech_dataset"),
        )

    def __getitem__(
        self, idx: Union[Tensor, int]
    ) -> Union[Tensor, Tuple[Tensor, Dict]]:
        """Return the waveform at ``idx``, plus its info dict if requested."""
        # Tensor indices are converted to plain Python values before lookup.
        idx = idx.tolist() if torch.is_tensor(idx) else idx  # type: ignore
        data = self.dataset[idx]

        waveform = torch.tensor(data["audio"]["array"]).view(1, -1)
        if self.transforms is not None:
            waveform = self.transforms(waveform)

        if not self.with_info:
            return waveform

        # Only assemble the metadata when the caller asked for it.
        info = dict(
            sample_rate=data["audio"]["sampling_rate"],
            text=data["text"],
            speaker_id=data["speaker_id"],
        )
        return waveform, info

    def __len__(self) -> int:
        """Return the number of examples in the loaded split."""
        return len(self.dataset)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
setup(
44
name="audio-data-pytorch",
55
packages=find_packages(exclude=[]),
6-
version="0.0.10",
6+
version="0.0.11",
77
license="MIT",
88
description="Audio Data - PyTorch",
99
long_description_content_type="text/markdown",

0 commit comments

Comments
 (0)