Skip to content

Commit b703a63

Browse files
committed
Update resampling tutorial (#2773)
Summary: * Refactor benchmark script * Rename the `time` variable to avoid a (potential) conflict with the `time` module * Fix `beta` parameter in benchmark (it was not used previously) * Use `timeit` module for benchmark * Add plot * Move the comment on the results to the end * Add link to an explanation of aliasing https://output.circle-artifacts.com/output/job/20b57d2f-3614-4161-a18e-e0c1a537739c/artifacts/0/docs/tutorials/audio_resampling_tutorial.html Pull Request resolved: #2773 Reviewed By: carolineechen Differential Revision: D40421337 Pulled By: mthrok fbshipit-source-id: b402f84d4517695daeca75fb84ad876ef9354b3a
1 parent fc6090e commit b703a63

File tree

1 file changed

+184
-123
lines changed

1 file changed

+184
-123
lines changed

examples/tutorials/audio_resampling_tutorial.py

Lines changed: 184 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,14 @@
2222
#
2323
# First, we import the modules and define the helper functions.
2424
#
25-
# .. note::
26-
# When running this tutorial in Google Colab, install the required packages
27-
# with the following.
28-
#
29-
# .. code::
30-
#
31-
# !pip install librosa
3225

3326
import math
34-
import time
27+
import timeit
3528

3629
import librosa
30+
import resampy
3731
import matplotlib.pyplot as plt
32+
import matplotlib.colors as mcolors
3833
import pandas as pd
3934
from IPython.display import Audio, display
4035

@@ -63,18 +58,18 @@ def _get_inverse_log_freq(freq, sample_rate, offset):
6358
def _get_freq_ticks(sample_rate, offset, f_max):
6459
# Given the original sample rate used for generating the sweep,
6560
# find the x-axis value where the log-scale major frequency values fall in
66-
time, freq = [], []
61+
times, freq = [], []
6762
for exp in range(2, 5):
6863
for v in range(1, 10):
6964
f = v * 10**exp
7065
if f < sample_rate // 2:
7166
t = _get_inverse_log_freq(f, sample_rate, offset) / sample_rate
72-
time.append(t)
67+
times.append(t)
7368
freq.append(f)
7469
t_max = _get_inverse_log_freq(f_max, sample_rate, offset) / sample_rate
75-
time.append(t_max)
70+
times.append(t_max)
7671
freq.append(f_max)
77-
return time, freq
72+
return times, freq
7873

7974

8075
def get_sine_sweep(sample_rate, offset=DEFAULT_OFFSET):
@@ -118,7 +113,7 @@ def plot_sweep(
118113
# -------------------
119114
#
120115
# To resample an audio waveform from one frequency to another, you can use
121-
# :py:func:`torchaudio.transforms.Resample` or
116+
# :py:class:`torchaudio.transforms.Resample` or
122117
# :py:func:`torchaudio.functional.resample`.
123118
# ``transforms.Resample`` precomputes and caches the kernel used for resampling,
124119
# while ``functional.resample`` computes it on the fly, so using
@@ -163,6 +158,9 @@ def plot_sweep(
163158
#
164159
# We see that in the spectrogram of the resampled waveform, there is an
165160
# artifact, which was not present in the original waveform.
161+
# This effect is called aliasing.
162+
# `This page <https://music.arts.uci.edu/dobrian/digitalaudio.htm>`__ has
163+
# an explanation of how it happens, and why it looks like a reflection.
166164

167165
resample_rate = 32000
168166
resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype)
@@ -332,147 +330,210 @@ def plot_sweep(
332330
# ``kaiser_best`` and ``kaiser_fast`` using their corresponding parameters
333331
# in ``torchaudio``.
334332
#
335-
# To elaborate on the results:
336-
#
337-
# - a larger ``lowpass_filter_width`` results in a larger resampling kernel,
338-
# and therefore increases computation time for both the kernel computation
339-
# and convolution
340-
# - using ``kaiser_window`` results in longer computation times than the default
341-
# ``sinc_interpolation`` because it is more complex to compute the intermediate
342-
# window values - a large GCD between the sample and resample rate will result
343-
# in a simplification that allows for a smaller kernel and faster kernel computation.
333+
334+
print(f"torchaudio: {torchaudio.__version__}")
335+
print(f"librosa: {librosa.__version__}")
336+
print(f"resampy: {resampy.__version__}")
337+
338+
######################################################################
344339
#
345340

341+
def benchmark_resample_functional(
342+
waveform,
343+
sample_rate,
344+
resample_rate,
345+
lowpass_filter_width=6,
346+
rolloff=0.99,
347+
resampling_method="sinc_interpolation",
348+
beta=None,
349+
iters=5,
350+
):
351+
return timeit.timeit(
352+
stmt='''
353+
torchaudio.functional.resample(
354+
waveform,
355+
sample_rate,
356+
resample_rate,
357+
lowpass_filter_width=lowpass_filter_width,
358+
rolloff=rolloff,
359+
resampling_method=resampling_method,
360+
beta=beta,
361+
)
362+
''',
363+
setup='import torchaudio',
364+
number=iters,
365+
globals=locals(),
366+
) * 1000 / iters
367+
368+
369+
######################################################################
370+
#
346371

347-
def benchmark_resample(
348-
method,
372+
def benchmark_resample_transforms(
349373
waveform,
350374
sample_rate,
351375
resample_rate,
352376
lowpass_filter_width=6,
353377
rolloff=0.99,
354378
resampling_method="sinc_interpolation",
355379
beta=None,
356-
librosa_type=None,
357380
iters=5,
358381
):
359-
if method == "functional":
360-
begin = time.monotonic()
361-
for _ in range(iters):
362-
F.resample(
363-
waveform,
364-
sample_rate,
365-
resample_rate,
366-
lowpass_filter_width=lowpass_filter_width,
367-
rolloff=rolloff,
368-
resampling_method=resampling_method,
369-
)
370-
elapsed = time.monotonic() - begin
371-
return elapsed / iters
372-
elif method == "transforms":
373-
resampler = T.Resample(
374-
sample_rate,
375-
resample_rate,
376-
lowpass_filter_width=lowpass_filter_width,
377-
rolloff=rolloff,
378-
resampling_method=resampling_method,
379-
dtype=waveform.dtype,
380-
)
381-
begin = time.monotonic()
382-
for _ in range(iters):
383-
resampler(waveform)
384-
elapsed = time.monotonic() - begin
385-
return elapsed / iters
386-
elif method == "librosa":
387-
waveform_np = waveform.squeeze().numpy()
388-
begin = time.monotonic()
389-
for _ in range(iters):
390-
librosa.resample(waveform_np, orig_sr=sample_rate, target_sr=resample_rate, res_type=librosa_type)
391-
elapsed = time.monotonic() - begin
392-
return elapsed / iters
382+
return timeit.timeit(
383+
stmt='resampler(waveform)',
384+
setup='''
385+
import torchaudio
386+
387+
resampler = torchaudio.transforms.Resample(
388+
sample_rate,
389+
resample_rate,
390+
lowpass_filter_width=lowpass_filter_width,
391+
rolloff=rolloff,
392+
resampling_method=resampling_method,
393+
dtype=waveform.dtype,
394+
beta=beta,
395+
)
396+
resampler.to(waveform.device)
397+
''',
398+
number=iters,
399+
globals=locals(),
400+
) * 1000 / iters
393401

394402

395403
######################################################################
396404
#
397405

398-
configs = {
399-
"downsample (48 -> 44.1 kHz)": [48000, 44100],
400-
"downsample (16 -> 8 kHz)": [16000, 8000],
401-
"upsample (44.1 -> 48 kHz)": [44100, 48000],
402-
"upsample (8 -> 16 kHz)": [8000, 16000],
403-
}
406+
def benchmark_resample_librosa(
407+
waveform,
408+
sample_rate,
409+
resample_rate,
410+
res_type=None,
411+
iters=5,
412+
):
413+
waveform_np = waveform.squeeze().numpy()
414+
return timeit.timeit(
415+
stmt='''
416+
librosa.resample(
417+
waveform_np,
418+
orig_sr=sample_rate,
419+
target_sr=resample_rate,
420+
res_type=res_type,
421+
)
422+
''',
423+
setup='import librosa',
424+
number=iters,
425+
globals=locals(),
426+
) * 1000 / iters
427+
428+
429+
######################################################################
430+
#
404431

405-
for label in configs:
432+
def benchmark(sample_rate, resample_rate):
406433
times, rows = [], []
407-
sample_rate = configs[label][0]
408-
resample_rate = configs[label][1]
409-
waveform = get_sine_sweep(sample_rate)
434+
waveform = get_sine_sweep(sample_rate).to(torch.float32)
435+
436+
args = (waveform, sample_rate, resample_rate)
410437

411438
# sinc 64 zero-crossings
412-
f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=64)
413-
t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=64)
414-
times.append([None, 1000 * f_time, 1000 * t_time])
439+
f_time = benchmark_resample_functional(*args, lowpass_filter_width=64)
440+
t_time = benchmark_resample_transforms(*args, lowpass_filter_width=64)
441+
times.append([None, f_time, t_time])
415442
rows.append("sinc (width 64)")
416443

417444
# sinc 16 zero-crossings
418-
f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=16)
419-
t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=16)
420-
times.append([None, 1000 * f_time, 1000 * t_time])
445+
f_time = benchmark_resample_functional(*args, lowpass_filter_width=16)
446+
t_time = benchmark_resample_transforms(*args, lowpass_filter_width=16)
447+
times.append([None, f_time, t_time])
421448
rows.append("sinc (width 16)")
422449

423450
# kaiser best
424-
lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_best")
425-
f_time = benchmark_resample(
426-
"functional",
427-
waveform,
428-
sample_rate,
429-
resample_rate,
430-
lowpass_filter_width=64,
431-
rolloff=0.9475937167399596,
432-
resampling_method="kaiser_window",
433-
beta=14.769656459379492,
434-
)
435-
t_time = benchmark_resample(
436-
"transforms",
437-
waveform,
438-
sample_rate,
439-
resample_rate,
440-
lowpass_filter_width=64,
441-
rolloff=0.9475937167399596,
442-
resampling_method="kaiser_window",
443-
beta=14.769656459379492,
444-
)
445-
times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time])
451+
kwargs = {
452+
"lowpass_filter_width": 64,
453+
"rolloff": 0.9475937167399596,
454+
"resampling_method": "kaiser_window",
455+
"beta": 14.769656459379492,
456+
}
457+
lib_time = benchmark_resample_librosa(*args, res_type="kaiser_best")
458+
f_time = benchmark_resample_functional(*args, **kwargs)
459+
t_time = benchmark_resample_transforms(*args, **kwargs)
460+
times.append([lib_time, f_time, t_time])
446461
rows.append("kaiser_best")
447462

448463
# kaiser fast
449-
lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_fast")
450-
f_time = benchmark_resample(
451-
"functional",
452-
waveform,
453-
sample_rate,
454-
resample_rate,
455-
lowpass_filter_width=16,
456-
rolloff=0.85,
457-
resampling_method="kaiser_window",
458-
beta=8.555504641634386,
459-
)
460-
t_time = benchmark_resample(
461-
"transforms",
462-
waveform,
463-
sample_rate,
464-
resample_rate,
465-
lowpass_filter_width=16,
466-
rolloff=0.85,
467-
resampling_method="kaiser_window",
468-
beta=8.555504641634386,
469-
)
470-
times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time])
464+
kwargs = {
465+
"lowpass_filter_width": 16,
466+
"rolloff": 0.85,
467+
"resampling_method": "kaiser_window",
468+
"beta": 8.555504641634386,
469+
}
470+
lib_time = benchmark_resample_librosa(*args, res_type="kaiser_fast")
471+
f_time = benchmark_resample_functional(*args, **kwargs)
472+
t_time = benchmark_resample_transforms(*args, **kwargs)
473+
times.append([lib_time, f_time, t_time])
471474
rows.append("kaiser_fast")
472475

473476
df = pd.DataFrame(times, columns=["librosa", "functional", "transforms"], index=rows)
474-
df.columns = pd.MultiIndex.from_product([[f"{label} time (ms)"], df.columns])
477+
return df
478+
479+
480+
######################################################################
481+
#
482+
def plot(df):
483+
print(df.round(2))
484+
ax = df.plot(kind="bar")
485+
plt.ylabel("Time Elapsed [ms]")
486+
plt.xticks(rotation = 0, fontsize=10)
487+
for cont, col, color in zip(ax.containers, df.columns, mcolors.TABLEAU_COLORS):
488+
label = ["N/A" if v != v else str(v) for v in df[col].round(2)]
489+
ax.bar_label(cont, labels=label, color=color, fontweight="bold", fontsize="x-small")
490+
491+
492+
######################################################################
493+
#
494+
# Downsample (48 -> 44.1 kHz)
495+
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
496+
497+
df = benchmark(48_000, 44_100)
498+
plot(df)
499+
500+
######################################################################
501+
#
502+
# Downsample (16 -> 8 kHz)
503+
# ~~~~~~~~~~~~~~~~~~~~~~~~
504+
505+
df = benchmark(16_000, 8_000)
506+
plot(df)
507+
508+
######################################################################
509+
#
510+
# Upsample (44.1 -> 48 kHz)
511+
# ~~~~~~~~~~~~~~~~~~~~~~~~~
512+
513+
df = benchmark(44_100, 48_000)
514+
plot(df)
515+
516+
######################################################################
517+
#
518+
# Upsample (8 -> 16 kHz)
519+
# ~~~~~~~~~~~~~~~~~~~~~~
520+
521+
df = benchmark(8_000, 16_000)
522+
plot(df)
475523

476-
print(f"torchaudio: {torchaudio.__version__}")
477-
print(f"librosa: {librosa.__version__}")
478-
display(df.round(2))
524+
######################################################################
525+
#
526+
# Summary
527+
# ~~~~~~~
528+
#
529+
# To elaborate on the results:
530+
#
531+
# - a larger ``lowpass_filter_width`` results in a larger resampling kernel,
532+
# and therefore increases computation time for both the kernel computation
533+
# and convolution
534+
# - using ``kaiser_window`` results in longer computation times than the default
535+
# ``sinc_interpolation`` because it is more complex to compute the intermediate
536+
# window values
537+
# - a large GCD between the sample and resample rate will result
538+
# in a simplification that allows for a smaller kernel and faster kernel computation.
539+
#

0 commit comments

Comments
 (0)