Update resampling tutorial (#2773)

mthrok · mthrok · commit b703a6319777 · 2022-10-17T17:29:14.000+09:00
Summary: * Refactor benchmark script * Rename `time` variable to avoid (potential) conflicting with time module * Fix `beta` parameter in benchmark (it was not used previously) * Use `timeit` module for benchmark * Add plot * Move the comment on result at the end * Add link to an explanation of aliasing https://output.circle-artifacts.com/output/job/20b57d2f-3614-4161-a18e-e0c1a537739c/artifacts/0/docs/tutorials/audio_resampling_tutorial.html Pull Request resolved: #2773 Reviewed By: carolineechen Differential Revision: D40421337 Pulled By: mthrok fbshipit-source-id: b402f84d4517695daeca75fb84ad876ef9354b3a
diff --git a/examples/tutorials/audio_resampling_tutorial.py b/examples/tutorials/audio_resampling_tutorial.py
@@ -22,19 +22,14 @@
 #
 # First, we import the modules and define the helper functions.
 #
-# .. note::
-#    When running this tutorial in Google Colab, install the required packages
-#    with the following.
-#
-#    .. code::
-#
-#       !pip install librosa
 
 import math
-import time
+import timeit
 
 import librosa
+import resampy
 import matplotlib.pyplot as plt
+import matplotlib.colors as mcolors
 import pandas as pd
 from IPython.display import Audio, display
 
@@ -63,18 +58,18 @@ def _get_inverse_log_freq(freq, sample_rate, offset):
 def _get_freq_ticks(sample_rate, offset, f_max):
     # Given the original sample rate used for generating the sweep,
     # find the x-axis value where the log-scale major frequency values fall in
-    time, freq = [], []
+    times, freq = [], []
     for exp in range(2, 5):
         for v in range(1, 10):
             f = v * 10**exp
             if f < sample_rate // 2:
                 t = _get_inverse_log_freq(f, sample_rate, offset) / sample_rate
-                time.append(t)
+                times.append(t)
                 freq.append(f)
     t_max = _get_inverse_log_freq(f_max, sample_rate, offset) / sample_rate
-    time.append(t_max)
+    times.append(t_max)
     freq.append(f_max)
-    return time, freq
+    return times, freq
 
 
 def get_sine_sweep(sample_rate, offset=DEFAULT_OFFSET):
@@ -118,7 +113,7 @@ def plot_sweep(
 # -------------------
 #
 # To resample an audio waveform from one freqeuncy to another, you can use
-# :py:func:`torchaudio.transforms.Resample` or
+# :py:class:`torchaudio.transforms.Resample` or
 # :py:func:`torchaudio.functional.resample`.
 # ``transforms.Resample`` precomputes and caches the kernel used for resampling,
 # while ``functional.resample`` computes it on the fly, so using
@@ -163,6 +158,9 @@ def plot_sweep(
 #
 # We see that in the spectrogram of the resampled waveform, there is an
 # artifact, which was not present in the original waveform.
+# This effect is called aliasing.
+# `This page <https://music.arts.uci.edu/dobrian/digitalaudio.htm>`__ has
+# an explanation of how it happens, and why it looks like a reflection.
 
 resample_rate = 32000
 resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype)
@@ -332,147 +330,210 @@ def plot_sweep(
 # ``kaiser_best`` and ``kaiser_fast`` using their corresponding parameters
 # in ``torchaudio``.
 #
-# To elaborate on the results:
-#
-# - a larger ``lowpass_filter_width`` results in a larger resampling kernel,
-#   and therefore increases computation time for both the kernel computation
-#   and convolution
-# - using ``kaiser_window`` results in longer computation times than the default
-#   ``sinc_interpolation`` because it is more complex to compute the intermediate
-#   window values - a large GCD between the sample and resample rate will result
-#   in a simplification that allows for a smaller kernel and faster kernel computation.
+
+print(f"torchaudio: {torchaudio.__version__}")
+print(f"librosa: {librosa.__version__}")
+print(f"resampy: {resampy.__version__}")
+
+######################################################################
 #
 
+def benchmark_resample_functional(
+    waveform,
+    sample_rate,
+    resample_rate,
+    lowpass_filter_width=6,
+    rolloff=0.99,
+    resampling_method="sinc_interpolation",
+    beta=None,
+    iters=5,
+):
+    return timeit.timeit(
+        stmt='''
+torchaudio.functional.resample(
+    waveform,
+    sample_rate,
+    resample_rate,
+    lowpass_filter_width=lowpass_filter_width,
+    rolloff=rolloff,
+    resampling_method=resampling_method,
+    beta=beta,
+)
+        ''',
+        setup='import torchaudio',
+        number=iters,
+        globals=locals(),
+    ) * 1000 / iters
+
+
+######################################################################
+#
 
-def benchmark_resample(
-    method,
+def benchmark_resample_transforms(
     waveform,
     sample_rate,
     resample_rate,
     lowpass_filter_width=6,
     rolloff=0.99,
     resampling_method="sinc_interpolation",
     beta=None,
-    librosa_type=None,
     iters=5,
 ):
-    if method == "functional":
-        begin = time.monotonic()
-        for _ in range(iters):
-            F.resample(
-                waveform,
-                sample_rate,
-                resample_rate,
-                lowpass_filter_width=lowpass_filter_width,
-                rolloff=rolloff,
-                resampling_method=resampling_method,
-            )
-        elapsed = time.monotonic() - begin
-        return elapsed / iters
-    elif method == "transforms":
-        resampler = T.Resample(
-            sample_rate,
-            resample_rate,
-            lowpass_filter_width=lowpass_filter_width,
-            rolloff=rolloff,
-            resampling_method=resampling_method,
-            dtype=waveform.dtype,
-        )
-        begin = time.monotonic()
-        for _ in range(iters):
-            resampler(waveform)
-        elapsed = time.monotonic() - begin
-        return elapsed / iters
-    elif method == "librosa":
-        waveform_np = waveform.squeeze().numpy()
-        begin = time.monotonic()
-        for _ in range(iters):
-            librosa.resample(waveform_np, orig_sr=sample_rate, target_sr=resample_rate, res_type=librosa_type)
-        elapsed = time.monotonic() - begin
-        return elapsed / iters
+    return timeit.timeit(
+        stmt='resampler(waveform)',
+        setup='''
+import torchaudio
+
+resampler = torchaudio.transforms.Resample(
+    sample_rate,
+    resample_rate,
+    lowpass_filter_width=lowpass_filter_width,
+    rolloff=rolloff,
+    resampling_method=resampling_method,
+    dtype=waveform.dtype,
+    beta=beta,
+)
+resampler.to(waveform.device)
+        ''',
+        number=iters,
+        globals=locals(),
+    ) * 1000 / iters
 
 
 ######################################################################
 #
 
-configs = {
-    "downsample (48 -> 44.1 kHz)": [48000, 44100],
-    "downsample (16 -> 8 kHz)": [16000, 8000],
-    "upsample (44.1 -> 48 kHz)": [44100, 48000],
-    "upsample (8 -> 16 kHz)": [8000, 16000],
-}
+def benchmark_resample_librosa(
+    waveform,
+    sample_rate,
+    resample_rate,
+    res_type=None,
+    iters=5,
+):
+    waveform_np = waveform.squeeze().numpy()
+    return timeit.timeit(
+        stmt='''
+librosa.resample(
+    waveform_np,
+    orig_sr=sample_rate,
+    target_sr=resample_rate,
+    res_type=res_type,
+)
+        ''',
+        setup='import librosa',
+        number=iters,
+        globals=locals(),
+    ) * 1000 / iters
+
+
+######################################################################
+#
 
-for label in configs:
+def benchmark(sample_rate, resample_rate):
     times, rows = [], []
-    sample_rate = configs[label][0]
-    resample_rate = configs[label][1]
-    waveform = get_sine_sweep(sample_rate)
+    waveform = get_sine_sweep(sample_rate).to(torch.float32)
+
+    args = (waveform, sample_rate, resample_rate)
 
     # sinc 64 zero-crossings
-    f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=64)
-    t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=64)
-    times.append([None, 1000 * f_time, 1000 * t_time])
+    f_time = benchmark_resample_functional(*args, lowpass_filter_width=64)
+    t_time = benchmark_resample_transforms(*args, lowpass_filter_width=64)
+    times.append([None, f_time, t_time])
     rows.append("sinc (width 64)")
 
     # sinc 6 zero-crossings
-    f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=16)
-    t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=16)
-    times.append([None, 1000 * f_time, 1000 * t_time])
+    f_time = benchmark_resample_functional(*args, lowpass_filter_width=16)
+    t_time = benchmark_resample_transforms(*args, lowpass_filter_width=16)
+    times.append([None, f_time, t_time])
     rows.append("sinc (width 16)")
 
     # kaiser best
-    lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_best")
-    f_time = benchmark_resample(
-        "functional",
-        waveform,
-        sample_rate,
-        resample_rate,
-        lowpass_filter_width=64,
-        rolloff=0.9475937167399596,
-        resampling_method="kaiser_window",
-        beta=14.769656459379492,
-    )
-    t_time = benchmark_resample(
-        "transforms",
-        waveform,
-        sample_rate,
-        resample_rate,
-        lowpass_filter_width=64,
-        rolloff=0.9475937167399596,
-        resampling_method="kaiser_window",
-        beta=14.769656459379492,
-    )
-    times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time])
+    kwargs = {
+        "lowpass_filter_width": 64,
+        "rolloff": 0.9475937167399596,
+        "resampling_method": "kaiser_window",
+        "beta": 14.769656459379492,
+    }
+    lib_time = benchmark_resample_librosa(*args, res_type="kaiser_best")
+    f_time = benchmark_resample_functional(*args, **kwargs)
+    t_time = benchmark_resample_transforms(*args, **kwargs)
+    times.append([lib_time, f_time, t_time])
     rows.append("kaiser_best")
 
     # kaiser fast
-    lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_fast")
-    f_time = benchmark_resample(
-        "functional",
-        waveform,
-        sample_rate,
-        resample_rate,
-        lowpass_filter_width=16,
-        rolloff=0.85,
-        resampling_method="kaiser_window",
-        beta=8.555504641634386,
-    )
-    t_time = benchmark_resample(
-        "transforms",
-        waveform,
-        sample_rate,
-        resample_rate,
-        lowpass_filter_width=16,
-        rolloff=0.85,
-        resampling_method="kaiser_window",
-        beta=8.555504641634386,
-    )
-    times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time])
+    kwargs = {
+        "lowpass_filter_width": 16,
+        "rolloff": 0.85,
+        "resampling_method": "kaiser_window",
+        "beta": 8.555504641634386,
+    }
+    lib_time = benchmark_resample_librosa(*args, res_type="kaiser_fast")
+    f_time = benchmark_resample_functional(*args, **kwargs)
+    t_time = benchmark_resample_transforms(*args, **kwargs)
+    times.append([lib_time, f_time, t_time])
     rows.append("kaiser_fast")
 
     df = pd.DataFrame(times, columns=["librosa", "functional", "transforms"], index=rows)
-    df.columns = pd.MultiIndex.from_product([[f"{label} time (ms)"], df.columns])
+    return df
+
+
+######################################################################
+#
+def plot(df):
+    print(df.round(2))
+    ax = df.plot(kind="bar")
+    plt.ylabel("Time Elapsed [ms]")
+    plt.xticks(rotation = 0, fontsize=10)
+    for cont, col, color in zip(ax.containers, df.columns, mcolors.TABLEAU_COLORS):
+        label = ["N/A" if v != v else str(v) for v in df[col].round(2)]
+        ax.bar_label(cont, labels=label, color=color, fontweight="bold", fontsize="x-small")
+
+
+######################################################################
+#
+# Downsample (48 -> 44.1 kHz)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+df = benchmark(48_000, 44_100)
+plot(df)
+
+######################################################################
+#
+# Downsample (16 -> 8 kHz)
+# ~~~~~~~~~~~~~~~~~~~~~~~~
+
+df = benchmark(16_000, 8_000)
+plot(df)
+
+######################################################################
+#
+# Upsample (44.1 -> 48 kHz)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+
+df = benchmark(44_100, 48_000)
+plot(df)
+
+######################################################################
+#
+# Upsample (8 -> 16 kHz)
+# ~~~~~~~~~~~~~~~~~~~~~~
+
+df = benchmark(8_000, 16_000)
+plot(df)
 
-    print(f"torchaudio: {torchaudio.__version__}")
-    print(f"librosa: {librosa.__version__}")
-    display(df.round(2))
+######################################################################
+#
+# Summary
+# ~~~~~~~
+#
+# To elaborate on the results:
+#
+# - a larger ``lowpass_filter_width`` results in a larger resampling kernel,
+#   and therefore increases computation time for both the kernel computation
+#   and convolution
+# - using ``kaiser_window`` results in longer computation times than the default
+#   ``sinc_interpolation`` because it is more complex to compute the intermediate
+#   window values
+# - a large GCD between the sample and resample rate will result
+#   in a simplification that allows for a smaller kernel and faster kernel computation.
+#