|
22 | 22 | # |
23 | 23 | # First, we import the modules and define the helper functions. |
24 | 24 | # |
25 | | -# .. note:: |
26 | | -# When running this tutorial in Google Colab, install the required packages |
27 | | -# with the following. |
28 | | -# |
29 | | -# .. code:: |
30 | | -# |
31 | | -# !pip install librosa |
32 | 25 |
|
33 | 26 | import math |
34 | | -import time |
| 27 | +import timeit |
35 | 28 |
|
36 | 29 | import librosa |
| 30 | +import resampy |
37 | 31 | import matplotlib.pyplot as plt |
| 32 | +import matplotlib.colors as mcolors |
38 | 33 | import pandas as pd |
39 | 34 | from IPython.display import Audio, display |
40 | 35 |
|
@@ -63,18 +58,18 @@ def _get_inverse_log_freq(freq, sample_rate, offset): |
63 | 58 | def _get_freq_ticks(sample_rate, offset, f_max): |
64 | 59 | # Given the original sample rate used for generating the sweep, |
65 | 60 | # find the x-axis value where the log-scale major frequency values fall in |
66 | | - time, freq = [], [] |
| 61 | + times, freq = [], [] |
67 | 62 | for exp in range(2, 5): |
68 | 63 | for v in range(1, 10): |
69 | 64 | f = v * 10**exp |
70 | 65 | if f < sample_rate // 2: |
71 | 66 | t = _get_inverse_log_freq(f, sample_rate, offset) / sample_rate |
72 | | - time.append(t) |
| 67 | + times.append(t) |
73 | 68 | freq.append(f) |
74 | 69 | t_max = _get_inverse_log_freq(f_max, sample_rate, offset) / sample_rate |
75 | | - time.append(t_max) |
| 70 | + times.append(t_max) |
76 | 71 | freq.append(f_max) |
77 | | - return time, freq |
| 72 | + return times, freq |
78 | 73 |
|
79 | 74 |
|
80 | 75 | def get_sine_sweep(sample_rate, offset=DEFAULT_OFFSET): |
@@ -118,7 +113,7 @@ def plot_sweep( |
118 | 113 | # ------------------- |
119 | 114 | # |
120 | 115 | # To resample an audio waveform from one frequency to another, you can use |
121 | | -# :py:func:`torchaudio.transforms.Resample` or |
| 116 | +# :py:class:`torchaudio.transforms.Resample` or |
122 | 117 | # :py:func:`torchaudio.functional.resample`. |
123 | 118 | # ``transforms.Resample`` precomputes and caches the kernel used for resampling, |
124 | 119 | # while ``functional.resample`` computes it on the fly, so using |
@@ -163,6 +158,9 @@ def plot_sweep( |
163 | 158 | # |
164 | 159 | # We see that in the spectrogram of the resampled waveform, there is an |
165 | 160 | # artifact, which was not present in the original waveform. |
| 161 | +# This effect is called aliasing. |
| 162 | +# `This page <https://music.arts.uci.edu/dobrian/digitalaudio.htm>`__ has |
| 163 | +# an explanation of how it happens, and why it looks like a reflection. |
166 | 164 |
|
167 | 165 | resample_rate = 32000 |
168 | 166 | resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype) |
@@ -332,147 +330,210 @@ def plot_sweep( |
332 | 330 | # ``kaiser_best`` and ``kaiser_fast`` using their corresponding parameters |
333 | 331 | # in ``torchaudio``. |
334 | 332 | # |
335 | | -# To elaborate on the results: |
336 | | -# |
337 | | -# - a larger ``lowpass_filter_width`` results in a larger resampling kernel, |
338 | | -# and therefore increases computation time for both the kernel computation |
339 | | -# and convolution |
340 | | -# - using ``kaiser_window`` results in longer computation times than the default |
341 | | -# ``sinc_interpolation`` because it is more complex to compute the intermediate |
342 | | -# window values - a large GCD between the sample and resample rate will result |
343 | | -# in a simplification that allows for a smaller kernel and faster kernel computation. |
| 333 | + |
| 334 | +print(f"torchaudio: {torchaudio.__version__}") |
| 335 | +print(f"librosa: {librosa.__version__}") |
| 336 | +print(f"resampy: {resampy.__version__}") |
| 337 | + |
| 338 | +###################################################################### |
344 | 339 | # |
345 | 340 |
|
| 341 | +def benchmark_resample_functional( |
| 342 | + waveform, |
| 343 | + sample_rate, |
| 344 | + resample_rate, |
| 345 | + lowpass_filter_width=6, |
| 346 | + rolloff=0.99, |
| 347 | + resampling_method="sinc_interpolation", |
| 348 | + beta=None, |
| 349 | + iters=5, |
| 350 | +): |
| 351 | + return timeit.timeit( |
| 352 | + stmt=''' |
| 353 | +torchaudio.functional.resample( |
| 354 | + waveform, |
| 355 | + sample_rate, |
| 356 | + resample_rate, |
| 357 | + lowpass_filter_width=lowpass_filter_width, |
| 358 | + rolloff=rolloff, |
| 359 | + resampling_method=resampling_method, |
| 360 | + beta=beta, |
| 361 | +) |
| 362 | + ''', |
| 363 | + setup='import torchaudio', |
| 364 | + number=iters, |
| 365 | + globals=locals(), |
| 366 | + ) * 1000 / iters |
| 367 | + |
| 368 | + |
| 369 | +###################################################################### |
| 370 | +# |
346 | 371 |
|
347 | | -def benchmark_resample( |
348 | | - method, |
| 372 | +def benchmark_resample_transforms( |
349 | 373 | waveform, |
350 | 374 | sample_rate, |
351 | 375 | resample_rate, |
352 | 376 | lowpass_filter_width=6, |
353 | 377 | rolloff=0.99, |
354 | 378 | resampling_method="sinc_interpolation", |
355 | 379 | beta=None, |
356 | | - librosa_type=None, |
357 | 380 | iters=5, |
358 | 381 | ): |
359 | | - if method == "functional": |
360 | | - begin = time.monotonic() |
361 | | - for _ in range(iters): |
362 | | - F.resample( |
363 | | - waveform, |
364 | | - sample_rate, |
365 | | - resample_rate, |
366 | | - lowpass_filter_width=lowpass_filter_width, |
367 | | - rolloff=rolloff, |
368 | | - resampling_method=resampling_method, |
369 | | - ) |
370 | | - elapsed = time.monotonic() - begin |
371 | | - return elapsed / iters |
372 | | - elif method == "transforms": |
373 | | - resampler = T.Resample( |
374 | | - sample_rate, |
375 | | - resample_rate, |
376 | | - lowpass_filter_width=lowpass_filter_width, |
377 | | - rolloff=rolloff, |
378 | | - resampling_method=resampling_method, |
379 | | - dtype=waveform.dtype, |
380 | | - ) |
381 | | - begin = time.monotonic() |
382 | | - for _ in range(iters): |
383 | | - resampler(waveform) |
384 | | - elapsed = time.monotonic() - begin |
385 | | - return elapsed / iters |
386 | | - elif method == "librosa": |
387 | | - waveform_np = waveform.squeeze().numpy() |
388 | | - begin = time.monotonic() |
389 | | - for _ in range(iters): |
390 | | - librosa.resample(waveform_np, orig_sr=sample_rate, target_sr=resample_rate, res_type=librosa_type) |
391 | | - elapsed = time.monotonic() - begin |
392 | | - return elapsed / iters |
| 382 | + return timeit.timeit( |
| 383 | + stmt='resampler(waveform)', |
| 384 | + setup=''' |
| 385 | +import torchaudio |
| 386 | +
|
| 387 | +resampler = torchaudio.transforms.Resample( |
| 388 | + sample_rate, |
| 389 | + resample_rate, |
| 390 | + lowpass_filter_width=lowpass_filter_width, |
| 391 | + rolloff=rolloff, |
| 392 | + resampling_method=resampling_method, |
| 393 | + dtype=waveform.dtype, |
| 394 | + beta=beta, |
| 395 | +) |
| 396 | +resampler.to(waveform.device) |
| 397 | + ''', |
| 398 | + number=iters, |
| 399 | + globals=locals(), |
| 400 | + ) * 1000 / iters |
393 | 401 |
|
394 | 402 |
|
395 | 403 | ###################################################################### |
396 | 404 | # |
397 | 405 |
|
398 | | -configs = { |
399 | | - "downsample (48 -> 44.1 kHz)": [48000, 44100], |
400 | | - "downsample (16 -> 8 kHz)": [16000, 8000], |
401 | | - "upsample (44.1 -> 48 kHz)": [44100, 48000], |
402 | | - "upsample (8 -> 16 kHz)": [8000, 16000], |
403 | | -} |
| 406 | +def benchmark_resample_librosa( |
| 407 | + waveform, |
| 408 | + sample_rate, |
| 409 | + resample_rate, |
| 410 | + res_type=None, |
| 411 | + iters=5, |
| 412 | +): |
| 413 | + waveform_np = waveform.squeeze().numpy() |
| 414 | + return timeit.timeit( |
| 415 | + stmt=''' |
| 416 | +librosa.resample( |
| 417 | + waveform_np, |
| 418 | + orig_sr=sample_rate, |
| 419 | + target_sr=resample_rate, |
| 420 | + res_type=res_type, |
| 421 | +) |
| 422 | + ''', |
| 423 | + setup='import librosa', |
| 424 | + number=iters, |
| 425 | + globals=locals(), |
| 426 | + ) * 1000 / iters |
| 427 | + |
| 428 | + |
| 429 | +###################################################################### |
| 430 | +# |
404 | 431 |
|
405 | | -for label in configs: |
| 432 | +def benchmark(sample_rate, resample_rate): |
406 | 433 | times, rows = [], [] |
407 | | - sample_rate = configs[label][0] |
408 | | - resample_rate = configs[label][1] |
409 | | - waveform = get_sine_sweep(sample_rate) |
| 434 | + waveform = get_sine_sweep(sample_rate).to(torch.float32) |
| 435 | + |
| 436 | + args = (waveform, sample_rate, resample_rate) |
410 | 437 |
|
411 | 438 | # sinc 64 zero-crossings |
412 | | - f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=64) |
413 | | - t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=64) |
414 | | - times.append([None, 1000 * f_time, 1000 * t_time]) |
| 439 | + f_time = benchmark_resample_functional(*args, lowpass_filter_width=64) |
| 440 | + t_time = benchmark_resample_transforms(*args, lowpass_filter_width=64) |
| 441 | + times.append([None, f_time, t_time]) |
415 | 442 | rows.append("sinc (width 64)") |
416 | 443 |
|
417 | 444 | # sinc 16 zero-crossings |
418 | | - f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=16) |
419 | | - t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=16) |
420 | | - times.append([None, 1000 * f_time, 1000 * t_time]) |
| 445 | + f_time = benchmark_resample_functional(*args, lowpass_filter_width=16) |
| 446 | + t_time = benchmark_resample_transforms(*args, lowpass_filter_width=16) |
| 447 | + times.append([None, f_time, t_time]) |
421 | 448 | rows.append("sinc (width 16)") |
422 | 449 |
|
423 | 450 | # kaiser best |
424 | | - lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_best") |
425 | | - f_time = benchmark_resample( |
426 | | - "functional", |
427 | | - waveform, |
428 | | - sample_rate, |
429 | | - resample_rate, |
430 | | - lowpass_filter_width=64, |
431 | | - rolloff=0.9475937167399596, |
432 | | - resampling_method="kaiser_window", |
433 | | - beta=14.769656459379492, |
434 | | - ) |
435 | | - t_time = benchmark_resample( |
436 | | - "transforms", |
437 | | - waveform, |
438 | | - sample_rate, |
439 | | - resample_rate, |
440 | | - lowpass_filter_width=64, |
441 | | - rolloff=0.9475937167399596, |
442 | | - resampling_method="kaiser_window", |
443 | | - beta=14.769656459379492, |
444 | | - ) |
445 | | - times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time]) |
| 451 | + kwargs = { |
| 452 | + "lowpass_filter_width": 64, |
| 453 | + "rolloff": 0.9475937167399596, |
| 454 | + "resampling_method": "kaiser_window", |
| 455 | + "beta": 14.769656459379492, |
| 456 | + } |
| 457 | + lib_time = benchmark_resample_librosa(*args, res_type="kaiser_best") |
| 458 | + f_time = benchmark_resample_functional(*args, **kwargs) |
| 459 | + t_time = benchmark_resample_transforms(*args, **kwargs) |
| 460 | + times.append([lib_time, f_time, t_time]) |
446 | 461 | rows.append("kaiser_best") |
447 | 462 |
|
448 | 463 | # kaiser fast |
449 | | - lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_fast") |
450 | | - f_time = benchmark_resample( |
451 | | - "functional", |
452 | | - waveform, |
453 | | - sample_rate, |
454 | | - resample_rate, |
455 | | - lowpass_filter_width=16, |
456 | | - rolloff=0.85, |
457 | | - resampling_method="kaiser_window", |
458 | | - beta=8.555504641634386, |
459 | | - ) |
460 | | - t_time = benchmark_resample( |
461 | | - "transforms", |
462 | | - waveform, |
463 | | - sample_rate, |
464 | | - resample_rate, |
465 | | - lowpass_filter_width=16, |
466 | | - rolloff=0.85, |
467 | | - resampling_method="kaiser_window", |
468 | | - beta=8.555504641634386, |
469 | | - ) |
470 | | - times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time]) |
| 464 | + kwargs = { |
| 465 | + "lowpass_filter_width": 16, |
| 466 | + "rolloff": 0.85, |
| 467 | + "resampling_method": "kaiser_window", |
| 468 | + "beta": 8.555504641634386, |
| 469 | + } |
| 470 | + lib_time = benchmark_resample_librosa(*args, res_type="kaiser_fast") |
| 471 | + f_time = benchmark_resample_functional(*args, **kwargs) |
| 472 | + t_time = benchmark_resample_transforms(*args, **kwargs) |
| 473 | + times.append([lib_time, f_time, t_time]) |
471 | 474 | rows.append("kaiser_fast") |
472 | 475 |
|
473 | 476 | df = pd.DataFrame(times, columns=["librosa", "functional", "transforms"], index=rows) |
474 | | - df.columns = pd.MultiIndex.from_product([[f"{label} time (ms)"], df.columns]) |
| 477 | + return df |
| 478 | + |
| 479 | + |
| 480 | +###################################################################### |
| 481 | +# |
| 482 | +def plot(df): |
| 483 | + print(df.round(2)) |
| 484 | + ax = df.plot(kind="bar") |
| 485 | + plt.ylabel("Time Elapsed [ms]") |
| 486 | + plt.xticks(rotation = 0, fontsize=10) |
| 487 | + for cont, col, color in zip(ax.containers, df.columns, mcolors.TABLEAU_COLORS): |
| 488 | + label = ["N/A" if v != v else str(v) for v in df[col].round(2)] |
| 489 | + ax.bar_label(cont, labels=label, color=color, fontweight="bold", fontsize="x-small") |
| 490 | + |
| 491 | + |
| 492 | +###################################################################### |
| 493 | +# |
| 494 | +# Downsample (48 -> 44.1 kHz) |
| 495 | +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 496 | + |
| 497 | +df = benchmark(48_000, 44_100) |
| 498 | +plot(df) |
| 499 | + |
| 500 | +###################################################################### |
| 501 | +# |
| 502 | +# Downsample (16 -> 8 kHz) |
| 503 | +# ~~~~~~~~~~~~~~~~~~~~~~~~ |
| 504 | + |
| 505 | +df = benchmark(16_000, 8_000) |
| 506 | +plot(df) |
| 507 | + |
| 508 | +###################################################################### |
| 509 | +# |
| 510 | +# Upsample (44.1 -> 48 kHz) |
| 511 | +# ~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 512 | + |
| 513 | +df = benchmark(44_100, 48_000) |
| 514 | +plot(df) |
| 515 | + |
| 516 | +###################################################################### |
| 517 | +# |
| 518 | +# Upsample (8 -> 16 kHz) |
| 519 | +# ~~~~~~~~~~~~~~~~~~~~~~ |
| 520 | + |
| 521 | +df = benchmark(8_000, 16_000) |
| 522 | +plot(df) |
475 | 523 |
|
476 | | - print(f"torchaudio: {torchaudio.__version__}") |
477 | | - print(f"librosa: {librosa.__version__}") |
478 | | - display(df.round(2)) |
| 524 | +###################################################################### |
| 525 | +# |
| 526 | +# Summary |
| 527 | +# ~~~~~~~ |
| 528 | +# |
| 529 | +# To elaborate on the results: |
| 530 | +# |
| 531 | +# - a larger ``lowpass_filter_width`` results in a larger resampling kernel, |
| 532 | +# and therefore increases computation time for both the kernel computation |
| 533 | +# and convolution |
| 534 | +# - using ``kaiser_window`` results in longer computation times than the default |
| 535 | +# ``sinc_interpolation`` because it is more complex to compute the intermediate |
| 536 | +# window values |
| 537 | +# - a large GCD between the sample and resample rate will result |
| 538 | +# in a simplification that allows for a smaller kernel and faster kernel computation. |
| 539 | +# |
0 commit comments