From f2e914a36d0b74bc1ec20af9cdc497d9718d915e Mon Sep 17 00:00:00 2001 From: ChristianGeng Date: Fri, 20 Sep 2024 14:45:30 +0200 Subject: [PATCH 01/11] export read_text. --- audinterface/utils/__init__.py | 1 + tests/test_process_text.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/audinterface/utils/__init__.py b/audinterface/utils/__init__.py index 0679a8b..0d9ed08 100644 --- a/audinterface/utils/__init__.py +++ b/audinterface/utils/__init__.py @@ -1,4 +1,5 @@ from audinterface.core.utils import read_audio +from audinterface.core.utils import read_text from audinterface.core.utils import signal_index from audinterface.core.utils import sliding_window from audinterface.core.utils import to_timedelta diff --git a/tests/test_process_text.py b/tests/test_process_text.py index 4238ace..158c5c7 100644 --- a/tests/test_process_text.py +++ b/tests/test_process_text.py @@ -493,4 +493,4 @@ def test_read_data(tmpdir, data): file = audeer.path(tmpdir, "media.txt") with open(file, "w") as fp: fp.write(data) - assert audinterface.utils.read_data(file) == data + assert audinterface.utils.read_text(file) == data From 2e7d90da48efbad874175c8a96069607afcf13a5 Mon Sep 17 00:00:00 2001 From: ChristianGeng Date: Fri, 20 Sep 2024 15:40:43 +0200 Subject: [PATCH 02/11] Add read_func. argument. --- audinterface/core/process.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/audinterface/core/process.py b/audinterface/core/process.py index c7e9977..7193de3 100644 --- a/audinterface/core/process.py +++ b/audinterface/core/process.py @@ -104,6 +104,12 @@ class Process: multiprocessing multiprocessing: use multiprocessing instead of multithreading verbose: show debug messages + read_func: function to read in signals/data. When specified, + it needs to be able to read signals BOTH other data. + Per default, :func:`audinterface.utils.read_audio` + will be used for signal file(s), and + :func:`audinterface.utils.read_text` for files + with ``.json`` or ``text``extensions. Raises: ValueError: if ``resample = True``, but ``sampling_rate = None`` @@ -171,6 +177,7 @@ def __init__( num_workers: typing.Optional[int] = 1, multiprocessing: bool = False, verbose: bool = False, + read_func: typing.Callable[..., typing.Any] = None, ): if channels is not None: channels = audeer.to_list(channels) @@ -236,6 +243,14 @@ def __init__( self.win_dur = win_dur r"""Window duration.""" + # set read_audio and read_text methods + if read_func is None: + setattr(self.__class__, "read_audio", staticmethod(utils.read_audio)) + setattr(self.__class__, "read_text", staticmethod(utils.read_text)) + else: + setattr(self.__class__, "read_audio", staticmethod(read_func)) + setattr(self.__class__, "read_text", staticmethod(read_func)) + def _process_file( self, file: str, @@ -274,7 +289,7 @@ def _process_file( # Text files if ext in ["json", "txt"]: - data = utils.read_text(file, root=root) + data = self.read_text(file, root=root) y, file = self._process_data( data, idx=idx, @@ -288,7 +303,7 @@ def _process_file( # Audio/video files else: - signal, sampling_rate = utils.read_audio( + signal, sampling_rate = self.read_audio( file, start=start, end=end, From cfb1ed070129eab45f4438292261c0ac9eb79831 Mon Sep 17 00:00:00 2001 From: ChristianGeng Date: Fri, 20 Sep 2024 15:51:47 +0200 Subject: [PATCH 03/11] identity: make sr a kwarg. --- audinterface/core/process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audinterface/core/process.py b/audinterface/core/process.py index 7193de3..7b228b1 100644 --- a/audinterface/core/process.py +++ b/audinterface/core/process.py @@ -17,7 +17,7 @@ from audinterface.core.typing import Timestamps -def identity(signal, sampling_rate) -> np.ndarray: +def identity(signal, sampling_rate=None) -> np.ndarray: r"""Default processing function. This function is used, From 1476e62af9c1aa3a7cc528461819005025625704 Mon Sep 17 00:00:00 2001 From: ChristianGeng Date: Fri, 20 Sep 2024 23:47:14 +0200 Subject: [PATCH 04/11] Fix process_index. --- audinterface/core/process.py | 41 ++++++++++++-- tests/test_process_text.py | 101 +++++++++++++++++++++++++++++++---- 2 files changed, 128 insertions(+), 14 deletions(-) diff --git a/audinterface/core/process.py b/audinterface/core/process.py index 7b228b1..773a719 100644 --- a/audinterface/core/process.py +++ b/audinterface/core/process.py @@ -1,4 +1,5 @@ import errno +from collections.abc import Iterable import inspect import itertools import os @@ -616,10 +617,44 @@ def _process_index_wo_segment( task_description=f"Process {len(index)} segments", ) - y = list(itertools.chain.from_iterable([x[0] for x in xs])) + ys = [x[0] for x in xs] + all_dict = all(map(lambda x : isinstance(x, dict), [x[0] for x in xs])) + all_iterable = all(map(lambda x : isinstance(x, Iterable), [x[0] for x in xs])) + all_text = all(map(lambda x : isinstance(x, str), [x[0] for x in xs])) + # print(all_dict, all_iterable) + + if all_dict: + # prevent to convert to list of values + keys = list(itertools.chain.from_iterable([x.keys() for x in ys])) + values = list(itertools.chain.from_iterable([x.values() for x in ys])) + y = [{x:y} for (x, y) in zip(keys, values)] + # y = list(itertools.chain.from_iterable([[x[0]] for x in xs])) + else: + if all_iterable and all_text: + y = list(itertools.chain.from_iterable([[x[0]] for x in xs])) + else: + y = list(itertools.chain.from_iterable([x[0] for x in xs])) + files = list(itertools.chain.from_iterable([x[1] for x in xs])) - starts = list(itertools.chain.from_iterable([x[2] for x in xs])) - ends = list(itertools.chain.from_iterable([x[3] for x in xs])) + + # avoid 'NoneType' object is not iterable error + # this happends when all entries are None + try: + starts = list(itertools.chain.from_iterable([x[2] for x in xs])) + except TypeError: + pass + starts_non_iterable = [x for x in filter(None, [x[2] for x in xs])] == [] + assert starts_non_iterable, "unknown problem" + starts = [x[2] for x in xs] + + try: + ends = list(itertools.chain.from_iterable([x[3] for x in xs])) + except TypeError: + pass + ends_non_iterable = [x for x in filter(None, [x[3] for x in xs])] == [] + assert ends_non_iterable, "unknown problem" + ends = [x[3] for x in xs] + if ( len(audeer.unique(starts)) == 1 diff --git a/tests/test_process_text.py b/tests/test_process_text.py index 158c5c7..7456b5a 100644 --- a/tests/test_process_text.py +++ b/tests/test_process_text.py @@ -15,6 +15,9 @@ def identity(data): return data +def data_identity(data): + return data + def length(data): return len(data) @@ -73,6 +76,7 @@ def test_process_file( # test absolute path y = process.process_file(path) + expected_series = pd.Series( [expected_data], index=audformat.filewise_index(path), @@ -191,21 +195,63 @@ def test_process_folder( pd.testing.assert_series_equal(y, pd.Series(dtype=object)) +def _get_idx_type(preserve_index, segment_is_None, idx): + """Get expected index type. + + preserve_index: if ``True`` + and :attr:`audinterface.Process.segment` is ``None`` + the returned index + will be of same type + as the original one. + Otherwise it will be a segmented index + if any audio/video files are processed, + or a filewise index otherwise + """ + + if preserve_index and segment_is_None: + idx_type = "segmented" if audformat.is_segmented_index(idx) else "filewise" + return idx_type + + extensions = [os.path.splitext(x)[-1] for x in idx.get_level_values(0).tolist()] + # we only use wav in fixtures so this is ok + any_media = any(["wav" in x for x in extensions]) + + if any_media: + idx_type = "segmented" + else: + idx_type = "filewise" + + return idx_type + + +def _series_generator(y, index_type: str): + for idx, value in y.items(): + if index_type == "filewise": + file = idx + yield file, value + elif index_type == "segmented": + (file, _, _) = idx + yield file, value + else: + raise ValueError("index type invalid") + @pytest.mark.parametrize("num_workers", [1, 2, None]) -@pytest.mark.parametrize("file_format", ["json", "txt"]) +@pytest.mark.parametrize("file_format", ["json", "txt"]) # "json","txt" @pytest.mark.parametrize("multiprocessing", [False, True]) @pytest.mark.parametrize("preserve_index", [False, True]) +@pytest.mark.parametrize("process_func", [data_identity, None, identity]) def test_process_index( tmpdir, num_workers, file_format, multiprocessing, preserve_index, + process_func, ): cache_root = os.path.join(tmpdir, "cache") process = audinterface.Process( - process_func=None, + process_func=process_func, num_workers=num_workers, multiprocessing=multiprocessing, verbose=False, @@ -233,17 +279,27 @@ def test_process_index( starts=[0, 0, 1, 2], ends=[None, 1, 2, 3], ) + y = process.process_index( index, preserve_index=preserve_index, ) + if preserve_index: pd.testing.assert_index_equal(y.index, index) - for (path, _, _), value in y.items(): + + expected_idx_type = _get_idx_type(preserve_index, process.segment is None, index) + + for path, value in _series_generator(y, expected_idx_type): assert audinterface.utils.read_text(path) == data assert value == data - # Segmented index with relative paths + # for (path, _, _), value in y.items(): + # assert audinterface.utils.read_text(path) == data + # assert value == data + + + # # Segmented index with relative paths index = audformat.segmented_index( [file] * 4, starts=[0, 0, 1, 2], @@ -256,25 +312,40 @@ def test_process_index( ) if preserve_index: pd.testing.assert_index_equal(y.index, index) - for (file, _, _), value in y.items(): + + for file, value in _series_generator(y, expected_idx_type): assert audinterface.utils.read_text(file, root=root) == data assert value == data + # for (file, _, _), value in y.items(): + # assert audinterface.utils.read_text(file, root=root) == data + # assert value == data + # Filewise index with absolute paths index = audformat.filewise_index(path) y = process.process_index( index, preserve_index=preserve_index, ) + if preserve_index: pd.testing.assert_index_equal(y.index, index) - for path, value in y.items(): + # for path, value in y.items(): + # assert audinterface.utils.read_text(path) == data + # assert value == data + expected_idx_type = _get_idx_type(preserve_index, process.segment is None, index) + for path, value in _series_generator(y, expected_idx_type): assert audinterface.utils.read_text(path) == data assert value == data else: + expected_idx_type = _get_idx_type(preserve_index, process.segment is None, index) expected_index = audformat.filewise_index(files=list(index)) pd.testing.assert_index_equal(y.index, expected_index) - for (path, _, _), value in y.items(): + # for (path, _, _), value in y.items(): + # assert audinterface.utils.read_text(path) == data + # assert value == data + # expected_idx_type = _get_idx_type(preserve_index, process.segment is None, index) + for path, value in _series_generator(y, "filewise"): assert audinterface.utils.read_text(path) == data assert value == data @@ -287,13 +358,19 @@ def test_process_index( ) if preserve_index: pd.testing.assert_index_equal(y.index, index) - for file, value in y.items(): + for file, value in _series_generator(y, "filewise"): assert audinterface.utils.read_text(file, root=root) == data assert value == data + # for file, value in y.items(): + # assert audinterface.utils.read_text(file, root=root) == data + # assert value == data else: - for (file, _, _), value in y.items(): + for file, value in _series_generator(y, "filewise"): assert audinterface.utils.read_text(file, root=root) == data assert value == data + # for (file, _, _), value in y.items(): + # assert audinterface.utils.read_text(file, root=root) == data + # assert value == data # Cache result y = process.process_index( @@ -302,10 +379,12 @@ def test_process_index( root=root, cache_root=cache_root, ) - os.remove(path) + # breakpoint() + + os.remove(path) # Fails because second file does not exist - with pytest.raises(RuntimeError): + with pytest.raises(FileNotFoundError): process.process_index( index, preserve_index=preserve_index, From c0bc00824283f20d080d1a41744f2e8890ab9b82 Mon Sep 17 00:00:00 2001 From: ChristianGeng Date: Tue, 24 Sep 2024 12:49:42 +0200 Subject: [PATCH 05/11] Add clumsy method tp postprocess. --- audinterface/core/process.py | 70 +++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/audinterface/core/process.py b/audinterface/core/process.py index 773a719..a98cd59 100644 --- a/audinterface/core/process.py +++ b/audinterface/core/process.py @@ -1,5 +1,5 @@ -import errno from collections.abc import Iterable +import errno import inspect import itertools import os @@ -505,22 +505,7 @@ def process_files( ) self.verbose = verbose - y = list(itertools.chain.from_iterable([x[0] for x in xs])) - files = list(itertools.chain.from_iterable([x[1] for x in xs])) - starts = list(itertools.chain.from_iterable([x[2] for x in xs])) - ends = list(itertools.chain.from_iterable([x[3] for x in xs])) - - if ( - len(audeer.unique(starts)) == 1 - and audeer.unique(starts)[0] is None - and len(audeer.unique(ends)) == 1 - and audeer.unique(ends)[0] is None - ): - index = audformat.filewise_index(files) - else: - index = audformat.segmented_index(files, starts, ends) - y = pd.Series(y, index) - + y = self._postprocess_xs(xs) return y def process_folder( @@ -617,19 +602,44 @@ def _process_index_wo_segment( task_description=f"Process {len(index)} segments", ) + y = self._postprocess_xs(xs) + return y + + @staticmethod + def _postprocess_xs(xs): + """Postprocesses a list of tuples containing processed data, + files, starts, and ends, and returns a pandas Series. + + This is mainly factored into a separate method as it + is used in multiple places: + + - :meth:`process._process_index_wo_segment` + - :meth:`process._postprocess_xs` + + I find it hard to come up with less inelegance + + Parameters: + xs (list): A list of tuples containing processed data, + files, starts, and ends. + index (pd.Index): The index of the resulting pandas Series. + + Returns: + pd.Series: A pandas Series containing the postprocessed data. + """ ys = [x[0] for x in xs] - all_dict = all(map(lambda x : isinstance(x, dict), [x[0] for x in xs])) - all_iterable = all(map(lambda x : isinstance(x, Iterable), [x[0] for x in xs])) - all_text = all(map(lambda x : isinstance(x, str), [x[0] for x in xs])) - # print(all_dict, all_iterable) + # TODO: put into single list comprehension for all these three diagnostics + all_dict = all(map(lambda x: isinstance(x, dict), [x[0] for x in xs])) + all_iterable = all(map(lambda x: isinstance(x, Iterable), [x[0] for x in xs])) + all_text = all(map(lambda x: isinstance(x, str), [x[0] for x in xs])) if all_dict: - # prevent to convert to list of values + # prevent pd.Series from converting0 to list of values keys = list(itertools.chain.from_iterable([x.keys() for x in ys])) values = list(itertools.chain.from_iterable([x.values() for x in ys])) - y = [{x:y} for (x, y) in zip(keys, values)] - # y = list(itertools.chain.from_iterable([[x[0]] for x in xs])) + y = [{x: y} for (x, y) in zip(keys, values)] else: + # if all text, need to pack into a list in order to avoid flattening + # and the resulting dimension problems if all_iterable and all_text: y = list(itertools.chain.from_iterable([[x[0]] for x in xs])) else: @@ -637,24 +647,24 @@ def _process_index_wo_segment( files = list(itertools.chain.from_iterable([x[1] for x in xs])) - # avoid 'NoneType' object is not iterable error - # this happends when all entries are None + # avoid 'NoneType' object is not iterable error in itertools.chain + # for starts: this happens when all entries are None try: starts = list(itertools.chain.from_iterable([x[2] for x in xs])) except TypeError: pass starts_non_iterable = [x for x in filter(None, [x[2] for x in xs])] == [] assert starts_non_iterable, "unknown problem" - starts = [x[2] for x in xs] + starts = [x[2] for x in xs] + # same as for starts try: ends = list(itertools.chain.from_iterable([x[3] for x in xs])) except TypeError: pass ends_non_iterable = [x for x in filter(None, [x[3] for x in xs])] == [] assert ends_non_iterable, "unknown problem" - ends = [x[3] for x in xs] - + ends = [x[3] for x in xs] if ( len(audeer.unique(starts)) == 1 @@ -1197,6 +1207,8 @@ def _call_data( process_func_args = process_func_args or self.process_func_args special_args = self._special_args(idx, root, file, process_func_args) y = self.process_func(data, **special_args, **process_func_args) + # ensure non-scalar answer + y = [y] if len(y) == 1 else y return y def _special_args( From 1625a487e3af09c635c8d6c528a0f52527178a75 Mon Sep 17 00:00:00 2001 From: ChristianGeng Date: Tue, 24 Sep 2024 12:51:50 +0200 Subject: [PATCH 06/11] Reformat test_process_text. --- tests/test_process_text.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/tests/test_process_text.py b/tests/test_process_text.py index 7456b5a..2afc352 100644 --- a/tests/test_process_text.py +++ b/tests/test_process_text.py @@ -207,7 +207,6 @@ def _get_idx_type(preserve_index, segment_is_None, idx): if any audio/video files are processed, or a filewise index otherwise """ - if preserve_index and segment_is_None: idx_type = "segmented" if audformat.is_segmented_index(idx) else "filewise" return idx_type @@ -230,13 +229,14 @@ def _series_generator(y, index_type: str): file = idx yield file, value elif index_type == "segmented": - (file, _, _) = idx - yield file, value + (file, _, _) = idx + yield file, value else: raise ValueError("index type invalid") + @pytest.mark.parametrize("num_workers", [1, 2, None]) -@pytest.mark.parametrize("file_format", ["json", "txt"]) # "json","txt" +@pytest.mark.parametrize("file_format", ["json", "txt"]) @pytest.mark.parametrize("multiprocessing", [False, True]) @pytest.mark.parametrize("preserve_index", [False, True]) @pytest.mark.parametrize("process_func", [data_identity, None, identity]) @@ -298,7 +298,6 @@ def test_process_index( # assert audinterface.utils.read_text(path) == data # assert value == data - # # Segmented index with relative paths index = audformat.segmented_index( [file] * 4, @@ -330,21 +329,18 @@ def test_process_index( if preserve_index: pd.testing.assert_index_equal(y.index, index) - # for path, value in y.items(): - # assert audinterface.utils.read_text(path) == data - # assert value == data - expected_idx_type = _get_idx_type(preserve_index, process.segment is None, index) + expected_idx_type = _get_idx_type( + preserve_index, process.segment is None, index + ) for path, value in _series_generator(y, expected_idx_type): assert audinterface.utils.read_text(path) == data assert value == data else: - expected_idx_type = _get_idx_type(preserve_index, process.segment is None, index) + expected_idx_type = _get_idx_type( + preserve_index, process.segment is None, index + ) expected_index = audformat.filewise_index(files=list(index)) pd.testing.assert_index_equal(y.index, expected_index) - # for (path, _, _), value in y.items(): - # assert audinterface.utils.read_text(path) == data - # assert value == data - # expected_idx_type = _get_idx_type(preserve_index, process.segment is None, index) for path, value in _series_generator(y, "filewise"): assert audinterface.utils.read_text(path) == data assert value == data From 07c5f1be2211e88dbdbd206846dd0fcac837839c Mon Sep 17 00:00:00 2001 From: ChristianGeng Date: Tue, 24 Sep 2024 12:52:10 +0200 Subject: [PATCH 07/11] Add missing extension. --- tests/test_process_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_process_text.py b/tests/test_process_text.py index 2afc352..2a9105f 100644 --- a/tests/test_process_text.py +++ b/tests/test_process_text.py @@ -179,7 +179,7 @@ def test_process_folder( files = [os.path.join(root, f"file{n}.{file_format}") for n in range(num_files)] for file in files: write_text_file(file, data) - y = process.process_folder(root) + y = process.process_folder(root, filetype=file_format) pd.testing.assert_series_equal( y, process.process_files(files), From fe5f671da4dce225a50f24da195fff78d5b6b390 Mon Sep 17 00:00:00 2001 From: ChristianGeng Date: Thu, 14 Nov 2024 16:34:21 +0100 Subject: [PATCH 08/11] Fix index problems for 0-len files. --- tests/test_process_text.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/test_process_text.py b/tests/test_process_text.py index 2a9105f..ffd86cf 100644 --- a/tests/test_process_text.py +++ b/tests/test_process_text.py @@ -135,19 +135,28 @@ def test_process_files( paths.append(path) # test absolute paths + index = audformat.filewise_index(paths) + if num_files == 0: + index = pd.RangeIndex(0, 0, 1) + y = process.process_files(paths) expected_y = pd.Series( expected_output, - index=audformat.filewise_index(paths), + index=index, ) pd.testing.assert_series_equal(y, expected_y) # test relative paths + index = audformat.filewise_index(files) + if num_files == 0: + index = pd.RangeIndex(0, 0, 1) + y = process.process_files(files, root=root) expected_y = pd.Series( expected_output, - index=audformat.filewise_index(files), + index=index, ) + pd.testing.assert_series_equal(y, expected_y) @@ -376,8 +385,6 @@ def test_process_index( cache_root=cache_root, ) - # breakpoint() - os.remove(path) # Fails because second file does not exist with pytest.raises(FileNotFoundError): From a9eaa7feaadd6c6bcec10a30cbeb2227d77e4f10 Mon Sep 17 00:00:00 2001 From: ChristianGeng Date: Thu, 14 Nov 2024 16:34:52 +0100 Subject: [PATCH 09/11] improve docstring. --- audinterface/core/process.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/audinterface/core/process.py b/audinterface/core/process.py index a98cd59..55364bc 100644 --- a/audinterface/core/process.py +++ b/audinterface/core/process.py @@ -106,7 +106,8 @@ class Process: multiprocessing: use multiprocessing instead of multithreading verbose: show debug messages read_func: function to read in signals/data. When specified, - it needs to be able to read signals BOTH other data. + it needs to be able to read signals signal data as well + as text data. Per default, :func:`audinterface.utils.read_audio` will be used for signal file(s), and :func:`audinterface.utils.read_text` for files From 521a34478b26c66cc33aceaafae036a323ee4412 Mon Sep 17 00:00:00 2001 From: ChristianGeng Date: Thu, 14 Nov 2024 16:55:11 +0100 Subject: [PATCH 10/11] use correct interface for text. --- tests/test_process_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_process_text.py b/tests/test_process_text.py index ffd86cf..8b52126 100644 --- a/tests/test_process_text.py +++ b/tests/test_process_text.py @@ -428,7 +428,7 @@ def test_process_data( process_func_args=process_func_args, verbose=False, ) - x = process.process_signal(data, file=file) + x = process.process_data(data, file=file) if file is None: y = pd.Series([expected_signal]) From bc4fa8615c2735353ed8bbfcd39c0d027f098b34 Mon Sep 17 00:00:00 2001 From: ChristianGeng Date: Thu, 14 Nov 2024 16:55:34 +0100 Subject: [PATCH 11/11] Ruff change. --- tests/test_process_text.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_process_text.py b/tests/test_process_text.py index 8b52126..79d61e9 100644 --- a/tests/test_process_text.py +++ b/tests/test_process_text.py @@ -15,6 +15,7 @@ def identity(data): return data + def data_identity(data): return data