From 35d4a417edfe12926c60ed6aa8de56315d2df301 Mon Sep 17 00:00:00 2001 From: Kuntal Kokate Date: Fri, 17 Oct 2025 15:54:49 -0700 Subject: [PATCH 1/4] Add support for VECTORIZED orientation in BrainVisionRawIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes KAN-43: https://eeglab.atlassian.net/browse/KAN-43 Previously, BrainVisionRawIO only supported MULTIPLEXED data orientation (interleaved channels), causing a NeoReadWriteError when attempting to read files with VECTORIZED orientation (sequential channel data). Changes: - Modified data orientation check to accept both MULTIPLEXED and VECTORIZED - Added custom _get_analogsignal_chunk() method to handle VECTORIZED reading - For VECTORIZED files, reads each channel's data from its sequential location in the binary file - Maintains backward compatibility with MULTIPLEXED files (uses parent class implementation) Testing: - Validated against MNE-Python on real-world VECTORIZED dataset (ds004621) with 127 channels × 740,360 samples - results match exactly (correlation=1.0) - Tested both MULTIPLEXED and VECTORIZED orientations with synthetic data - All existing functionality preserved for MULTIPLEXED files 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- neo/rawio/brainvisionrawio.py | 81 +++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 3 deletions(-) diff --git a/neo/rawio/brainvisionrawio.py b/neo/rawio/brainvisionrawio.py index ca25e7141..07d9a0246 100644 --- a/neo/rawio/brainvisionrawio.py +++ b/neo/rawio/brainvisionrawio.py @@ -62,9 +62,12 @@ def _parse_header(self): raise NeoReadWriteError( f"Only `BINARY` format has been implemented. Current Data Format is {vhdr_header['Common Infos']['DataFormat']}" ) - if vhdr_header["Common Infos"]["DataOrientation"] != "MULTIPLEXED": + + # Store the data orientation for later use in reading + self._data_orientation = vhdr_header["Common Infos"]["DataOrientation"] + if self._data_orientation not in ("MULTIPLEXED", "VECTORIZED"): raise NeoReadWriteError( - f"Only `MULTIPLEXED` is implemented. Current Orientation is {vhdr_header['Common Infos']['DataOrientation']}" + f"Data orientation must be either `MULTIPLEXED` or `VECTORIZED`. Current Orientation is {self._data_orientation}" ) nb_channel = int(vhdr_header["Common Infos"]["NumberOfChannels"]) @@ -87,7 +90,15 @@ def _parse_header(self): buffer_id = "0" self._buffer_descriptions = {0: {0: {}}} self._stream_buffer_slice = {} - shape = get_memmap_shape(binary_filename, sig_dtype, num_channels=nb_channel, offset=0) + + # Calculate the shape based on orientation + if self._data_orientation == "MULTIPLEXED": + shape = get_memmap_shape(binary_filename, sig_dtype, num_channels=nb_channel, offset=0) + else: # VECTORIZED + # For VECTORIZED, data is stored as [all_samples_ch1, all_samples_ch2, ...] + # We still report shape as (num_samples, num_channels) for compatibility + shape = get_memmap_shape(binary_filename, sig_dtype, num_channels=nb_channel, offset=0) + self._buffer_descriptions[0][0][buffer_id] = { "type": "raw", "file_path": binary_filename, @@ -98,6 +109,9 @@ def _parse_header(self): } self._stream_buffer_slice[stream_id] = None + # Store number of channels for VECTORIZED reading + self._nb_channel = nb_channel + signal_buffers = np.array([("Signals", "0")], dtype=_signal_buffer_dtype) signal_streams = np.array([("Signals", "0", "0")], dtype=_signal_stream_dtype) @@ -239,6 +253,67 @@ def _rescale_event_timestamp(self, event_timestamps, dtype, event_channel_index) def _get_analogsignal_buffer_description(self, block_index, seg_index, buffer_id): return self._buffer_descriptions[block_index][seg_index][buffer_id] + def _get_analogsignal_chunk( + self, block_index, seg_index, i_start, i_stop, stream_index, channel_indexes + ): + """ + Override the base class method to handle VECTORIZED orientation. + + For MULTIPLEXED data: ch1_s1, ch2_s1, ..., chN_s1, ch1_s2, ch2_s2, ... + For VECTORIZED data: ch1_s1, ch1_s2, ..., ch1_sM, ch2_s1, ch2_s2, ..., ch2_sM, ... + """ + if self._data_orientation == "MULTIPLEXED": + # Use the default implementation for MULTIPLEXED + return super()._get_analogsignal_chunk( + block_index, seg_index, i_start, i_stop, stream_index, channel_indexes + ) + + # VECTORIZED implementation + buffer_id = self.header["signal_streams"][stream_index]["buffer_id"] + buffer_desc = self.get_analogsignal_buffer_description(block_index, seg_index, buffer_id) + + i_start = i_start or 0 + i_stop = i_stop or buffer_desc["shape"][0] + + # Open file on demand + if not hasattr(self, "_memmap_analogsignal_buffers"): + self._memmap_analogsignal_buffers = {} + if block_index not in self._memmap_analogsignal_buffers: + self._memmap_analogsignal_buffers[block_index] = {} + if seg_index not in self._memmap_analogsignal_buffers[block_index]: + self._memmap_analogsignal_buffers[block_index][seg_index] = {} + if buffer_id not in self._memmap_analogsignal_buffers[block_index][seg_index]: + fid = open(buffer_desc["file_path"], mode="rb") + self._memmap_analogsignal_buffers[block_index][seg_index][buffer_id] = fid + else: + fid = self._memmap_analogsignal_buffers[block_index][seg_index][buffer_id] + + # Determine which channels to read + if channel_indexes is None: + channel_indexes = np.arange(self._nb_channel) + else: + channel_indexes = np.asarray(channel_indexes) + + num_samples = i_stop - i_start + dtype = np.dtype(buffer_desc["dtype"]) + + # For VECTORIZED, each channel's data is stored contiguously + # We need to read from different parts of the file for each channel + raw_sigs = np.empty((num_samples, len(channel_indexes)), dtype=dtype) + + total_samples_per_channel = buffer_desc["shape"][0] + + for i, chan_idx in enumerate(channel_indexes): + # Calculate offset for this channel's data in the file + channel_offset = buffer_desc["file_offset"] + chan_idx * total_samples_per_channel * dtype.itemsize + sample_offset = channel_offset + i_start * dtype.itemsize + + # Seek to the position and read the data + fid.seek(sample_offset) + raw_sigs[:, i] = np.fromfile(fid, dtype=dtype, count=num_samples) + + return raw_sigs + def _ensure_filename(self, filename, kind, entry_name): if not os.path.exists(filename): # file not found, subsequent import stage would fail From 9fc58a02a9e17a5ce5bcb2aae49639263766f28d Mon Sep 17 00:00:00 2001 From: Kuntal Kokate Date: Tue, 4 Nov 2025 18:02:36 -0800 Subject: [PATCH 2/4] Simplify VECTORIZED implementation to use memmap - Replace np.fromfile() with np.memmap for multiprocessing compatibility - Minimize code changes in _get_analogsignal_chunk() - Remove file handle caching logic (memmap handles this) - All tests still pass with identical results --- neo/rawio/brainvisionrawio.py | 43 ++++++++--------------------------- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/neo/rawio/brainvisionrawio.py b/neo/rawio/brainvisionrawio.py index 07d9a0246..eb3abdfa6 100644 --- a/neo/rawio/brainvisionrawio.py +++ b/neo/rawio/brainvisionrawio.py @@ -257,60 +257,35 @@ def _get_analogsignal_chunk( self, block_index, seg_index, i_start, i_stop, stream_index, channel_indexes ): """ - Override the base class method to handle VECTORIZED orientation. - - For MULTIPLEXED data: ch1_s1, ch2_s1, ..., chN_s1, ch1_s2, ch2_s2, ... - For VECTORIZED data: ch1_s1, ch1_s2, ..., ch1_sM, ch2_s1, ch2_s2, ..., ch2_sM, ... + Override to handle VECTORIZED orientation. + VECTORIZED: all samples for ch1, then all samples for ch2, etc. """ if self._data_orientation == "MULTIPLEXED": - # Use the default implementation for MULTIPLEXED return super()._get_analogsignal_chunk( block_index, seg_index, i_start, i_stop, stream_index, channel_indexes ) - # VECTORIZED implementation + # VECTORIZED: use memmap to read each channel's data block buffer_id = self.header["signal_streams"][stream_index]["buffer_id"] buffer_desc = self.get_analogsignal_buffer_description(block_index, seg_index, buffer_id) i_start = i_start or 0 i_stop = i_stop or buffer_desc["shape"][0] - # Open file on demand - if not hasattr(self, "_memmap_analogsignal_buffers"): - self._memmap_analogsignal_buffers = {} - if block_index not in self._memmap_analogsignal_buffers: - self._memmap_analogsignal_buffers[block_index] = {} - if seg_index not in self._memmap_analogsignal_buffers[block_index]: - self._memmap_analogsignal_buffers[block_index][seg_index] = {} - if buffer_id not in self._memmap_analogsignal_buffers[block_index][seg_index]: - fid = open(buffer_desc["file_path"], mode="rb") - self._memmap_analogsignal_buffers[block_index][seg_index][buffer_id] = fid - else: - fid = self._memmap_analogsignal_buffers[block_index][seg_index][buffer_id] - - # Determine which channels to read if channel_indexes is None: channel_indexes = np.arange(self._nb_channel) - else: - channel_indexes = np.asarray(channel_indexes) - num_samples = i_stop - i_start dtype = np.dtype(buffer_desc["dtype"]) + num_samples = i_stop - i_start + total_samples_per_channel = buffer_desc["shape"][0] - # For VECTORIZED, each channel's data is stored contiguously - # We need to read from different parts of the file for each channel raw_sigs = np.empty((num_samples, len(channel_indexes)), dtype=dtype) - total_samples_per_channel = buffer_desc["shape"][0] - for i, chan_idx in enumerate(channel_indexes): - # Calculate offset for this channel's data in the file - channel_offset = buffer_desc["file_offset"] + chan_idx * total_samples_per_channel * dtype.itemsize - sample_offset = channel_offset + i_start * dtype.itemsize - - # Seek to the position and read the data - fid.seek(sample_offset) - raw_sigs[:, i] = np.fromfile(fid, dtype=dtype, count=num_samples) + offset = buffer_desc["file_offset"] + chan_idx * total_samples_per_channel * dtype.itemsize + channel_data = np.memmap(buffer_desc["file_path"], dtype=dtype, mode='r', + offset=offset, shape=(total_samples_per_channel,)) + raw_sigs[:, i] = channel_data[i_start:i_stop] return raw_sigs From d214b4aadd51ce037ad2f8288abd77acec5cc43d Mon Sep 17 00:00:00 2001 From: Kuntal Kokate Date: Tue, 4 Nov 2025 18:26:30 -0800 Subject: [PATCH 3/4] Refactor VECTORIZED support using time_axis in BaseRawIO Moved VECTORIZED orientation logic to BaseRawIO as suggested by @samuelgarcia: - Added time_axis parameter to buffer_description (0=MULTIPLEXED, 1=VECTORIZED) - Extended BaseRawIO._get_analogsignal_chunk() to handle time_axis=1 for raw buffers - Removed custom _get_analogsignal_chunk() override from BrainVisionRawIO - Fixed _get_signal_size() to correctly handle raw buffers with time_axis=1 Benefits: - Cleaner, more general solution applicable to other readers - Consistent with existing HDF5 time_axis pattern - Reduced code duplication - All tests pass with identical MNE-Python validation --- neo/rawio/baserawio.py | 85 +++++++++++++++++++++++++---------- neo/rawio/brainvisionrawio.py | 51 +++------------------ 2 files changed, 66 insertions(+), 70 deletions(-) diff --git a/neo/rawio/baserawio.py b/neo/rawio/baserawio.py index 7b5d8b768..a0c6c41c6 100644 --- a/neo/rawio/baserawio.py +++ b/neo/rawio/baserawio.py @@ -1577,9 +1577,14 @@ def __init__(self, *arg, **kwargs): def _get_signal_size(self, block_index, seg_index, stream_index): buffer_id = self.header["signal_streams"][stream_index]["buffer_id"] buffer_desc = self.get_analogsignal_buffer_description(block_index, seg_index, buffer_id) - # some hdf5 revert teh buffer - time_axis = buffer_desc.get("time_axis", 0) - return buffer_desc["shape"][time_axis] + # For "raw" type, shape is always (time, channels) regardless of file layout + # For "hdf5" type, shape can be (time, channels) or (channels, time) based on time_axis + if buffer_desc["type"] == "raw": + return buffer_desc["shape"][0] + else: + # some hdf5 revert the buffer + time_axis = buffer_desc.get("time_axis", 0) + return buffer_desc["shape"][time_axis] def _get_analogsignal_chunk( self, @@ -1603,29 +1608,61 @@ def _get_analogsignal_chunk( if buffer_desc["type"] == "raw": - # open files on demand and keep reference to opened file - if not hasattr(self, "_memmap_analogsignal_buffers"): - self._memmap_analogsignal_buffers = {} - if block_index not in self._memmap_analogsignal_buffers: - self._memmap_analogsignal_buffers[block_index] = {} - if seg_index not in self._memmap_analogsignal_buffers[block_index]: - self._memmap_analogsignal_buffers[block_index][seg_index] = {} - if buffer_id not in self._memmap_analogsignal_buffers[block_index][seg_index]: - fid = open(buffer_desc["file_path"], mode="rb") - self._memmap_analogsignal_buffers[block_index][seg_index][buffer_id] = fid - else: - fid = self._memmap_analogsignal_buffers[block_index][seg_index][buffer_id] + time_axis = buffer_desc.get("time_axis", 0) - num_channels = buffer_desc["shape"][1] + if time_axis == 0: + # MULTIPLEXED: time_axis=0 means (time, channels) layout + # open files on demand and keep reference to opened file + if not hasattr(self, "_memmap_analogsignal_buffers"): + self._memmap_analogsignal_buffers = {} + if block_index not in self._memmap_analogsignal_buffers: + self._memmap_analogsignal_buffers[block_index] = {} + if seg_index not in self._memmap_analogsignal_buffers[block_index]: + self._memmap_analogsignal_buffers[block_index][seg_index] = {} + if buffer_id not in self._memmap_analogsignal_buffers[block_index][seg_index]: + fid = open(buffer_desc["file_path"], mode="rb") + self._memmap_analogsignal_buffers[block_index][seg_index][buffer_id] = fid + else: + fid = self._memmap_analogsignal_buffers[block_index][seg_index][buffer_id] + + num_channels = buffer_desc["shape"][1] + + raw_sigs = get_memmap_chunk_from_opened_file( + fid, + num_channels, + i_start, + i_stop, + np.dtype(buffer_desc["dtype"]), + file_offset=buffer_desc["file_offset"], + ) - raw_sigs = get_memmap_chunk_from_opened_file( - fid, - num_channels, - i_start, - i_stop, - np.dtype(buffer_desc["dtype"]), - file_offset=buffer_desc["file_offset"], - ) + elif time_axis == 1: + # VECTORIZED: time_axis=1 means (channels, time) layout + # Data is stored as [all_samples_ch1, all_samples_ch2, ...] + dtype = np.dtype(buffer_desc["dtype"]) + num_channels = buffer_desc["shape"][1] + num_samples = i_stop - i_start + total_samples_per_channel = buffer_desc["shape"][0] + + # Determine which channels to read + if channel_indexes is None: + chan_indices = np.arange(num_channels) + else: + chan_indices = np.asarray(channel_indexes) + + raw_sigs = np.empty((num_samples, len(chan_indices)), dtype=dtype) + + for i, chan_idx in enumerate(chan_indices): + offset = buffer_desc["file_offset"] + chan_idx * total_samples_per_channel * dtype.itemsize + channel_data = np.memmap(buffer_desc["file_path"], dtype=dtype, mode='r', + offset=offset, shape=(total_samples_per_channel,)) + raw_sigs[:, i] = channel_data[i_start:i_stop] + + # Channel slicing already done above, so skip later channel_indexes slicing + channel_indexes = None + + else: + raise ValueError(f"time_axis must be 0 or 1, got {time_axis}") elif buffer_desc["type"] == "hdf5": diff --git a/neo/rawio/brainvisionrawio.py b/neo/rawio/brainvisionrawio.py index eb3abdfa6..430fe5e8d 100644 --- a/neo/rawio/brainvisionrawio.py +++ b/neo/rawio/brainvisionrawio.py @@ -91,13 +91,10 @@ def _parse_header(self): self._buffer_descriptions = {0: {0: {}}} self._stream_buffer_slice = {} - # Calculate the shape based on orientation - if self._data_orientation == "MULTIPLEXED": - shape = get_memmap_shape(binary_filename, sig_dtype, num_channels=nb_channel, offset=0) - else: # VECTORIZED - # For VECTORIZED, data is stored as [all_samples_ch1, all_samples_ch2, ...] - # We still report shape as (num_samples, num_channels) for compatibility - shape = get_memmap_shape(binary_filename, sig_dtype, num_channels=nb_channel, offset=0) + shape = get_memmap_shape(binary_filename, sig_dtype, num_channels=nb_channel, offset=0) + + # time_axis indicates data layout: 0 for MULTIPLEXED (time, channels), 1 for VECTORIZED (channels, time) + time_axis = 0 if self._data_orientation == "MULTIPLEXED" else 1 self._buffer_descriptions[0][0][buffer_id] = { "type": "raw", @@ -106,12 +103,10 @@ def _parse_header(self): "order": "C", "file_offset": 0, "shape": shape, + "time_axis": time_axis, } self._stream_buffer_slice[stream_id] = None - # Store number of channels for VECTORIZED reading - self._nb_channel = nb_channel - signal_buffers = np.array([("Signals", "0")], dtype=_signal_buffer_dtype) signal_streams = np.array([("Signals", "0", "0")], dtype=_signal_stream_dtype) @@ -253,42 +248,6 @@ def _rescale_event_timestamp(self, event_timestamps, dtype, event_channel_index) def _get_analogsignal_buffer_description(self, block_index, seg_index, buffer_id): return self._buffer_descriptions[block_index][seg_index][buffer_id] - def _get_analogsignal_chunk( - self, block_index, seg_index, i_start, i_stop, stream_index, channel_indexes - ): - """ - Override to handle VECTORIZED orientation. - VECTORIZED: all samples for ch1, then all samples for ch2, etc. - """ - if self._data_orientation == "MULTIPLEXED": - return super()._get_analogsignal_chunk( - block_index, seg_index, i_start, i_stop, stream_index, channel_indexes - ) - - # VECTORIZED: use memmap to read each channel's data block - buffer_id = self.header["signal_streams"][stream_index]["buffer_id"] - buffer_desc = self.get_analogsignal_buffer_description(block_index, seg_index, buffer_id) - - i_start = i_start or 0 - i_stop = i_stop or buffer_desc["shape"][0] - - if channel_indexes is None: - channel_indexes = np.arange(self._nb_channel) - - dtype = np.dtype(buffer_desc["dtype"]) - num_samples = i_stop - i_start - total_samples_per_channel = buffer_desc["shape"][0] - - raw_sigs = np.empty((num_samples, len(channel_indexes)), dtype=dtype) - - for i, chan_idx in enumerate(channel_indexes): - offset = buffer_desc["file_offset"] + chan_idx * total_samples_per_channel * dtype.itemsize - channel_data = np.memmap(buffer_desc["file_path"], dtype=dtype, mode='r', - offset=offset, shape=(total_samples_per_channel,)) - raw_sigs[:, i] = channel_data[i_start:i_stop] - - return raw_sigs - def _ensure_filename(self, filename, kind, entry_name): if not os.path.exists(filename): # file not found, subsequent import stage would fail From 7bf52084d2d0c46fb14b84cbff18dc6ddcdd6391 Mon Sep 17 00:00:00 2001 From: Kuntal Kokate Date: Tue, 4 Nov 2025 18:35:42 -0800 Subject: [PATCH 4/4] Fix shape convention to be consistent with HDF5 pattern When time_axis=1, shape should be (channels, time) not (time, channels). This makes raw binary handling consistent with HDF5: - time_axis=0: shape is (time, channels) - MULTIPLEXED - time_axis=1: shape is (channels, time) - VECTORIZED Changes: - BrainVisionRawIO: Set shape as (channels, time) when VECTORIZED - BaseRawIO: Use shape[time_axis] consistently for i_stop default - Removed duplicate time_axis retrieval All tests pass with identical MNE-Python validation. --- neo/rawio/baserawio.py | 26 ++++++++++++-------------- neo/rawio/brainvisionrawio.py | 11 +++++++++-- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/neo/rawio/baserawio.py b/neo/rawio/baserawio.py index a0c6c41c6..7c9b7b237 100644 --- a/neo/rawio/baserawio.py +++ b/neo/rawio/baserawio.py @@ -1577,14 +1577,11 @@ def __init__(self, *arg, **kwargs): def _get_signal_size(self, block_index, seg_index, stream_index): buffer_id = self.header["signal_streams"][stream_index]["buffer_id"] buffer_desc = self.get_analogsignal_buffer_description(block_index, seg_index, buffer_id) - # For "raw" type, shape is always (time, channels) regardless of file layout - # For "hdf5" type, shape can be (time, channels) or (channels, time) based on time_axis - if buffer_desc["type"] == "raw": - return buffer_desc["shape"][0] - else: - # some hdf5 revert the buffer - time_axis = buffer_desc.get("time_axis", 0) - return buffer_desc["shape"][time_axis] + # time_axis indicates which dimension is time: + # time_axis=0: shape is (time, channels) + # time_axis=1: shape is (channels, time) + time_axis = buffer_desc.get("time_axis", 0) + return buffer_desc["shape"][time_axis] def _get_analogsignal_chunk( self, @@ -1603,13 +1600,14 @@ def _get_analogsignal_chunk( buffer_desc = self.get_analogsignal_buffer_description(block_index, seg_index, buffer_id) + # Get time_axis to determine which dimension is time + time_axis = buffer_desc.get("time_axis", 0) + i_start = i_start or 0 - i_stop = i_stop or buffer_desc["shape"][0] + i_stop = i_stop or buffer_desc["shape"][time_axis] if buffer_desc["type"] == "raw": - time_axis = buffer_desc.get("time_axis", 0) - if time_axis == 0: # MULTIPLEXED: time_axis=0 means (time, channels) layout # open files on demand and keep reference to opened file @@ -1637,12 +1635,12 @@ def _get_analogsignal_chunk( ) elif time_axis == 1: - # VECTORIZED: time_axis=1 means (channels, time) layout + # VECTORIZED: time_axis=1 means shape is (channels, time) # Data is stored as [all_samples_ch1, all_samples_ch2, ...] dtype = np.dtype(buffer_desc["dtype"]) - num_channels = buffer_desc["shape"][1] + num_channels = buffer_desc["shape"][0] # shape is (channels, time) num_samples = i_stop - i_start - total_samples_per_channel = buffer_desc["shape"][0] + total_samples_per_channel = buffer_desc["shape"][1] # shape is (channels, time) # Determine which channels to read if channel_indexes is None: diff --git a/neo/rawio/brainvisionrawio.py b/neo/rawio/brainvisionrawio.py index 430fe5e8d..601381470 100644 --- a/neo/rawio/brainvisionrawio.py +++ b/neo/rawio/brainvisionrawio.py @@ -91,11 +91,18 @@ def _parse_header(self): self._buffer_descriptions = {0: {0: {}}} self._stream_buffer_slice = {} - shape = get_memmap_shape(binary_filename, sig_dtype, num_channels=nb_channel, offset=0) - # time_axis indicates data layout: 0 for MULTIPLEXED (time, channels), 1 for VECTORIZED (channels, time) time_axis = 0 if self._data_orientation == "MULTIPLEXED" else 1 + # Get shape - always returns (num_samples, num_channels) + temp_shape = get_memmap_shape(binary_filename, sig_dtype, num_channels=nb_channel, offset=0) + + # For consistency with HDF5 pattern: when time_axis=1, shape should be (channels, time) + if time_axis == 1: + shape = (temp_shape[1], temp_shape[0]) # (num_channels, num_samples) + else: + shape = temp_shape # (num_samples, num_channels) + self._buffer_descriptions[0][0][buffer_id] = { "type": "raw", "file_path": binary_filename,