From e5ed3b824932e80c95240eb6a8e513816a114cde Mon Sep 17 00:00:00 2001 From: ArthurTlprt Date: Fri, 30 Jan 2026 12:00:44 +0100 Subject: [PATCH 1/2] fix Segmentation: Add skewness and kurtosis calculation #103 --- segmenter/planktoscope/segmenter/__init__.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/segmenter/planktoscope/segmenter/__init__.py b/segmenter/planktoscope/segmenter/__init__.py index 3eb124ac6..0975d146a 100644 --- a/segmenter/planktoscope/segmenter/__init__.py +++ b/segmenter/planktoscope/segmenter/__init__.py @@ -37,6 +37,7 @@ import numpy as np import PIL.Image import skimage.exposure +from scipy.stats import skew, kurtosis ################################################################################ # Other image processing Libraries @@ -268,9 +269,14 @@ def _get_color_info(self, bgr_img, mask): h_stddev = np.std(h_channel, where=mask) s_stddev = np.std(s_channel, where=mask) v_stddev = np.std(v_channel, where=mask) - # TODO #103 Add skewness and kurtosis calculation (with scipy) here - # using https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.skew.html#scipy.stats.skew - # and https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kurtosis.html#scipy.stats.kurtosis + # Distribution skewness and kurtosis computation + h_skewness = skew(h_channel, bias=False, axis=None) + s_skewness = skew(s_channel, bias=False, axis=None) + v_skewness = skew(v_channel, bias=False, axis=None) + h_kurtosis = kurtosis(h_channel, bias=False, axis=None) + s_kurtosis = kurtosis(s_channel, bias=False, axis=None) + v_kurtosis = kurtosis(v_channel, bias=False, axis=None) + # h_quartiles = np.quantile(h_channel, quartiles) # s_quartiles = np.quantile(s_channel, quartiles) # v_quartiles = np.quantile(v_channel, quartiles) @@ -308,6 +314,12 @@ def _get_color_info(self, bgr_img, mask): "StdHue": h_stddev, "StdSaturation": s_stddev, "StdValue": v_stddev, + "SkewnessHue": h_skewness, + "SkewnessSaturation": s_skewness, + "SkewnessValue": v_skewness, + "KurtosisHue": h_kurtosis, + "KurtosisSaturation": s_kurtosis, + "KurtosisValue": v_kurtosis # "object_minHue": h_quartiles[0], # "object_Q05Hue": h_quartiles[1], # "object_Q25Hue": h_quartiles[2], From e26106785e9cc62590a33bda0599ddf3ea21e560 Mon Sep 17 00:00:00 2001 From: ArthurTlprt Date: Fri, 30 Jan 2026 12:07:57 +0100 Subject: [PATCH 2/2] fix Solve the FIXME: just use python's csv library, to shave off pandas's 60 MB of unnecessary disk space usage #846 --- segmenter/planktoscope/segmenter/ecotaxa.py | 53 +++++++++++++++------ 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/segmenter/planktoscope/segmenter/ecotaxa.py b/segmenter/planktoscope/segmenter/ecotaxa.py index ab3ece883..d5ad81085 100644 --- a/segmenter/planktoscope/segmenter/ecotaxa.py +++ b/segmenter/planktoscope/segmenter/ecotaxa.py @@ -20,7 +20,7 @@ import numpy -import pandas # FIXME: just use python's csv library, to shave off pandas's 60 MB of unnecessary disk space usage +import csv import zipfile import os import io @@ -262,25 +262,50 @@ def ecotaxa_export(archive_filepath, metadata, image_base_path, keep_files=False # we remove the image file if we don't want to keep it! os.remove(image_path) - tsv_content = pandas.DataFrame(tsv_content) - - tsv_type_header = [dtype_to_ecotaxa(dt) for dt in tsv_content.dtypes] - tsv_content.columns = pandas.MultiIndex.from_tuples( - list(zip(tsv_content.columns, tsv_type_header)) - ) - + # Extract column names from first row if content exists + if not tsv_content: + logger.error("No TSV content to export") + return 0 + + column_names = sorted(tsv_content[0].keys()) + + # Determine data types for each column + tsv_type_header = [] + for col in column_names: + # Check the type of the first non-None value in this column + sample_value = next((row[col] for row in tsv_content if row.get(col) is not None), None) + if sample_value is not None and isinstance(sample_value, (int, float)): + tsv_type_header.append("[f]") + else: + tsv_type_header.append("[t]") + # create the filename with the acquisition ID acquisition_id = metadata.get("acq_id") acquisition_id = acquisition_id.replace(" ", "_") tsv_filename = f"ecotaxa_{acquisition_id}.tsv" - + + # Build TSV content as string + tsv_output = io.StringIO() + writer = csv.writer(tsv_output, delimiter='\t', lineterminator='\n') + + # Write header row (column names) + writer.writerow(column_names) + + # Write type header row + writer.writerow(tsv_type_header) + + # Write data rows + for row in tsv_content: + writer.writerow([row.get(col, '') for col in column_names]) + + tsv_string = tsv_output.getvalue() + # add the tsv to the archive - archive.writestr( - tsv_filename, - io.BytesIO(tsv_content.to_csv(sep="\t", encoding="utf-8", index=False).encode()).read(), - ) + archive.writestr(tsv_filename, tsv_string.encode('utf-8')) + if keep_files: tsv_file = os.path.join(image_base_path, tsv_filename) - tsv_content.to_csv(path_or_buf=tsv_file, sep="\t", encoding="utf-8", index=False) + with open(tsv_file, 'w', encoding='utf-8', newline='') as f: + f.write(tsv_string) logger.success("Ecotaxa archive is ready!") return 1