1 change: 1 addition & 0 deletions Makefile
@@ -38,6 +38,7 @@ MODULES = emlearn_trees \
          emlearn_iir_q15 \
          emlearn_arrayutils \
          emlearn_linreg \
          emlearn_plsr \
          emlearn_cnn_int8 \
          emlearn_cnn_fp32

76 changes: 76 additions & 0 deletions airquality_check.py
@@ -0,0 +1,76 @@

import array
import emlearn_plsr
import npyfile
import os
import os.path

def mean_squared_error(y_true, y_pred):
    n = len(y_true)
    return sum((yi - yi_hat) ** 2 for yi, yi_hat in zip(y_true, y_pred)) / n


def r2_score(y_true, y_pred):
    n = len(y_true)
    y_mean = sum(y_true) / n
    ss_tot = sum((yi - y_mean) ** 2 for yi in y_true)
    ss_res = sum((yi - yi_hat) ** 2 for yi, yi_hat in zip(y_true, y_pred))
    return (1 - ss_res / ss_tot) if ss_tot != 0 else 0.0


def load_data(data_dir):
    x_file = os.path.join(data_dir, "X.npy")
    y_file = os.path.join(data_dir, "y.npy")
    shape_X, X_array = npyfile.load(x_file)  # X_array is array.array('f')
    shape_y, y_array = npyfile.load(y_file)  # y_array is array.array('f')
    return shape_X, X_array, shape_y, y_array


def run_plsr_reference(data_dir, n_components=5):
    # -----------------------------
    # Load data
    # -----------------------------
    shape_X, X_array, shape_y, y_array = load_data(data_dir)
    n_samples = shape_y[0]
    n_features = shape_X[1]

    # -----------------------------
    # Create and train model
    # -----------------------------
    model = emlearn_plsr.new(n_samples, n_features, n_components)
    success = emlearn_plsr.fit(
        model, X_array, y_array,
        max_iterations=1000,
        tolerance=1e-5,
        verbose=0
    )

    print("fit success:", success, "model complete:", model.is_complete())

    # -----------------------------
    # Compute predictions
    # -----------------------------
    y_pred = array.array('f')
    for i in range(n_samples):
        x_row = X_array[i * n_features:(i + 1) * n_features]
        y_val = model.predict(x_row)
        y_pred.append(y_val)

    # -----------------------------
    # Compute metrics
    # -----------------------------
    mse = mean_squared_error(y_array, y_pred)
    r2 = r2_score(y_array, y_pred)

    print(f"PLSR Reference Results (n_components={n_components}):")
    print(f"  MSE: {mse:.5f}")
    print(f"  R^2 score: {r2:.5f}")


if __name__ == "__main__":
    # Example usage: adjust data_dir as needed
    run_plsr_reference(data_dir="data", n_components=3)

    run_plsr_reference(data_dir="my_spectrofood_data_L1", n_components=10)


86 changes: 86 additions & 0 deletions airquality_download.py
@@ -0,0 +1,86 @@

import os
import urllib.request
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score


def download_dataset(url="https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip",
                     data_dir="data",
                     zip_name="AirQualityUCI.zip"):
    os.makedirs(data_dir, exist_ok=True)
    zip_file = os.path.join(data_dir, zip_name)
    if not os.path.exists(zip_file):
        print("Downloading dataset...")
        urllib.request.urlretrieve(url, zip_file)
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(data_dir)
    return data_dir


def load_and_preprocess(csv_file=None, data_dir="data", feature_cols=None, target_col="CO(GT)"):
    if csv_file is None:
        csv_file = os.path.join(data_dir, "AirQualityUCI.csv")
    df = pd.read_csv(csv_file, sep=';', decimal=',')
    df = df.iloc[:, :-2]  # drop last two empty columns
    df.replace(-200, np.nan, inplace=True)  # -200 marks missing values in this dataset
    df.dropna(inplace=True)

    if feature_cols is None:
        # Default: all sensor columns (skip Date/Time), excluding the target itself
        X = df.iloc[:, 2:].drop(columns=[target_col], errors='ignore').values.astype(np.float32)
    else:
        X = df[feature_cols].values.astype(np.float32)

    y = df[target_col].values.astype(np.float32).reshape(-1, 1)

    scaler_X = StandardScaler()
    X_scaled = np.ascontiguousarray(scaler_X.fit_transform(X))

    scaler_y = StandardScaler()
    y_scaled = np.ascontiguousarray(scaler_y.fit_transform(y))

    np.save(os.path.join(data_dir, "X.npy"), X_scaled, allow_pickle=False)
    np.save(os.path.join(data_dir, "y.npy"), y_scaled, allow_pickle=False)
    return X_scaled, y_scaled, scaler_X, scaler_y


def train_and_evaluate(X, y, n_components=5, test_size=0.2, random_state=42, data_dir="data"):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    pls = PLSRegression(n_components=n_components)
    pls.fit(X_train, y_train)
    y_pred = pls.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    np.save(os.path.join(data_dir, "pls_coef.npy"), pls.coef_)

    return mse, r2, pls


def load_numpy_data(data_dir="data"):
    X_loaded = np.load(os.path.join(data_dir, "X.npy"))
    y_loaded = np.load(os.path.join(data_dir, "y.npy"))
    return X_loaded, y_loaded


def main():
    n_components = 3
    data_dir = download_dataset()
    X, y, _, _ = load_and_preprocess(data_dir=data_dir)
    mse, r2, _ = train_and_evaluate(X, y, n_components=n_components, data_dir=data_dir)
    print(f"PLSR Reference Results (n_components={n_components}):")
    print(f"  MSE: {mse:.5f}")
    print(f"  R^2: {r2:.5f}")


if __name__ == "__main__":
    main()
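The device-side check loads these files as float32 arrays via npyfile; a host-side sketch (my addition) to confirm the saved arrays have the dtype and layout the script writes above, before copying them to a board:

# Sketch: check the .npy files written by load_and_preprocess.
import numpy as np

for name in ("data/X.npy", "data/y.npy"):
    arr = np.load(name)
    assert arr.dtype == np.float32, f"{name}: expected float32, got {arr.dtype}"
    assert arr.flags["C_CONTIGUOUS"], f"{name}: expected C-contiguous layout"
    print(name, arr.shape, arr.dtype)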
133 changes: 133 additions & 0 deletions spectrofood_download.py
@@ -0,0 +1,133 @@
import os
import urllib.request
from io import StringIO

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score


DATA_URL = "https://zenodo.org/records/8362947/files/SpectroFood_dataset.csv?download=1"

def download_dataset(data_dir):
    os.makedirs(data_dir, exist_ok=True)
    csv_file = os.path.join(data_dir, "SpectroFood_dataset.csv")
    if not os.path.exists(csv_file):
        print("Downloading SpectroFood CSV...")
        urllib.request.urlretrieve(DATA_URL, csv_file)
    return csv_file


def load_spectrofood_chunks(csv_file, food_col="food"):
    """
    Splits the CSV into chunks using empty lines as separators.
    Each chunk is loaded with pandas.read_csv separately.
    Returns a list of tuples: (food_name, DataFrame)
    """
    chunks = []
    with open(csv_file, 'r') as f:
        content = f.read()

    # Split into raw text blocks on empty lines
    raw_chunks = [c.strip() for c in content.split("\n\n") if c.strip()]
    # FIXME: only returns 1 chunk right now
    print(f"Found {len(raw_chunks)} raw chunks")

    for chunk_text in raw_chunks:
        # Use StringIO to read the chunk as CSV
        chunk_io = StringIO(chunk_text)
        try:
            df_chunk = pd.read_csv(chunk_io, dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            continue  # skip empty chunks

        # Determine food name: prefer the food column, else the first cell
        if food_col in df_chunk.columns:
            food_name = df_chunk[food_col].iloc[0].strip().replace(" ", "_")
        else:
            food_name = str(df_chunk.iloc[0, 0]).strip().replace(" ", "_")

        # Convert columns to numeric where possible (non-numeric become NaN)
        df_chunk = df_chunk.apply(pd.to_numeric, errors='coerce')
        chunks.append((food_name, df_chunk))

    return chunks

def preprocess_chunk(df_chunk, target_col="DRY MATTER"):
    """
    Converts a DataFrame to C-contiguous X and y numpy arrays
    """
    # Keep only rows where the target column is numeric
    df_chunk = df_chunk[pd.to_numeric(df_chunk[target_col], errors='coerce').notna()].copy()

    # Drop columns that are entirely NaN
    df_chunk = df_chunk.dropna(axis=1, how='all')

    # Drop rows that contain any NaN
    df_chunk = df_chunk.dropna(axis=0, how='any')

    exclude_cols = [c for c in df_chunk.columns if c == target_col or df_chunk[c].dtype == object]
    X = df_chunk.drop(columns=exclude_cols).values.astype(np.float32)
    y = df_chunk[target_col].values.astype(np.float32).reshape(-1, 1)

    # Standardize features; y is left unscaled (original units)
    scaler_X = StandardScaler()
    X = scaler_X.fit_transform(X)

    X = np.ascontiguousarray(X)
    y = np.ascontiguousarray(y)
    return X, y

def save_all_chunks(chunks, data_dir):
    """
    Saves all chunks as numpy files
    """
    for food_name, df in chunks:
        X, y = preprocess_chunk(df)
        dataset_dir = data_dir + f"_{food_name}"
        os.makedirs(dataset_dir, exist_ok=True)
        np.save(os.path.join(dataset_dir, "X.npy"), X)
        np.save(os.path.join(dataset_dir, "y.npy"), y)
        print(f"Saved chunk for {food_name}: {dataset_dir}")

def train_pls_for_chunks(chunks, n_components=10):
    """
    Trains a scikit-learn PLSRegression model for each chunk
    and prints RMSE and R2
    """
    for food_name, df in chunks:
        X, y = preprocess_chunk(df)
        # Split 80/20
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

        # Train PLS
        pls = PLSRegression(n_components=n_components)
        pls.fit(X_train, y_train)

        # Predict
        y_pred = pls.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"{food_name}: PLSRegression n_components={n_components} | RMSE={np.sqrt(mse):.4f} | R2={r2:.4f}")

def main(data_dir="spectrofood_data"):
    csv_file = download_dataset(data_dir)
    chunks = load_spectrofood_chunks(csv_file)
    print(f"Found {len(chunks)} chunks (food types)")
    save_all_chunks(chunks, data_dir)
    train_pls_for_chunks(chunks, n_components=5)


if __name__ == "__main__":
    main(data_dir="my_spectrofood_data")
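One possible cause of the FIXME in load_spectrofood_chunks: if the CSV uses \r\n line endings, splitting on "\n\n" never matches and everything lands in a single chunk. A more tolerant splitter could look like the sketch below (my suggestion, not verified against the actual file):

import re

def split_blank_separated(text):
    # Normalize line endings, then split on runs of blank lines
    # (a "blank" separator line may still contain whitespace).
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    return [b.strip() for b in re.split(r"\n\s*\n", text) if b.strip()]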

45 changes: 45 additions & 0 deletions src/emlearn_plsr/Makefile
@@ -0,0 +1,45 @@
# Location of top-level MicroPython directory
MPY_DIR = ../../micropython

# Architecture to build for (x86, x64, armv6m, armv7m, xtensa, xtensawin)
ARCH = x64

# The ABI version for .mpy files
MPY_ABI_VERSION := 6.3

# Location of emlearn library
EMLEARN_DIR := $(shell python3 -c "import emlearn; print(emlearn.includedir)")

# Enable linking of libm etc.
LINK_RUNTIME = 1

DIST_DIR := ../../dist/$(ARCH)_$(MPY_ABI_VERSION)


ifeq ($(ARCH),rv32imc)
CFLAGS_MATH := -DUSE_BUILTIN_SQRTF
else
CFLAGS_MATH := -DUSE_IEEE_SQRTF=1
endif


# Name of module
MOD = emlearn_plsr

# Source files (.c or .py)
SRC = plsr.c plsr.py

# Include to get the rules for compiling and linking the module
include $(MPY_DIR)/py/dynruntime.mk

# Releases
DIST_FILE = $(DIST_DIR)/$(MOD).mpy
$(DIST_DIR):
	mkdir -p $@

$(DIST_FILE): $(MOD).mpy $(DIST_DIR)
	cp $< $@

CFLAGS += -I$(EMLEARN_DIR) -Wno-unused-function $(CFLAGS_MATH)

dist: $(DIST_FILE)
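Build note (my assumption, not stated in the PR): with the dynruntime.mk rules included above, running `make dist` from this directory should produce $(DIST_DIR)/emlearn_plsr.mpy, i.e. ../../dist/x64_6.3/emlearn_plsr.mpy under the defaults; a cross-build would override the architecture, e.g. `make ARCH=armv7m dist`.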