diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 827a16b3..65309480 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -20,7 +20,7 @@ MatHud pairs a canvas with an AI assistant so users can sketch geometric scenes, 2. Geometry primitives: create, edit, and relate points, segments, vectors, triangles, rectangles, circles, ellipses, and angles. 3. Mathematics: evaluate expressions, solve equations, differentiate, integrate, work with complex numbers, and run statistical routines. 4. Graph theory: create graphs, trees, DAGs; run analyses (shortest path, MST, topological sort, BFS/DFS). -5. Statistics: plot probability distributions (normal, discrete) and bar charts. +5. Statistics: plot probability distributions (normal, discrete) and bar charts; compute descriptive statistics (mean, median, mode, quartiles, etc.). 6. Workspace operations: save, load, list, delete, import, and export named workspaces. ## Key References @@ -353,6 +353,7 @@ For features that extend existing managers (e.g., `fit_regression` in `Statistic ## Statistics - `plot a continuous normal distribution with mean 0 and sigma 1, shade from -1 to 1` - `plot a bar chart with values [10, 20, 5] and labels ["A", "B", "C"]` +- `compute descriptive statistics for [10, 20, 30, 40, 50]` ## Parametric Curves - `draw a parametric circle with x(t) = cos(t) and y(t) = sin(t)` diff --git a/README.md b/README.md index 72678727..af7ab713 100644 --- a/README.md +++ b/README.md @@ -18,13 +18,14 @@ MatHud pairs an interactive drawing canvas with an AI assistant to help visualiz 3. Plot functions, compare intersections, shade bounded regions, and translate/rotate objects to explore relationships visually. 4. Plot statistics visualizations (probability distributions and bar charts). 5. Fit regression models to data (linear, polynomial, exponential, logarithmic, power, logistic, sinusoidal) and visualize fitted curves with R² statistics. -6. Create and analyze graph theory graphs (graphs, trees, DAGs). -7. 
Save, list, load, and delete named workspaces so projects can be resumed or shared later. -8. Share the current canvas with the assistant using Vision mode to get feedback grounded in your drawing. -9. Attach images directly to chat messages for the AI to analyze alongside your prompts. -10. Use slash commands (`/help`, `/vision`, `/model`, `/image`, etc.) for quick local operations without waiting for an AI response. -11. Choose from multiple AI providers — OpenAI, Anthropic (Claude), and OpenRouter — with the model dropdown automatically filtered by which API keys you have configured. -12. Trigger client-side tests from the UI or chat to verify canvas behavior without leaving the app. +6. Compute descriptive statistics (mean, median, mode, standard deviation, variance, min, max, quartiles, IQR) for any dataset. +7. Create and analyze graph theory graphs (graphs, trees, DAGs). +8. Save, list, load, and delete named workspaces so projects can be resumed or shared later. +9. Share the current canvas with the assistant using Vision mode to get feedback grounded in your drawing. +10. Attach images directly to chat messages for the AI to analyze alongside your prompts. +11. Use slash commands (`/help`, `/vision`, `/model`, `/image`, etc.) for quick local operations without waiting for an AI response. +12. Choose from multiple AI providers — OpenAI, Anthropic (Claude), and OpenRouter — with the model dropdown automatically filtered by which API keys you have configured. +13. Trigger client-side tests from the UI or chat to verify canvas behavior without leaving the app. ## 3. Architecture Overview @@ -135,11 +136,12 @@ Developer utilities: 8. `plot a normal distribution with mean 0 and sigma 1, continuous, shade from -1 to 1` 9. `plot a bar chart with values [10,20,5] and labels ["A","B","C"]` 10. `fit a linear regression to x_data=[1,2,3,4,5] and y_data=[2,4,6,8,10], show points and report R²` - 11. 
`create an undirected weighted graph named G1 with vertices A,B,C,D and edges A-B (1), B-C (2), A-C (4), C-D (1)` - 12. `on graph G1, find the shortest path from A to D and highlight the edges` - 13. `create a DAG named D1 with vertices A,B,C,D and edges A->B, A->C, B->D, C->D; then topologically sort it` - 14. `save workspace as "demo"` / `load workspace "demo"` - 15. `run tests` + 11. `compute descriptive statistics for [10, 20, 30, 40, 50]` + 12. `create an undirected weighted graph named G1 with vertices A,B,C,D and edges A-B (1), B-C (2), A-C (4), C-D (1)` + 13. `on graph G1, find the shortest path from A to D and highlight the edges` + 14. `create a DAG named D1 with vertices A,B,C,D and edges A->B, A->C, B->D, C->D; then topologically sort it` + 15. `save workspace as "demo"` / `load workspace "demo"` + 16. `run tests` ### 6.3 Slash Commands diff --git a/documentation/Example Prompts.txt b/documentation/Example Prompts.txt index a4819f1f..bb2478e3 100644 --- a/documentation/Example Prompts.txt +++ b/documentation/Example Prompts.txt @@ -72,6 +72,11 @@ Shear the rectangle horizontally by factor 0.5 from the origin. Rotate triangle ABC by 45 degrees around point (2, 3). Rotate point A by 90 degrees around the origin. +# Descriptive Statistics +Compute descriptive statistics for the dataset [10, 20, 30, 40, 50]. +Give me the mean, median, and standard deviation of [3, 7, 7, 2, 9, 4, 1]. +What are the quartiles and IQR for the values [12, 15, 18, 22, 25, 30, 35, 40]? + # Regression Analysis Fit a linear regression to x_data=[1,2,3,4,5] and y_data=[2.1,3.9,6.2,7.8,10.1]. Show the data points and fitted curve. Fit a quadratic polynomial (degree 2) to x_data=[0,1,2,3,4] and y_data=[0,1,4,9,16]. Report the R-squared value. 
diff --git a/documentation/Reference Manual.txt b/documentation/Reference Manual.txt index 3c4a31cb..c80c71f3 100644 --- a/documentation/Reference Manual.txt +++ b/documentation/Reference Manual.txt @@ -316,6 +316,7 @@ Attributes: - `plot_distribution(name=None, representation="continuous", distribution_type="normal", distribution_params=None, plot_bounds=None, shade_bounds=None, curve_color=None, fill_color=None, fill_opacity=None, bar_count=None)`: Plot a probability distribution. For representation="continuous", `plot_bounds` controls the curve domain, while `shade_bounds` controls the shaded interval under the curve (clamped into `plot_bounds`). For representation="discrete", `plot_bounds` controls the bar span and `shade_bounds` is ignored. Discrete distribution plots create a `DiscretePlot` composite plus derived `Bar` drawables for rendering; derived bars are regenerated on workspace load and may be omitted from serialized canvas state to keep prompts compact. - `plot_bars(name=None, values=None, labels_below=None, labels_above=None, bar_spacing=None, bar_width=None, stroke_color=None, fill_color=None, fill_opacity=None, x_start=None, y_base=None)`: Plot a bar chart (`BarsPlot` composite plus derived `Bar` drawables). Derived bars are regenerated on workspace load and may be omitted from serialized canvas state to keep prompts compact. - `fit_regression(name=None, x_data=None, y_data=None, model_type="linear", degree=None, plot_bounds=None, curve_color=None, show_points=True, point_color=None)`: Fit a regression model to data and plot the resulting curve. Supported model types: "linear" (y=mx+b), "polynomial" (y=a0+a1*x+...+an*x^n, requires `degree`), "exponential" (y=a*e^(bx), requires positive y), "logarithmic" (y=a+b*ln(x), requires positive x), "power" (y=a*x^b, requires positive x and y), "logistic" (y=L/(1+e^(-k(x-x0)))), and "sinusoidal" (y=a*sin(bx+c)+d, requires at least 4 points). 
Returns function_name, expression, coefficients, r_squared, model_type, bounds, and optionally point_names. Use `delete_function` to remove the curve; delete points individually. +- `compute_descriptive_statistics(data)`: Compute descriptive statistics for a list of numbers. Returns a dictionary with: count, mean, median, mode (list of values; empty if no meaningful mode), standard_deviation (population), variance (population), min, max, q1 (first quartile), q3 (third quartile), iqr (interquartile range), and range. Quartiles use the median-of-halves (exclusive) method. All values must be finite (no Infinity or NaN). - `delete_plot(name)`: Delete a plot composite created by plot_distribution/plot_bars, including its derived components. - `zoom_to_bounds(left_bound, right_bound, top_bound, bottom_bound)`: Fit the viewport so the specified math-space rectangle is entirely visible while preserving aspect ratio - `create_colored_area(drawable1_name, drawable2_name=None, left_bound=None, right_bound=None, color="lightblue", opacity=0.3)`: Create a colored area between two objects @@ -4507,6 +4508,27 @@ Key Features: ### Statistics Utilities +#### Descriptive Statistics (`utils/statistics/descriptive.py`) + +``` +Descriptive statistics computation module. + +Key Features: + - Mean, median, mode computation + - Population standard deviation and variance + - Min, max, range + - Quartiles (Q1, Q3) via median-of-halves (exclusive) method + - Interquartile range (IQR) + - Input validation (non-empty, finite, numeric) + - No browser imports — fully testable with pytest +``` + +**Key Functions:** +- `compute_descriptive_statistics(data)`: Compute all descriptive statistics for a list of numbers. Returns a `DescriptiveStatisticsResult` TypedDict with count, mean, median, mode, standard_deviation, variance, min, max, q1, q3, iqr, and range. 
+ +**Key Types:** +- `DescriptiveStatisticsResult(TypedDict)`: Result dict with fields: `count` (int), `mean` (float), `median` (float), `mode` (List[float]), `standard_deviation` (float), `variance` (float), `min` (float), `max` (float), `q1` (float), `q3` (float), `iqr` (float), `range` (float). + #### Distribution Expressions (`utils/statistics/distributions.py`) ``` @@ -4545,13 +4567,14 @@ Key Features: #### Statistics Manager (`managers/statistics_manager.py`) ``` -Statistics manager for probability distributions and regression plots. +Statistics manager for probability distributions, regression plots, and descriptive statistics. Key Features: - Continuous distribution plots with PDF curves and shaded areas - Discrete distribution plots using bar elements - Custom bar chart creation with values and labels - Regression fitting (linear, polynomial, exponential, etc.) + - Descriptive statistics computation (mean, median, mode, std dev, quartiles, etc.) - Plot deletion with proper cleanup of constituent drawables - Workspace restore via plot materialization methods ``` @@ -4560,6 +4583,7 @@ Key Features: - `plot_distribution(name, representation, distribution_type, ...)`: Create continuous or discrete distribution plots - `plot_bars(name, values, labels_below, labels_above, ...)`: Create custom bar charts - `fit_regression(name, x_data, y_data, model_type, ...)`: Fit regression models to data +- `compute_descriptive_statistics(data)`: Compute summary statistics (count, mean, median, mode, std dev, variance, min, max, Q1, Q3, IQR, range) - `delete_plot(name)`: Delete a plot and all its constituent drawables **Supported Regression Model Types:** diff --git a/server_tests/test_descriptive_statistics_pure.py b/server_tests/test_descriptive_statistics_pure.py new file mode 100644 index 00000000..c85c932a --- /dev/null +++ b/server_tests/test_descriptive_statistics_pure.py @@ -0,0 +1,266 @@ +from __future__ import annotations + +import unittest + +from 
from utils.statistics.descriptive import (
    compute_descriptive_statistics,
)


class TestValidation(unittest.TestCase):
    """Input validation: wrong container, wrong element type, bad values."""

    def test_empty_list_raises_value_error(self) -> None:
        with self.assertRaises(ValueError):
            compute_descriptive_statistics([])

    def test_non_list_input_raises_type_error(self) -> None:
        with self.assertRaises(TypeError):
            compute_descriptive_statistics((1, 2, 3))
        with self.assertRaises(TypeError):
            compute_descriptive_statistics("123")
        with self.assertRaises(TypeError):
            compute_descriptive_statistics(42)

    def test_non_numeric_elements_raise_type_error(self) -> None:
        with self.assertRaises(TypeError):
            compute_descriptive_statistics([1, "two", 3])

    def test_non_finite_values_raise_value_error(self) -> None:
        for bad in (float("inf"), float("-inf"), float("nan")):
            with self.subTest(value=bad):
                with self.assertRaises(ValueError):
                    compute_descriptive_statistics([1.0, bad, 3.0])

    def test_boolean_values_raise_type_error(self) -> None:
        with self.assertRaises(TypeError):
            compute_descriptive_statistics([1, True, 3])


class TestMean(unittest.TestCase):
    """Arithmetic mean for ints, floats, negatives and a single value."""

    def test_integer_dataset(self) -> None:
        result = compute_descriptive_statistics([1, 2, 3, 4, 5])
        self.assertAlmostEqual(result["mean"], 3.0)

    def test_float_dataset(self) -> None:
        result = compute_descriptive_statistics([1.5, 2.5, 3.5])
        self.assertAlmostEqual(result["mean"], 2.5)

    def test_negative_numbers(self) -> None:
        result = compute_descriptive_statistics([-3, -1, 0, 1, 3])
        self.assertAlmostEqual(result["mean"], 0.0)

    def test_single_element(self) -> None:
        result = compute_descriptive_statistics([42])
        self.assertAlmostEqual(result["mean"], 42.0)


class TestMedian(unittest.TestCase):
    """Median for odd/even counts and for unsorted input."""

    def test_odd_count(self) -> None:
        result = compute_descriptive_statistics([1, 3, 5])
        self.assertAlmostEqual(result["median"], 3.0)

    def test_even_count(self) -> None:
        result = compute_descriptive_statistics([1, 2, 3, 4])
        self.assertAlmostEqual(result["median"], 2.5)

    def test_single_element(self) -> None:
        result = compute_descriptive_statistics([7])
        self.assertAlmostEqual(result["median"], 7.0)

    def test_unsorted_input(self) -> None:
        result = compute_descriptive_statistics([5, 1, 3])
        self.assertAlmostEqual(result["median"], 3.0)


class TestMode(unittest.TestCase):
    """Mode list semantics, including the 'no meaningful mode' cases."""

    def test_single_mode(self) -> None:
        result = compute_descriptive_statistics([1, 2, 2, 3])
        self.assertEqual(result["mode"], [2.0])

    def test_multi_modal(self) -> None:
        result = compute_descriptive_statistics([1, 1, 2, 2, 3])
        self.assertEqual(result["mode"], [1.0, 2.0])

    def test_all_unique_no_mode(self) -> None:
        result = compute_descriptive_statistics([1, 2, 3, 4])
        self.assertEqual(result["mode"], [])

    def test_all_same(self) -> None:
        result = compute_descriptive_statistics([5, 5, 5])
        self.assertEqual(result["mode"], [5.0])

    def test_all_same_frequency_multiple_distinct(self) -> None:
        result = compute_descriptive_statistics([1, 1, 2, 2, 3, 3])
        self.assertEqual(result["mode"], [])

    def test_single_element_no_mode(self) -> None:
        result = compute_descriptive_statistics([42])
        self.assertEqual(result["mode"], [])

    def test_two_identical_elements(self) -> None:
        result = compute_descriptive_statistics([7, 7])
        self.assertEqual(result["mode"], [7.0])

    def test_mode_elements_are_float(self) -> None:
        result = compute_descriptive_statistics([1, 2, 2, 3])
        for m in result["mode"]:
            self.assertIsInstance(m, float)


class TestVarianceAndStdDev(unittest.TestCase):
    """Population variance / standard deviation."""

    def test_known_dataset(self) -> None:
        # Population variance of [2,4,4,4,5,5,7,9] = 4.0, std dev = 2.0
        result = compute_descriptive_statistics([2, 4, 4, 4, 5, 5, 7, 9])
        self.assertAlmostEqual(result["variance"], 4.0)
        self.assertAlmostEqual(result["standard_deviation"], 2.0)

    def test_all_same_values(self) -> None:
        result = compute_descriptive_statistics([3, 3, 3])
        self.assertAlmostEqual(result["variance"], 0.0)
        self.assertAlmostEqual(result["standard_deviation"], 0.0)

    def test_population_formula(self) -> None:
        # [2, 8]: mean=5, pop variance = ((2-5)^2 + (8-5)^2) / 2 = 9.0
        result = compute_descriptive_statistics([2, 8])
        self.assertAlmostEqual(result["variance"], 9.0)

    def test_numerical_robustness(self) -> None:
        result = compute_descriptive_statistics([1.0, 1.0, 1.0000000000000002])
        self.assertGreaterEqual(result["variance"], 0.0)
        self.assertGreaterEqual(result["standard_deviation"], 0.0)


class TestMinMax(unittest.TestCase):
    """Extremes and their float typing."""

    def test_mixed_values(self) -> None:
        result = compute_descriptive_statistics([-5, 0, 3, 10])
        self.assertAlmostEqual(result["min"], -5.0)
        self.assertAlmostEqual(result["max"], 10.0)

    def test_single_element(self) -> None:
        result = compute_descriptive_statistics([42])
        self.assertAlmostEqual(result["min"], 42.0)
        self.assertAlmostEqual(result["max"], 42.0)

    def test_negative_only(self) -> None:
        result = compute_descriptive_statistics([-3, -1, -7])
        self.assertAlmostEqual(result["min"], -7.0)
        self.assertAlmostEqual(result["max"], -1.0)

    def test_min_max_are_float(self) -> None:
        result = compute_descriptive_statistics([1, 2, 3])
        self.assertIsInstance(result["min"], float)
        self.assertIsInstance(result["max"], float)


class TestQuartiles(unittest.TestCase):
    """Q1/Q3/IQR via the median-of-halves (exclusive) method."""

    def test_odd_n_textbook(self) -> None:
        # [1,2,3,4,5,6,7]: lower=[1,2,3] Q1=2.0, upper=[5,6,7] Q3=6.0
        result = compute_descriptive_statistics([1, 2, 3, 4, 5, 6, 7])
        self.assertAlmostEqual(result["q1"], 2.0)
        self.assertAlmostEqual(result["q3"], 6.0)
        self.assertAlmostEqual(result["iqr"], 4.0)

    def test_even_n(self) -> None:
        # [1,2,3,4,5,6,7,8]: lower=[1,2,3,4] Q1=2.5, upper=[5,6,7,8] Q3=6.5
        result = compute_descriptive_statistics([1, 2, 3, 4, 5, 6, 7, 8])
        self.assertAlmostEqual(result["q1"], 2.5)
        self.assertAlmostEqual(result["q3"], 6.5)
        self.assertAlmostEqual(result["iqr"], 4.0)

    def test_n_equals_1(self) -> None:
        result = compute_descriptive_statistics([99])
        self.assertAlmostEqual(result["q1"], 99.0)
        self.assertAlmostEqual(result["q3"], 99.0)
        self.assertAlmostEqual(result["iqr"], 0.0)

    def test_n_equals_2(self) -> None:
        result = compute_descriptive_statistics([10, 20])
        self.assertAlmostEqual(result["q1"], 10.0)
        self.assertAlmostEqual(result["q3"], 20.0)

    def test_n_equals_3(self) -> None:
        # [1,2,3]: lower=[1] Q1=1.0, upper=[3] Q3=3.0
        result = compute_descriptive_statistics([1, 2, 3])
        self.assertAlmostEqual(result["q1"], 1.0)
        self.assertAlmostEqual(result["q3"], 3.0)
        self.assertAlmostEqual(result["iqr"], 2.0)


class TestRange(unittest.TestCase):
    """Range (max - min) and its float typing."""

    def test_basic_range(self) -> None:
        result = compute_descriptive_statistics([1, 5, 10])
        self.assertAlmostEqual(result["range"], 9.0)

    def test_single_element_range(self) -> None:
        result = compute_descriptive_statistics([42])
        self.assertAlmostEqual(result["range"], 0.0)

    def test_range_is_float(self) -> None:
        result = compute_descriptive_statistics([1, 10])
        self.assertIsInstance(result["range"], float)


class TestEdgeCases(unittest.TestCase):
    """Extreme magnitudes."""

    def test_large_numbers(self) -> None:
        data = [1e15, 2e15, 3e15]
        result = compute_descriptive_statistics(data)
        self.assertAlmostEqual(result["mean"], 2e15)
        self.assertEqual(result["count"], 3)

    def test_very_small_numbers(self) -> None:
        data = [1e-15, 2e-15, 3e-15]
        result = compute_descriptive_statistics(data)
        # The default places=7 comparison rounds both sides to 0 at this
        # magnitude and would accept any tiny value (even 0.0), so use an
        # absolute tolerance scaled to the data instead.
        self.assertAlmostEqual(result["mean"], 2e-15, delta=1e-24)

    def test_mixed_large_and_small(self) -> None:
        data = [0.001, 1000000]
        result = compute_descriptive_statistics(data)
        self.assertEqual(result["count"], 2)
        self.assertAlmostEqual(result["min"], 0.001)
        self.assertAlmostEqual(result["max"], 1000000.0)


class TestResultStructure(unittest.TestCase):
    """Shape and value types of the returned dictionary."""

    def test_all_expected_keys_present(self) -> None:
        result = compute_descriptive_statistics([1, 2, 3, 4, 5])
        expected_keys = {
            "count",
            "mean",
            "median",
            "mode",
            "standard_deviation",
            "variance",
            "min",
            "max",
            "q1",
            "q3",
            "iqr",
            "range",
        }
        self.assertEqual(set(result.keys()), expected_keys)

    def test_count_is_int(self) -> None:
        result = compute_descriptive_statistics([1, 2, 3])
        self.assertIsInstance(result["count"], int)
        self.assertEqual(result["count"], 3)

    def test_mode_is_list(self) -> None:
        result = compute_descriptive_statistics([1, 2, 3])
        self.assertIsInstance(result["mode"], list)

    def test_numeric_fields_are_float(self) -> None:
        result = compute_descriptive_statistics([1, 2, 3, 4, 5])
        float_keys = [
            "mean",
            "median",
            "standard_deviation",
            "variance",
            "min",
            "max",
            "q1",
            "q3",
            "iqr",
            "range",
        ]
        for key in float_keys:
            with self.subTest(key=key):
                self.assertIsInstance(result[key], float)
self.assertEqual(data.get("minItems"), 1) diff --git a/static/client/canvas.py b/static/client/canvas.py index 901c6b4e..0ad2e9d3 100644 --- a/static/client/canvas.py +++ b/static/client/canvas.py @@ -1120,6 +1120,18 @@ def fit_regression( ), ) + def compute_descriptive_statistics( + self, + *, + data: List[float], + ) -> Dict[str, Any]: + return cast( + Dict[str, Any], + self.drawable_manager.compute_descriptive_statistics( + data=data, + ), + ) + # ------------------- Graph Methods ------------------- def create_graph(self, graph_state: "GraphState") -> "Drawable": return self.drawable_manager.create_graph(graph_state) diff --git a/static/client/function_registry.py b/static/client/function_registry.py index e8fb1a3e..b6325d29 100644 --- a/static/client/function_registry.py +++ b/static/client/function_registry.py @@ -197,6 +197,7 @@ def get_available_functions( "plot_bars": canvas.plot_bars, "delete_plot": canvas.delete_plot, "fit_regression": canvas.fit_regression, + "compute_descriptive_statistics": canvas.compute_descriptive_statistics, # ===== ANGLE OPERATIONS ===== "create_angle": canvas.create_angle, "delete_angle": canvas.delete_angle, diff --git a/static/client/managers/drawable_manager.py b/static/client/managers/drawable_manager.py index 91130425..1e91dc43 100644 --- a/static/client/managers/drawable_manager.py +++ b/static/client/managers/drawable_manager.py @@ -974,6 +974,18 @@ def fit_regression( ), ) + def compute_descriptive_statistics( + self, + *, + data: List[float], + ) -> Dict[str, Any]: + return cast( + Dict[str, Any], + self.statistics_manager.compute_descriptive_statistics( + data=data, + ), + ) + # ------------------- Graph Methods ------------------- def create_graph(self, graph_state: "GraphState") -> "Drawable": return self.graph_manager.create_graph(graph_state) diff --git a/static/client/managers/statistics_manager.py b/static/client/managers/statistics_manager.py index adba6472..ae4ca4d1 100644 --- 
a/static/client/managers/statistics_manager.py +++ b/static/client/managers/statistics_manager.py @@ -25,6 +25,9 @@ from drawables.discrete_plot import DiscretePlot from drawables.plot import Plot from utils.statistics.distributions import default_normal_bounds, normal_pdf_expression +from utils.statistics.descriptive import ( + compute_descriptive_statistics as _compute_descriptive_statistics, +) from utils.statistics.regression import fit_regression as _fit_regression, SUPPORTED_MODEL_TYPES if TYPE_CHECKING: @@ -519,6 +522,25 @@ def _get_point_manager(self) -> Optional["PointManager"]: pass return None + def compute_descriptive_statistics( + self, + *, + data: List[float], + ) -> Dict[str, Any]: + """Compute descriptive statistics for a list of numbers. + + Delegates to the pure algorithm in ``utils.statistics.descriptive``. + No canvas mutations or undo/redo archiving — purely computational. + + Args: + data: Non-empty list of finite numbers. + + Returns: + Dict with count, mean, median, mode, standard_deviation, + variance, min, max, q1, q3, iqr, range. + """ + return dict(_compute_descriptive_statistics(data)) + def delete_plot(self, name: str) -> bool: started_at = time.perf_counter() self._log_operation_debug("delete_plot", "start", details={"name": name}) diff --git a/static/client/utils/statistics/__init__.py b/static/client/utils/statistics/__init__.py index ef5adcba..250766e8 100644 --- a/static/client/utils/statistics/__init__.py +++ b/static/client/utils/statistics/__init__.py @@ -1,17 +1,22 @@ -"""Statistics utilities for regression and distribution analysis. +"""Statistics utilities for regression, distribution, and descriptive analysis. -This package provides statistical functions for curve fitting and -probability distribution generation. +This package provides statistical functions for curve fitting, +probability distribution generation, and descriptive statistics. 
"""Descriptive statistics computation module.

Provides a pure-Python function to compute standard descriptive statistics
(mean, median, mode, standard deviation, variance, min, max, quartiles)
for a list of numbers. No browser imports — fully testable with pytest.
"""

import math
from collections import Counter
from typing import List, Tuple, TypedDict


class DescriptiveStatisticsResult(TypedDict):
    """Result of descriptive statistics computation."""

    count: int
    mean: float
    median: float
    mode: List[float]
    standard_deviation: float
    variance: float
    min: float
    max: float
    q1: float
    q3: float
    iqr: float
    range: float


# ---------------------------------------------------------------------------
# Validation helpers
# ---------------------------------------------------------------------------


def _validate_data(data: List[float]) -> None:
    """Validate that *data* is a non-empty list of finite numbers.

    Raises:
        TypeError: If *data* is not a list or contains non-numeric elements
            (including booleans).
        ValueError: If *data* is empty, contains non-finite values, or
            contains an int too large to be converted to a float.
    """
    if not isinstance(data, list):
        raise TypeError(f"data must be a list, got {type(data).__name__}")

    if not data:
        raise ValueError("data must not be empty")

    for i, v in enumerate(data):
        # bool is a subclass of int, so it must be rejected explicitly first.
        if isinstance(v, bool):
            raise TypeError(f"data[{i}] is a bool; only int and float are accepted")
        if not isinstance(v, (int, float)):
            raise TypeError(f"data[{i}] must be int or float, got {type(v).__name__}")
        try:
            is_finite = math.isfinite(v)
        except OverflowError:
            # math.isfinite converts ints to float and raises OverflowError
            # for ints beyond float range; report that as invalid input
            # rather than leaking an undocumented exception type. The value
            # itself is left out of the message because it may be enormous.
            raise ValueError(
                f"data[{i}] is too large to be represented as a float"
            ) from None
        if not is_finite:
            raise ValueError(f"data[{i}] is not finite: {v}")


# ---------------------------------------------------------------------------
# Computation helpers
# ---------------------------------------------------------------------------


def _compute_median(sorted_data: List[float]) -> float:
    """Return the median of an already-sorted, non-empty list."""
    n = len(sorted_data)
    mid = n // 2
    if n % 2 == 1:
        return float(sorted_data[mid])
    # Even count: average the two central elements.
    return (sorted_data[mid - 1] + sorted_data[mid]) / 2.0


def _compute_quartiles(sorted_data: List[float]) -> Tuple[float, float]:
    """Return (Q1, Q3) using the median-of-halves (exclusive) method.

    - Q1 = median of the lower half (excluding the median element for odd N).
    - Q3 = median of the upper half (excluding the median element for odd N).
    - Edge cases: N=1 -> Q1=Q3=value; N=2 -> Q1=min, Q3=max.
    """
    n = len(sorted_data)

    if n == 1:
        only = float(sorted_data[0])
        return (only, only)

    if n == 2:
        return (float(sorted_data[0]), float(sorted_data[1]))

    mid = n // 2
    lower = sorted_data[:mid]
    # For odd N the central element belongs to neither half.
    upper = sorted_data[mid + 1 :] if n % 2 == 1 else sorted_data[mid:]

    return (_compute_median(lower), _compute_median(upper))


def _compute_mode(sorted_data: List[float]) -> List[float]:
    """Return the mode(s) of an already-sorted, non-empty list.

    - Returns all values tied for the highest frequency, sorted ascending.
    - If no value repeats (this includes a single-element list), returns
      ``[]``.
    - If several distinct values all appear the same number of times,
      returns ``[]`` — no meaningful mode.
    - If one value makes up the whole list and appears at least twice,
      that value is the mode.
    """
    counts = Counter(float(v) for v in sorted_data)
    max_freq = max(counts.values())

    # Nothing repeats -> no mode.
    if max_freq == 1:
        return []

    modes = sorted(v for v, c in counts.items() if c == max_freq)

    # Every distinct value is tied (and there is more than one) -> no mode.
    if len(counts) > 1 and len(modes) == len(counts):
        return []

    return modes


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def compute_descriptive_statistics(
    data: List[float],
) -> DescriptiveStatisticsResult:
    """Compute descriptive statistics for a list of numbers.

    Variance and standard deviation use the population formula (divide by
    N). Quartiles use the median-of-halves (exclusive) method.

    Args:
        data: Non-empty list of finite numbers (int or float). The list is
            not modified; a sorted copy is used internally.

    Returns:
        A ``DescriptiveStatisticsResult`` dict containing:
        count, mean, median, mode, standard_deviation, variance,
        min, max, q1, q3, iqr, range.

    Raises:
        TypeError: If *data* is not a list or contains non-numeric values
            (booleans included).
        ValueError: If *data* is empty, contains non-finite values, or
            contains an int that cannot be represented as a float.

    Note:
        Inputs near the float range limit are not handled specially;
        intermediate results such as squared deviations may exceed it.
    """
    _validate_data(data)

    sorted_data = sorted(data)
    n = len(sorted_data)

    # Central tendency
    mean = sum(sorted_data) / n
    median = _compute_median(sorted_data)
    mode = _compute_mode(sorted_data)

    # Spread — population formula (divide by N)
    variance = sum((x - mean) ** 2 for x in sorted_data) / n
    variance = max(0.0, variance)  # defensive clamp before sqrt
    std_dev = math.sqrt(variance)

    # Extremes
    data_min = float(sorted_data[0])
    data_max = float(sorted_data[-1])
    data_range = data_max - data_min

    # Quartiles
    q1, q3 = _compute_quartiles(sorted_data)
    iqr = q3 - q1

    return DescriptiveStatisticsResult(
        count=n,
        mean=mean,
        median=median,
        mode=mode,
        standard_deviation=std_dev,
        variance=variance,
        min=data_min,
        max=data_max,
        q1=q1,
        q3=q3,
        iqr=iqr,
        range=data_range,
    )
All values must be finite (no Infinity or NaN).", + } + }, + "required": ["data"], + "additionalProperties": False, + }, + }, + }, # END PLOT FUNCTIONS # START ANGLE FUNCTIONS {