diff --git a/machine_learning/linear_regression.cpp b/machine_learning/linear_regression.cpp new file mode 100644 index 00000000000..365bebb63f8 --- /dev/null +++ b/machine_learning/linear_regression.cpp @@ -0,0 +1,287 @@ +/** + * @file + * @brief Implementation of + * [Linear Regression](https://en.wikipedia.org/wiki/Linear_regression) using + * [Gradient Descent](https://en.wikipedia.org/wiki/Gradient_descent) + * + * @author + * [Abhinav Prakash](https://github.com/abhinavprakash-x) + * + * @details + * Linear Regression is a supervised machine learning algorithm that models + * the relationship between a dependent variable `y` and one or more + * independent variables `X` using a linear function. + * + * ### Key Concepts + * - **Equation:** y = W·X + b + * where: + * - y is the scalar output (target variable) + * - X is the feature vector (independent variables) + * - W is the weight vector (slope) + * - b is the bias term (intercept) + * + * - **Prediction:** ŷ = W·Xᵢ + b + * - **Cost Function:** J(W, b) = (1 / 2m) * Σ (ŷ - y)² + λ * ||W||² + * - **Gradient Updates:** + * W := W - α * (1/m) * Xᵀ(ŷ - y) + * b := b - α * (1/m) * Σ(ŷ - y) + * + * ### Notes + * - Implements batch gradient descent for optimization. + * - Supports feature normalization and L2 regularization (Ridge penalty). + * - Uses only the C++17 Standard Library (no external dependencies). + * + * @see [Wikipedia: Linear + * Regression](https://en.wikipedia.org/wiki/Linear_regression) + * @see [Wikipedia: Gradient + * Descent](https://en.wikipedia.org/wiki/Gradient_descent) + * + */ + +#include +#include +#include + +/** + * @namespace machine_learning + * @brief Machine Learning Algorithms + * @note All members are kept public intentionally for simplicity and + * educational clarity. + */ +namespace machine_learning { +class LinearRegression { + public: + std::vector weights; + double bias; + size_t epochs; + double learning_rate; // Denoted by alpha + double lambda; // Regularization Term + std::vector mean; // For Normalization of Data + std::vector std_deviation; // For Normalization of Data + + LinearRegression(double lr = 0.01, size_t e = 1000, double reg = 0.1) + : bias(0.0), learning_rate(lr), epochs(e), lambda(reg) {} + + /** + * @brief Compute Mean and Standard Deviation for each feature. + * @param X Array of Feature Vectors + */ + void compute_normalization_params( + const std::vector>& X) { + size_t m = X.size(); + size_t n = X[0].size(); + mean.resize(n, 0.0); + std_deviation.resize(n, 0.0); + + for (size_t j = 0; j < n; ++j) { + for (size_t i = 0; i < m; ++i) { + mean[j] += X[i][j]; + } + mean[j] /= static_cast(m); + + for (size_t i = 0; i < m; ++i) { + std_deviation[j] += std::pow(X[i][j] - mean[j], 2); + } + std_deviation[j] = + std::sqrt(std_deviation[j] / static_cast(m)); + if (std_deviation[j] == 0) + std_deviation[j] = 1; + } + } + + /** + * @brief Apply Normalization to features + * @param X Feature matrix to normalize. + * @return Normalized feature matrix. + * + * @details + * Equation: X_norm = (X - mean) / std_deviation + * This is called Z Score Normalization and it converts + * large numbers to smaller numbers for easier computation. + * + * eg. {100,200,123} -> {0.1,0.2,0.123} + */ + std::vector> normalize_features( + const std::vector>& X) const { + std::vector> X_norm = X; + for (size_t i = 0; i < X.size(); ++i) { + for (size_t j = 0; j < X[0].size(); ++j) { + X_norm[i][j] = (X[i][j] - mean[j]) / std_deviation[j]; + } + } + return X_norm; + } + + /** + * @brief Compute the Cost function (Mean Squared Error and + * L2 Regularization). + * @param X Feature Matrix + * @param y Outputs Corresponding to X + * @return Cost Function + * + * @details + * Calculates Cost function as + * J(w,b) = (1 / 2m) * Σ (ŷ - y)² + λ * ||W||² + * where, m is the number of examples in dataset + * y_hat is the predicted value by model + * y is the output value given in dataset + * lambda is the regularization term + * ||W||^2 is the L2 Norm + */ + double compute_cost(const std::vector>& X, + const std::vector& y) const { + size_t m = X.size(); + size_t n = X[0].size(); + double cost = 0.0; + + for (size_t i = 0; i < m; ++i) { + double prediction = bias; + for (size_t j = 0; j < n; ++j) { + prediction += weights[j] * X[i][j]; + } + double error = prediction - y[i]; + cost += error * error; + } + cost /= (2.0 * m); + + double reg = 0.0; + for (double w : weights) { + reg += w * w; + } + + cost += (lambda / (2.0 * m)) * reg; + return cost; + } + + /** + * @brief Fit Data into LR Model + * @param X_raw The Feature Values given in dataset + * @param y the output values corresponding to X + * + * @details + * Values of w and b are calculated and updated over + * multiple epochs. + * MSE (mean squared error) = (prediction - y)^2 + * y_hat (prediction) = w*X + b + * grad_w = partial derivative of Cost over w + * grad_w = MSE * X + * grad_b = MSE + * + * Parameters are simultaneously updated as: + * w := w - alpha * grad_w + * b := b - alpha * grad_b + * + * this is called batch gradient descent + * where alpha is learning rate + * Finally everything is regularized to prevent model from "Overfitting" + */ + void fit(const std::vector>& X_raw, + const std::vector& y) { + compute_normalization_params(X_raw); + std::vector> X = normalize_features(X_raw); + + size_t m = X.size(); + size_t n = X[0].size(); + weights.assign(n, 0.0); + bias = 0.0; + + for (size_t epoch = 0; epoch < epochs; ++epoch) { + std::vector y_hat(m, 0.0); + for (size_t i = 0; i < m; ++i) { + y_hat[i] = bias; + for (size_t j = 0; j < n; ++j) { + y_hat[i] += weights[j] * X[i][j]; + } + } + + std::vector grad_w(n, 0.0); + double grad_b = 0.0; + + for (size_t i = 0; i < m; ++i) { + double error = y_hat[i] - y[i]; + grad_b += error; + for (size_t j = 0; j < n; ++j) { + grad_w[j] += error * X[i][j]; + } + } + + for (size_t j = 0; j < n; ++j) { + grad_w[j] = grad_w[j] / m + (lambda / m) * weights[j]; + } + grad_b /= m; + + for (size_t j = 0; j < n; ++j) { + weights[j] -= learning_rate * grad_w[j]; + } + bias -= learning_rate * grad_b; + } + } + + /** + * @brief Predict Outputs using LR Model on given features + * @param X_raw The Feature Values given in dataset + * @return The Prediced Values + * + * @details + * Once Model has found optimized values for w and b + * the prediction is calculated. + */ + std::vector predict( + const std::vector>& X_raw) const { + std::vector> X = normalize_features(X_raw); + size_t m = X.size(); + size_t n = X[0].size(); + + std::vector predictions(m, 0.0); + for (size_t i = 0; i < m; ++i) { + double pred = bias; + for (size_t j = 0; j < n; ++j) pred += weights[j] * X[i][j]; + predictions[i] = pred; + } + return predictions; + } +}; +} // namespace machine_learning + +/** + * @brief Test routine for Linear Regression + * + * @details + * + * For this test case (Synthetic Dataset) + * The Predicted Values are Close to the real values + * So the model is doing good and has accuracy of R^2 = 0.998 + */ +static void test() { + // Synthetic dataset with correlated features and noise + std::vector> X = {{1.0, 2.1, 3.9}, {2.0, 3.9, 6.1}, + {3.0, 6.2, 9.0}, {4.0, 8.1, 12.2}, + {5.0, 10.2, 14.0}, {6.0, 12.1, 17.8}, + {7.0, 13.9, 20.9}, {8.0, 16.3, 24.2}}; + + // Underlying true relation: y = 0.5*x1 + 0.3*x2 + 0.2*x3 + noise + std::vector y = {3.9, 6.8, 9.9, 12.5, 15.1, 18.4, 21.2, 24.6}; + + // Instantiate model with noticeable regularization + machine_learning::LinearRegression model(0.01, 8000, 1.0); + + model.fit(X, y); + auto preds = model.predict(X); + + // Check predictions + for (size_t i = 0; i < y.size(); ++i) { + assert(std::fabs(preds[i] - y[i]) < 1.0); + } + + // Check cost after training + double final_cost = model.compute_cost(model.normalize_features(X), y); + assert(final_cost < 1.0); +} + +/** + * @brief Main function to execute the test + */ +int main() { + test(); + return 0; +} \ No newline at end of file