diff --git a/.history/011/k_means_beginner_20221007003622.pynb b/.history/011/k_means_beginner_20221007003622.pynb new file mode 100644 index 00000000..e69de29b diff --git a/.history/011/k_means_beginner_20221007003708.pynb b/.history/011/k_means_beginner_20221007003708.pynb new file mode 100644 index 00000000..08066618 --- /dev/null +++ b/.history/011/k_means_beginner_20221007003708.pynb @@ -0,0 +1,9 @@ +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) +import os +import matplotlib.pyplot as plt +from sklearn import cluster +from sklearn import preprocessing +import plotly.express as px +from sklearn.datasets import make_blobs +plt.style.use('dark_background') \ No newline at end of file diff --git a/.history/011/k_means_beginner_20221007003710.pynb b/.history/011/k_means_beginner_20221007003710.pynb new file mode 100644 index 00000000..283332ac --- /dev/null +++ b/.history/011/k_means_beginner_20221007003710.pynb @@ -0,0 +1,9 @@ +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) +import os +import matplotlib.pyplot as plt +from sklearn import cluster +from sklearn import preprocessing +import plotly.express as px +from sklearn.datasets import make_blobs +plt.style.use('dark_background') diff --git a/.history/011/k_means_beginner_20221007003736.pynb b/.history/011/k_means_beginner_20221007003736.pynb new file mode 100644 index 00000000..a5c5fe04 --- /dev/null +++ b/.history/011/k_means_beginner_20221007003736.pynb @@ -0,0 +1,10 @@ +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) +import os +import matplotlib.pyplot as plt +from sklearn import cluster +from sklearn import preprocessing +import plotly.express as px +from sklearn.datasets import make_blobs +plt.style.use('dark_background') + diff --git a/.history/011/readme_20221007002635.md b/.history/011/readme_20221007002635.md new file mode 100644 index 00000000..e69de29b diff --git a/.history/011/readme_20221007002656.md b/.history/011/readme_20221007002656.md new file mode 100644 index 00000000..704d8c12 --- /dev/null +++ b/.history/011/readme_20221007002656.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement k-NN from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how kNN works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. +- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. 
Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002718.md b/.history/011/readme_20221007002718.md new file mode 100644 index 00000000..19463c97 --- /dev/null +++ b/.history/011/readme_20221007002718.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement k-Means from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how kNN works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. +- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. 
+ +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. 
If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002722.md b/.history/011/readme_20221007002722.md new file mode 100644 index 00000000..1fdda68e --- /dev/null +++ b/.history/011/readme_20221007002722.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement k-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how kNN works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. +- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. 
+ +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002725.md b/.history/011/readme_20221007002725.md new file mode 100644 index 00000000..e993b549 --- /dev/null +++ b/.history/011/readme_20221007002725.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how kNN works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. 
+- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. 
If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002732.md b/.history/011/readme_20221007002732.md new file mode 100644 index 00000000..7775e8e5 --- /dev/null +++ b/.history/011/readme_20221007002732.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. +- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. 
+ +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002734.md b/.history/011/readme_20221007002734.md new file mode 100644 index 00000000..4f73b357 --- /dev/null +++ b/.history/011/readme_20221007002734.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how kmeans works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. 
+- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. 
If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002736.md b/.history/011/readme_20221007002736.md new file mode 100644 index 00000000..222f93e5 --- /dev/null +++ b/.history/011/readme_20221007002736.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. +- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. 
+ +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002824.md b/.history/011/readme_20221007002824.md new file mode 100644 index 00000000..4a6e2e37 --- /dev/null +++ b/.history/011/readme_20221007002824.md @@ -0,0 +1,41 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. 
+The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. 
If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003007.md b/.history/011/readme_20221007003007.md new file mode 100644 index 00000000..67f375ae --- /dev/null +++ b/.history/011/readme_20221007003007.md @@ -0,0 +1,41 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. 
+ +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003010.md b/.history/011/readme_20221007003010.md new file mode 100644 index 00000000..1277294b --- /dev/null +++ b/.history/011/readme_20221007003010.md @@ -0,0 +1,42 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. 
Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. 
If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003020.md b/.history/011/readme_20221007003020.md new file mode 100644 index 00000000..6e69286e --- /dev/null +++ b/.history/011/readme_20221007003020.md @@ -0,0 +1,42 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. 
+ +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003034.md b/.history/011/readme_20221007003034.md new file mode 100644 index 00000000..a6a428b7 --- /dev/null +++ b/.history/011/readme_20221007003034.md @@ -0,0 +1,24 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. 
Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003105.md b/.history/011/readme_20221007003105.md new file mode 100644 index 00000000..7ebbec21 --- /dev/null +++ b/.history/011/readme_20221007003105.md @@ -0,0 +1,94 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +The centroids of the K clusters, which can be used to label new data +Labels for the training data (each data point is assigned to a single cluster) +K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this assumption, K-Means may underperform on data whose clusters are not roughly spherical or well separated. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm: +The K-means clustering algorithm takes as input the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to the cluster given by + +$\underset{c_i \in C}{\arg\min}\ \mathrm{dist}(c_i, x)^2$ + +where $\mathrm{dist}(\cdot)$ is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Running the algorithm more than once with randomized starting centroids and comparing the results may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm.
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same cluster. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient for a single sample is then given as: + +$s = \frac{b - a}{\max(a, b)}$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is not defined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance: It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance: the total sum of the absolute differences between the x-coordinates and y-coordinates.
+$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm. + +Computing the sum of absolute values (MAE) corresponds to the ℓ1 norm. It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View on nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003116.md b/.history/011/readme_20221007003116.md new file mode 100644 index 00000000..c2b93697 --- /dev/null +++ b/.history/011/readme_20221007003116.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data.
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +The centroids of the K clusters, which can be used to label new data +Labels for the training data (each data point is assigned to a single cluster) +K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this assumption, K-Means may underperform on data whose clusters are not roughly spherical or well separated. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm: +The K-means clustering algorithm takes as input the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to the cluster given by + +$\underset{c_i \in C}{\arg\min}\ \mathrm{dist}(c_i, x)^2$ + +where $\mathrm{dist}(\cdot)$ is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Running the algorithm more than once with randomized starting centroids and comparing the results may give a better outcome.
+ +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same cluster. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient for a single sample is then given as: + +$s = \frac{b - a}{\max(a, b)}$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is not defined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance: It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space.
The closer the point vectors are by angle, the higher the Cosine Similarity. +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance: the total sum of the absolute differences between the x-coordinates and y-coordinates. +$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm. + +Computing the sum of absolute values (MAE) corresponds to the ℓ1 norm. It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View on nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003119.md b/.history/011/readme_20221007003119.md new file mode 100644 index 00000000..c2b93697 --- /dev/null +++ b/.history/011/readme_20221007003119.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall.
Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +The centroids of the K clusters, which can be used to label new data +Labels for the training data (each data point is assigned to a single cluster) +K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). 
+ +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003122.md b/.history/011/readme_20221007003122.md new file mode 100644 index 00000000..4dd799e0 --- /dev/null +++ b/.history/011/readme_20221007003122.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. 
+The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +Labels for the training data (each data point is assigned to a single cluster) +K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. 
Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. 
+The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003124.md b/.history/011/readme_20221007003124.md new file mode 100644 index 00000000..33694761 --- /dev/null +++ b/.history/011/readme_20221007003124.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this, K-Means may underperform sometimes. 
+ +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. 
Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. 
Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003126.md b/.history/011/readme_20221007003126.md new file mode 100644 index 00000000..f5b50a5d --- /dev/null +++ b/.history/011/readme_20221007003126.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10), plot the K values against the SSE (Sum of Squared Errors), and select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class (denoted a). +The mean distance between a sample and all other points in the next nearest cluster (denoted b). +The Silhouette Coefficient for a single sample is then given as: + +$s = \frac{b - a}{\max(a, b)}$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View on nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003130.md b/.history/011/readme_20221007003130.md new file mode 100644 index 00000000..76733a18 --- /dev/null +++ b/.history/011/readme_20221007003130.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data.
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +The K-means clustering algorithm's inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to a cluster based on + +$\underset{c_i \in C}{\arg\min} \; \mathrm{dist}(c_i, x)^2$ + +where $\mathrm{dist}(\cdot)$ is the standard ($L_2$) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome.
+ +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10), plot the K values against the SSE (Sum of Squared Errors), and select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class (denoted a). +The mean distance between a sample and all other points in the next nearest cluster (denoted b). +The Silhouette Coefficient for a single sample is then given as: + +$s = \frac{b - a}{\max(a, b)}$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space.
The closer the point vectors are by angle, the higher the cosine similarity. +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates. +$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View on nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003131.md b/.history/011/readme_20221007003131.md new file mode 100644 index 00000000..a4f6ffeb --- /dev/null +++ b/.history/011/readme_20221007003131.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall.
Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value +converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +The K-means clustering algorithm's inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to a cluster based on + +$\underset{c_i \in C}{\arg\min} \; \mathrm{dist}(c_i, x)^2$ + +where $\mathrm{dist}(\cdot)$ is the standard ($L_2$) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached).
+ +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10), plot the K values against the SSE (Sum of Squared Errors), and select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class (denoted a). +The mean distance between a sample and all other points in the next nearest cluster (denoted b). +The Silhouette Coefficient for a single sample is then given as: + +$s = \frac{b - a}{\max(a, b)}$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters.
+ +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates. +$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View on nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003134.md b/.history/011/readme_20221007003134.md new file mode 100644 index 00000000..c2b6363d --- /dev/null +++ b/.history/011/readme_20221007003134.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch.
+The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +The K-means clustering algorithm's inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to a cluster based on + +$\underset{c_i \in C}{\arg\min} \; \mathrm{dist}(c_i, x)^2$ + +where $\mathrm{dist}(\cdot)$ is the standard ($L_2$) Euclidean distance. + +2.
Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10), plot the K values against the SSE (Sum of Squared Errors), and select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class (denoted a). +The mean distance between a sample and all other points in the next nearest cluster (denoted b).
+The Silhouette Coefficient for a single sample is then given as: + +$s = \frac{b - a}{\max(a, b)}$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates. +$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks.
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View on nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003140.md b/.history/011/readme_20221007003140.md new file mode 100644 index 00000000..dda0c952 --- /dev/null +++ b/.history/011/readme_20221007003140.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes.
+ +# Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +The K-means clustering algorithm's inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to a cluster based on + +$\underset{c_i \in C}{\arg\min} \; \mathrm{dist}(c_i, x)^2$ + +where $\mathrm{dist}(\cdot)$ is the standard ($L_2$) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid.
Increasing the number of clusters (K) will always reduce the distance to data points, and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10), plot the K values against the SSE (Sum of Squared Errors), and select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class (denoted a). +The mean distance between a sample and all other points in the next nearest cluster (denoted b). +The Silhouette Coefficient for a single sample is then given as: + +$s = \frac{b - a}{\max(a, b)}$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates. +$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values.
Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View on nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003142.md b/.history/011/readme_20221007003142.md new file mode 100644 index 00000000..8c704952 --- /dev/null +++ b/.history/011/readme_20221007003142.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity.
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +The K-means clustering algorithm's inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to a cluster based on + +$\underset{c_i \in C}{\arg\min} \; \mathrm{dist}(c_i, x)^2$ + +where $\mathrm{dist}(\cdot)$ is the standard ($L_2$) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm.
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10), plot the K values against the SSE (Sum of Squared Errors), and select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class (denoted a). +The mean distance between a sample and all other points in the next nearest cluster (denoted b). +The Silhouette Coefficient for a single sample is then given as: + +$s = \frac{b - a}{\max(a, b)}$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+$$\text{ManhattanDistance} = |x_1 - x_2| + |y_1 - y_2|$$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View in nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003148.md b/.history/011/readme_20221007003148.md new file mode 100644 index 00000000..5b7f1851 --- /dev/null +++ b/.history/011/readme_20221007003148.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with an unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +-Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. 
+ +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. 
Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003150.md b/.history/011/readme_20221007003150.md new file mode 100644 index 00000000..8a676dd3 --- /dev/null +++ b/.history/011/readme_20221007003150.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. 
Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +-Document Classification +-Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). 
+ +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003152.md b/.history/011/readme_20221007003152.md new file mode 100644 index 00000000..3bbf3d53 --- /dev/null +++ b/.history/011/readme_20221007003152.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. 
+The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +-Document Classification +-Delivery Store Optimization +- Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. 
Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. 
+The Silhouette Coefficient for a single sample is then given as (with a the first score above and b the second): + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance : is the total sum of the absolute differences between the x-coordinates and the y-coordinates. +$$\text{ManhattanDistance} = |x_1 - x_2| + |y_1 - y_2|$$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003153.md b/.history/011/readme_20221007003153.md new file mode 100644 index 00000000..10660327 --- /dev/null +++ b/.history/011/readme_20221007003153.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. 
+ +# Use Cases: + +-Document Classification + -Delivery Store Optimization +- Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. 
Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. 
Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003155.md b/.history/011/readme_20221007003155.md new file mode 100644 index 00000000..f6f9838f --- /dev/null +++ b/.history/011/readme_20221007003155.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003158.md b/.history/011/readme_20221007003158.md new file mode 100644 index 00000000..d82ae2c7 --- /dev/null +++ b/.history/011/readme_20221007003158.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +The K-means clustering algorithm's inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to a cluster based on + +$$\underset{c_i \in C}{\arg\min} \; \operatorname{dist}(c_i, x)^2$$ + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome.
+ +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information-theoretic jump method, and the G-means algorithm. + +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient s for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans and calculate the Silhouette Coefficient for each sample (it is not defined for a single cluster). + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space.
The closer the point vectors are by angle, the higher the Cosine Similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : It is the sum of the absolute differences between the x-coordinates and the y-coordinates. +$$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View in nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003202.md b/.history/011/readme_20221007003202.md new file mode 100644 index 00000000..2df83ad9 --- /dev/null +++ b/.history/011/readme_20221007003202.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall.
Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. +Algorithm : +The K-means clustering algorithm's inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to a cluster based on + +$$\underset{c_i \in C}{\arg\min} \; \operatorname{dist}(c_i, x)^2$$ + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster.
+ +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information-theoretic jump method, and the G-means algorithm. + +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient s for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans and calculate the Silhouette Coefficient for each sample (it is not defined for a single cluster).
+ +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : It is the sum of the absolute differences between the x-coordinates and the y-coordinates. +$$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks.
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View in nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003206.md b/.history/011/readme_20221007003206.md new file mode 100644 index 00000000..2cb2e75a --- /dev/null +++ b/.history/011/readme_20221007003206.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes.
+ +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. +# Algorithm : +The K-means clustering algorithm's inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to a cluster based on + +$$\underset{c_i \in C}{\arg\min} \; \operatorname{dist}(c_i, x)^2$$ + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information-theoretic jump method, and the G-means algorithm. + +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid.
Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient s for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans and calculate the Silhouette Coefficient for each sample (it is not defined for a single cluster). + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : It is the sum of the absolute differences between the x-coordinates and the y-coordinates. +$$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values.
Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View in nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003207.md b/.history/011/readme_20221007003207.md new file mode 100644 index 00000000..74d902a6 --- /dev/null +++ b/.history/011/readme_20221007003207.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity.
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +The K-means clustering algorithm's inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to a cluster based on + +$$\underset{c_i \in C}{\arg\min} \; \operatorname{dist}(c_i, x)^2$$ + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information-theoretic jump method, and the G-means algorithm.
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient s for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans and calculate the Silhouette Coefficient for each sample (it is not defined for a single cluster). + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : It is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+$$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View in nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003209.md b/.history/011/readme_20221007003209.md new file mode 100644 index 00000000..f6ca7c7b --- /dev/null +++ b/.history/011/readme_20221007003209.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data.
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- The K-means clustering algorithm's inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to a cluster based on + +$$\underset{c_i \in C}{\arg\min} \; \operatorname{dist}(c_i, x)^2$$ + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome.
+ +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information-theoretic jump method, and the G-means algorithm. + +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient s for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans and calculate the Silhouette Coefficient for each sample (it is not defined for a single cluster). + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space.
The closer the point vectors are by angle, the higher the Cosine Similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\|\vec{a}\| \, \|\vec{b}\|} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : It is the sum of the absolute differences between the x-coordinates and the y-coordinates. +$$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View in nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003211.md b/.history/011/readme_20221007003211.md new file mode 100644 index 00000000..32e73b65 --- /dev/null +++ b/.history/011/readme_20221007003211.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall.
Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- The K-means clustering algorithm's inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to a cluster based on + +$$\underset{c_i \in C}{\arg\min} \; \operatorname{dist}(c_i, x)^2$$ + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster.
+ +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information-theoretic jump method, and the G-means algorithm. + +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient s for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans and calculate the Silhouette Coefficient for each sample (it is not defined for a single cluster).
+ +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in machine learning are: + +Cosine distance: it is based on the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance: the sum of the absolute differences between the x-coordinates and between the y-coordinates of two points. +$$\text{ManhattanDistance} = |x_1 - x_2| + |y_1 - y_2|$$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm (...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm (...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks.
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003214.md b/.history/011/readme_20221007003214.md new file mode 100644 index 00000000..b69c631e --- /dev/null +++ b/.history/011/readme_20221007003214.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. 
+ +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: +- +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. 
Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. 
Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003216.md b/.history/011/readme_20221007003216.md new file mode 100644 index 00000000..32e73b65 --- /dev/null +++ b/.history/011/readme_20221007003216.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003220.md b/.history/011/readme_20221007003220.md new file mode 100644 index 00000000..92cc61a2 --- /dev/null +++ b/.history/011/readme_20221007003220.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. 
+ +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. 
Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003227.md b/.history/011/readme_20221007003227.md new file mode 100644 index 00000000..92cc61a2 --- /dev/null +++ b/.history/011/readme_20221007003227.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. 
Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. 
+ +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+ +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003230.md b/.history/011/readme_20221007003230.md new file mode 100644 index 00000000..f8268982 --- /dev/null +++ b/.history/011/readme_20221007003230.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. 
+ +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. 
Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. 
Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003232.md b/.history/011/readme_20221007003232.md new file mode 100644 index 00000000..db9e1bd8 --- /dev/null +++ b/.history/011/readme_20221007003232.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- The K-means clustering algorithm takes the number of clusters K and the data set as inputs. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. If $c_i$ is a centroid in the set of centroids $C$, then each data point $x$ is assigned to a cluster based on +$$\underset{c_i \in C}{\arg\min} \; \operatorname{dist}(c_i, x)^2$$ +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm.
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient s for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+$$\text{ManhattanDistance} = |x_1 - x_2|+|y_1 - y_2|$$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm (...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm, (...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[Open in Colab](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[View in nbviewer](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003250.md b/.history/011/readme_20221007003250.md new file mode 100644 index 00000000..65b55227 --- /dev/null +++ b/.history/011/readme_20221007003250.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data.
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient s for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003255.md b/.history/011/readme_20221007003255.md new file mode 100644 index 00000000..df87f7e6 --- /dev/null +++ b/.history/011/readme_20221007003255.md @@ -0,0 +1,92 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient s for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003257.md b/.history/011/readme_20221007003257.md new file mode 100644 index 00000000..65b55227 --- /dev/null +++ b/.history/011/readme_20221007003257.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient s for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003307.md b/.history/011/readme_20221007003307.md new file mode 100644 index 00000000..5c10e22f --- /dev/null +++ b/.history/011/readme_20221007003307.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient s for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003318.md b/.history/011/readme_20221007003318.md new file mode 100644 index 00000000..715da329 --- /dev/null +++ b/.history/011/readme_20221007003318.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +4. Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient s for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1+a_2 b_2+\dots+a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003328.md b/.history/011/readme_20221007003328.md new file mode 100644 index 00000000..68ed1d62 --- /dev/null +++ b/.history/011/readme_20221007003328.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-means may underperform when the true clusters are not roughly spherical. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection, etc. + +# Algorithm: +- The inputs to the K-means clustering algorithm are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge to a local optimum. Running the algorithm several times with randomized starting centroids and comparing the results may give a better outcome. + +3. Choosing K + +If the true labels are not known in advance, then K-means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information-theoretic jump method, and the G-means algorithm.
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between the data points and their cluster centroids. Increasing the number of clusters (K) will always reduce the distance from the data points to their centroids, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10), plot the K values against the SSE (Sum of Squared Errors), and select the value of K at the elbow point. + +3.2. Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is not defined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance: It is based on the cosine of the angle between the vectors of the two points in n-dimensional space. The closer the vectors are in angle, the higher the cosine similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance: the sum of the absolute differences between the x-coordinates and the y-coordinates of the two points.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003330.md b/.history/011/readme_20221007003330.md new file mode 100644 index 00000000..1f7312d3 --- /dev/null +++ b/.history/011/readme_20221007003330.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-means may underperform when the true clusters are not roughly spherical. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection, etc. + +# Algorithm: +- The inputs to the K-means clustering algorithm are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge to a local optimum. Running the algorithm several times with randomized starting centroids and comparing the results may give a better outcome. + +3. Choosing K + +If the true labels are not known in advance, then K-means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information-theoretic jump method, and the G-means algorithm.
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between the data points and their cluster centroids. Increasing the number of clusters (K) will always reduce the distance from the data points to their centroids, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10), plot the K values against the SSE (Sum of Squared Errors), and select the value of K at the elbow point. + +3.2 Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is not defined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance: It is based on the cosine of the angle between the vectors of the two points in n-dimensional space. The closer the vectors are in angle, the higher the cosine similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance: the sum of the absolute differences between the x-coordinates and the y-coordinates of the two points.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003332.md b/.history/011/readme_20221007003332.md new file mode 100644 index 00000000..2fe086b5 --- /dev/null +++ b/.history/011/readme_20221007003332.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-means may underperform when the true clusters are not roughly spherical. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection, etc. + +# Algorithm: +- The inputs to the K-means clustering algorithm are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge to a local optimum. Running the algorithm several times with randomized starting centroids and comparing the results may give a better outcome. + +3. Choosing K + +If the true labels are not known in advance, then K-means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information-theoretic jump method, and the G-means algorithm.
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between the data points and their cluster centroids. Increasing the number of clusters (K) will always reduce the distance from the data points to their centroids, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10), plot the K values against the SSE (Sum of Squared Errors), and select the value of K at the elbow point. + +3. Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is not defined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance: It is based on the cosine of the angle between the vectors of the two points in n-dimensional space. The closer the vectors are in angle, the higher the cosine similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance: the sum of the absolute differences between the x-coordinates and the y-coordinates of the two points.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003334.md b/.history/011/readme_20221007003334.md new file mode 100644 index 00000000..5bd7bf56 --- /dev/null +++ b/.history/011/readme_20221007003334.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-means may underperform when the true clusters are not roughly spherical. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection, etc. + +# Algorithm: +- The inputs to the K-means clustering algorithm are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge to a local optimum. Running the algorithm several times with randomized starting centroids and comparing the results may give a better outcome. + +3. Choosing K + +If the true labels are not known in advance, then K-means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information-theoretic jump method, and the G-means algorithm.
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between the data points and their cluster centroids. Increasing the number of clusters (K) will always reduce the distance from the data points to their centroids, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10), plot the K values against the SSE (Sum of Squared Errors), and select the value of K at the elbow point. + + Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is not defined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance: It is based on the cosine of the angle between the vectors of the two points in n-dimensional space. The closer the vectors are in angle, the higher the cosine similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance: the sum of the absolute differences between the x-coordinates and the y-coordinates of the two points.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003336.md b/.history/011/readme_20221007003336.md new file mode 100644 index 00000000..5cbffc21 --- /dev/null +++ b/.history/011/readme_20221007003336.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-means may underperform when the true clusters are not roughly spherical. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection, etc. + +# Algorithm: +- The inputs to the K-means clustering algorithm are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge to a local optimum. Running the algorithm several times with randomized starting centroids and comparing the results may give a better outcome. + +3. Choosing K + +If the true labels are not known in advance, then K-means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information-theoretic jump method, and the G-means algorithm.
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between the data points and their cluster centroids. Increasing the number of clusters (K) will always reduce the distance from the data points to their centroids, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10), plot the K values against the SSE (Sum of Squared Errors), and select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient for a single sample is then given as: + +$$s = \frac{b - a}{\max(a, b)}$$ + +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is not defined for a single cluster) and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +Cosine distance: It is based on the cosine of the angle between the vectors of the two points in n-dimensional space. The closer the vectors are in angle, the higher the cosine similarity. +$$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance: the sum of the absolute differences between the x-coordinates and the y-coordinates of the two points.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003340.md b/.history/011/readme_20221007003340.md new file mode 100644 index 00000000..29b8eb26 --- /dev/null +++ b/.history/011/readme_20221007003340.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-means may underperform when the true clusters are not roughly spherical. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection, etc. + +# Algorithm: +- The inputs to the K-means clustering algorithm are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge to a local optimum. Running the algorithm several times with randomized starting centroids and comparing the results may give a better outcome. + +3. Choosing K + +If the true labels are not known in advance, then K-means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information-theoretic jump method, and the G-means algorithm.
+ +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class (a). +The mean distance between a sample and all other points in the next nearest cluster (b). +The Silhouette Coefficient for a single sample is then given as: + +$s = \frac{b - a}{\max(a, b)}$ + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Euclidean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003349.md b/.history/011/readme_20221007003349.md new file mode 100644 index 00000000..24a1d754 --- /dev/null +++ b/.history/011/readme_20221007003349.md @@ -0,0 +1,89 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Euclidean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003351.md b/.history/011/readme_20221007003351.md new file mode 100644 index 00000000..f3cb192c --- /dev/null +++ b/.history/011/readme_20221007003351.md @@ -0,0 +1,88 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Euclidean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003352.md b/.history/011/readme_20221007003352.md new file mode 100644 index 00000000..0b83c686 --- /dev/null +++ b/.history/011/readme_20221007003352.md @@ -0,0 +1,87 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Euclidean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003402.md b/.history/011/readme_20221007003402.md new file mode 100644 index 00000000..a1993708 --- /dev/null +++ b/.history/011/readme_20221007003402.md @@ -0,0 +1,87 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Euclidean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert \vec{a} \rVert \, \lVert \vec{b} \rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance : is the sum of the absolute differences between the x-coordinates and the y-coordinates.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003416.md b/.history/011/readme_20221007003416.md new file mode 100644 index 00000000..8aece63e --- /dev/null +++ b/.history/011/readme_20221007003416.md @@ -0,0 +1,83 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Euclidean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity +$\text{ManhattanDistance} = |x_1 - x_2| + |y_1 - y_2|$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values.
Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003418.md b/.history/011/readme_20221007003418.md new file mode 100644 index 00000000..f5cbd817 --- /dev/null +++ b/.history/011/readme_20221007003418.md @@ -0,0 +1,84 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- K-means clustering algorithm inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10) and, for each value of k, calculate the sum of squared errors (SSE).
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans and calculate the Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +- Cosine distance: It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. + +Manhattan distance = |x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm (...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm (...).
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003419.md b/.history/011/readme_20221007003419.md new file mode 100644 index 00000000..3c0c8c19 --- /dev/null +++ b/.history/011/readme_20221007003419.md @@ -0,0 +1,84 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003423.md b/.history/011/readme_20221007003423.md new file mode 100644 index 00000000..8892a874 --- /dev/null +++ b/.history/011/readme_20221007003423.md @@ -0,0 +1,83 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003425.md b/.history/011/readme_20221007003425.md new file mode 100644 index 00000000..b5296fef --- /dev/null +++ b/.history/011/readme_20221007003425.md @@ -0,0 +1,82 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003427.md b/.history/011/readme_20221007003427.md new file mode 100644 index 00000000..f57c966c --- /dev/null +++ b/.history/011/readme_20221007003427.md @@ -0,0 +1,81 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003430.md b/.history/011/readme_20221007003430.md new file mode 100644 index 00000000..77bc7d22 --- /dev/null +++ b/.history/011/readme_20221007003430.md @@ -0,0 +1,82 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against the SSE (Sum of Squared Errors), then select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans and calculate the Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +- Cosine distance: It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. + +- Manhattan distance = |x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm (...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm (...).
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003506.md b/.history/011/readme_20221007003506.md new file mode 100644 index 00000000..feab2e8b --- /dev/null +++ b/.history/011/readme_20221007003506.md @@ -0,0 +1,82 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- K-means clustering algorithm inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE).
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against SSE (Sum of Squared Errors). Then select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...).
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003510.md b/.history/011/readme_20221007003510.md new file mode 100644 index 00000000..4b98d3c4 --- /dev/null +++ b/.history/011/readme_20221007003510.md @@ -0,0 +1,83 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- K-means clustering algorithm inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE).
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against SSE (Sum of Squared Errors). Then select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...).
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means +`knn_starter_exercise.ipynb` + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003513.md b/.history/011/readme_20221007003513.md new file mode 100644 index 00000000..d53fbb4c --- /dev/null +++ b/.history/011/readme_20221007003513.md @@ -0,0 +1,82 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- K-means clustering algorithm inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE).
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against SSE (Sum of Squared Errors). Then select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...).
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +[](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003531.md b/.history/011/readme_20221007003531.md new file mode 100644 index 00000000..01e37d56 --- /dev/null +++ b/.history/011/readme_20221007003531.md @@ -0,0 +1,79 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- K-means clustering algorithm inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE).
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against SSE (Sum of Squared Errors). Then select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...).
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003538.md b/.history/011/readme_20221007003538.md new file mode 100644 index 00000000..d84789a2 --- /dev/null +++ b/.history/011/readme_20221007003538.md @@ -0,0 +1,79 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. 
+ +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- K-means clustering algorithm inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE.
+ +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against SSE (Sum of Squared Errors). Then select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the coefficient is undefined for a single cluster) and calculate the Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks.
+ +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- \ No newline at end of file diff --git a/.history/011/readme_20221007003554.md b/.history/011/readme_20221007003554.md new file mode 100644 index 00000000..d476960f --- /dev/null +++ b/.history/011/readme_20221007003554.md @@ -0,0 +1,79 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. 
The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between steps one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against SSE (Sum of Squared Errors). Then select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class.
+The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the Silhouette Coefficient requires at least two clusters) and calculate the Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +- Cosine distance: It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. + +- Manhattan distance: |x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005746.md b/.history/011/readme_20221007005746.md new file mode 100644 index 00000000..be06c35d --- /dev/null +++ b/.history/011/readme_20221007005746.md @@ -0,0 +1,79 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle (e-vehicle) batteries. It provides batteries on a rental model to e-vehicle drivers.
Drivers typically rent a battery for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on the driver's driving history. The life of a battery depends on factors such as overspeeding, distance driven per day, etc. You, as an ML expert, have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm: +- The inputs to the K-means clustering algorithm are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters.
In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. 
+To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005749.md b/.history/011/readme_20221007005749.md new file mode 100644 index 00000000..16b1f172 --- /dev/null +++ b/.history/011/readme_20221007005749.md @@ -0,0 +1,78 @@ +# Problem Statement +Business challenge/requirement +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. Drivers rent battery typically for a day and then replace it with a charged battery from the company. 
Lithionpower has a variable pricing model based on driver's driving history. As the life of a battery depends on factors such as overspeeding, distance driven per day etc.You as a ML expert have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. 
Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005754.md b/.history/011/readme_20221007005754.md new file mode 100644 index 00000000..be06c35d --- /dev/null +++ b/.history/011/readme_20221007005754.md @@ -0,0 +1,79 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. Drivers rent battery typically for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on driver's driving history. 
As the life of a battery depends on factors such as overspeeding, distance driven per day etc.You as a ML expert have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. 
Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005813.md b/.history/011/readme_20221007005813.md new file mode 100644 index 00000000..e9efce1e --- /dev/null +++ b/.history/011/readme_20221007005813.md @@ -0,0 +1,78 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. Drivers rent battery typically for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on driver's driving history. 
As the life of a battery depends on factors such as overspeeding, distance driven per day etc.You as a ML expert have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +Drivers will be incentivized based on the cluster, so grouping has to be accurate +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. 
+ +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005815.md b/.history/011/readme_20221007005815.md new file mode 100644 index 00000000..e3d1322f --- /dev/null +++ b/.history/011/readme_20221007005815.md @@ -0,0 +1,78 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. Drivers rent battery typically for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on driver's driving history. 
The life of a battery depends on factors such as overspeeding, distance driven per day, etc. You, as an ML expert, have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +Drivers will be incentivized based on the cluster, so grouping has to be accurate. +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm: +- The inputs to the K-means clustering algorithm are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster.
+ +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005817.md b/.history/011/readme_20221007005817.md new file mode 100644 index 00000000..8c239492 --- /dev/null +++ b/.history/011/readme_20221007005817.md @@ -0,0 +1,79 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. Drivers rent battery typically for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on driver's driving history. 
The life of a battery depends on factors such as overspeeding, distance driven per day, etc. You, as an ML expert, have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +Drivers will be incentivized based on the cluster, so grouping has to be accurate. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm: +- The inputs to the K-means clustering algorithm are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster.
+ +The algorithm iterates between step one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against SSE (Sum of Squared Errors). Then select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the Silhouette Coefficient is undefined for a single cluster) and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. + +- Manhattan distance : |x1–x2| + |y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005822.md b/.history/011/readme_20221007005822.md new file mode 100644 index 00000000..75ee2e30 --- /dev/null +++ b/.history/011/readme_20221007005822.md @@ -0,0 +1,79 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle (e-vehicle) batteries. It provides batteries on a rental model to e-vehicle drivers. Drivers typically rent a battery for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on the driver's driving history. 
As the life of a battery depends on factors such as overspeeding, distance driven per day etc. You, as an ML expert, have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +Drivers will be incentivised based on the cluster, so grouping has to be accurate. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. 
+ +The algorithm iterates between step one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10) and plot the K values against SSE (Sum of Squared Errors). Then select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the Silhouette Coefficient is undefined for a single cluster) and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. + +- Manhattan distance : |x1–x2| + |y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+ +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/006/solution/ensemble_techniques.ipynb b/006/solution/ensemble_techniques.ipynb index 076959ea..08da2d51 100644 --- a/006/solution/ensemble_techniques.ipynb +++ b/006/solution/ensemble_techniques.ipynb @@ -1,6 +1,6 @@ { "cells": [ - { + { "cell_type": "markdown", "metadata": {}, "source": [ @@ -730,7 +730,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.7.4 ('base')", "language": "python", "name": "python3" }, @@ -744,7 +744,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.2" + "version": "3.7.4" + }, + "vscode": { + "interpreter": { + "hash": "b1e6b76b6e736d29445d5c5f779c1dafb0f59893c5766b7198bc0a87a8e7acf4" + } } }, "nbformat": 4, diff --git a/008/solution/NaiveBayes Solution.ipynb b/008/solution/NaiveBayes Solution.ipynb index 4380749b..5edf37f4 100644 --- a/008/solution/NaiveBayes Solution.ipynb +++ b/008/solution/NaiveBayes Solution.ipynb @@ -543,7 +543,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.7.4 ('base')", "language": "python", "name": "python3" }, @@ -557,7 +557,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.7.4" + }, + "vscode": { + "interpreter": { + "hash": "b1e6b76b6e736d29445d5c5f779c1dafb0f59893c5766b7198bc0a87a8e7acf4" + } } }, "nbformat": 4, diff --git a/011/data/driver-data.csv b/011/data/driver-data.csv new file mode 100644 index 00000000..2310c9f5 --- /dev/null +++ b/011/data/driver-data.csv @@ -0,0 +1,4001 @@ +id,mean_dist_day,mean_over_speed_perc +3423311935,71.24,28 +3423313212,52.53,25 +3423313724,64.54,27 +3423311373,55.69,22 +3423310999,54.58,25 
+3423313857,41.91,10 +3423312432,58.64,20 +3423311434,52.02,8 +3423311328,31.25,34 +3423312488,44.31,19 +3423311254,49.35,40 +3423312943,58.07,45 +3423312536,44.22,22 +3423311542,55.73,19 +3423312176,46.63,43 +3423314176,52.97,32 +3423314202,46.25,35 +3423311346,51.55,27 +3423310666,57.05,26 +3423313527,58.45,30 +3423312182,43.42,23 +3423313590,55.68,37 +3423312268,55.15,18 +3423314255,43.84,22 +3423311976,59.26,32 +3423312669,37.14,41 +3423310697,64.3,29 +3423312113,45.75,16 +3423313343,45.97,23 +3423311431,56.04,39 +3423310755,33.64,45 +3423311821,41.67,33 +3423314359,50.68,39 +3423313106,54.22,35 +3423310754,56.2,29 +3423310524,46.16,41 +3423311780,50.22,24 +3423312156,49.66,33 +3423312916,38.61,37 +3423310588,55.28,36 +3423312995,57.87,41 +3423313389,61.69,12 +3423311369,37.41,21 +3423311408,53.83,32 +3423311598,62.98,22 +3423312047,46.97,13 +3423312322,58.03,24 +3423313247,59.87,36 +3423310944,81.34,31 +3423312404,48.56,26 +3423313738,17.66,23 +3423311461,46.01,21 +3423313866,45.34,26 +3423312074,39.64,31 +3423312444,51.22,36 +3423311834,36.21,31 +3423311527,47.32,55 +3423310476,54.87,27 +3423310548,58.97,6 +3423311011,57.3,42 +3423310633,59.94,32 +3423310595,61.72,25 +3423312757,53.19,23 +3423313776,35.13,38 +3423312067,48.47,20 +3423312235,51.17,43 +3423310893,42.14,39 +3423314121,54.31,35 +3423313750,48.93,32 +3423312776,53.51,44 +3423312927,51.72,24 +3423310765,49.86,10 +3423311457,49.2,41 +3423310678,60.2,39 +3423312564,54.06,20 +3423313058,72.91,30 +3423310803,61.92,40 +3423312166,44.2,24 +3423312608,60.75,39 +3423310646,68.36,37 +3423314440,56.39,20 +3423312301,38.19,15 +3423311400,66.19,27 +3423313288,58.2,17 +3423314357,47.55,22 +3423311015,37.62,16 +3423312270,17.81,26 +3423313457,31.25,29 +3423311768,61,41 +3423314289,40.98,33 +3423310618,53.69,24 +3423311628,51.46,9 +3423313173,64.57,29 +3423310552,53.79,23 +3423314153,64.28,34 +3423313814,38.41,24 +3423312466,58.89,42 +3423314103,44.17,33 +3423311257,50.41,43 +3423311814,63.63,28 
+3423311017,45.78,24 +3423311127,55.61,20 +3423311066,67.23,5 +3423313316,49.69,38 +3423313648,54.6,30 +3423313558,40.94,59 +3423313353,43.84,36 +3423312285,56.36,19 +3423310853,69.53,23 +3423313991,51.48,19 +3423312791,52.93,41 +3423311030,55.52,8 +3423310575,52.14,23 +3423311959,62.86,21 +3423314029,37.53,33 +3423313843,63.47,25 +3423310628,62.11,33 +3423311869,43.52,25 +3423312133,47.97,27 +3423314085,42.27,37 +3423310504,56.05,30 +3423311462,62.08,37 +3423310974,35.25,53 +3423311980,39.81,25 +3423312931,60.27,33 +3423313867,53.41,22 +3423313838,56.28,27 +3423312956,53.31,25 +3423312985,55.24,30 +3423311863,44.97,29 +3423313131,40.3,43 +3423313166,47.18,42 +3423313841,46.4,34 +3423312477,35.11,15 +3423313132,54.33,38 +3423311934,54.59,64 +3423311470,61.67,50 +3423314234,52.39,19 +3423313633,52.37,6 +3423312165,40.84,25 +3423314381,27.42,27 +3423312732,44.79,31 +3423312525,59.84,37 +3423313793,36,45 +3423313029,47.64,10 +3423313920,51.85,37 +3423311655,59.73,15 +3423311576,42.96,37 +3423313408,51.84,27 +3423313342,59.62,23 +3423312729,58.82,30 +3423311987,50.93,30 +3423313012,41.35,23 +3423310573,24.58,45 +3423311451,57.74,22 +3423313032,38.14,33 +3423312160,51.82,20 +3423312397,41.37,17 +3423311095,55.01,24 +3423312803,64.56,53 +3423311951,46.09,35 +3423313818,57.81,20 +3423311313,46.31,23 +3423311389,66.62,37 +3423312703,42.89,17 +3423314199,40.82,23 +3423312834,50.55,41 +3423312795,40.43,40 +3423312077,53.36,25 +3423314379,44.83,27 +3423310461,35.87,35 +3423312709,46.95,23 +3423312808,47.75,20 +3423311713,57.12,26 +3423312819,38.03,43 +3423314401,54.31,45 +3423311992,50.29,39 +3423311164,33.6,24 +3423311785,45.13,44 +3423310769,31.68,31 +3423310596,72.36,4 +3423310503,48.43,55 +3423311126,44.48,36 +3423313487,56.54,20 +3423314331,54.11,50 +3423313472,49.51,29 +3423312200,50.66,36 +3423313015,61.82,23 +3423312340,50.45,15 +3423313441,67.7,43 +3423312358,41.61,12 +3423313702,39.63,31 +3423311137,36.08,31 +3423313567,45.18,27 +3423313538,49.33,28 
+3423312814,48.66,45 +3423310890,53.81,25 +3423312595,35.57,44 +3423311887,48.83,39 +3423314123,69.14,40 +3423312427,60.75,59 +3423310735,52.01,39 +3423312257,47.88,45 +3423314416,40.3,30 +3423311310,48.52,30 +3423311848,55.06,37 +3423311622,48.45,28 +3423311051,58.65,40 +3423313971,69.29,21 +3423314179,51.7,21 +3423312887,69.95,18 +3423312583,53.04,37 +3423312990,47.97,27 +3423311820,64.01,26 +3423314018,61.76,23 +3423313775,49.98,28 +3423310869,81.96,27 +3423313262,56.56,25 +3423311532,51.69,30 +3423313252,54.83,40 +3423311201,54.97,39 +3423313632,44.07,27 +3423311574,45.95,40 +3423311102,42.24,22 +3423310805,43.52,49 +3423313805,51.33,31 +3423311177,56.68,55 +3423313477,51.06,24 +3423310780,41.95,28 +3423313713,53.56,27 +3423313597,47.86,45 +3423314406,43.74,34 +3423312012,41.22,39 +3423312915,45.82,42 +3423312481,44.74,25 +3423313911,53.88,30 +3423311105,44.4,29 +3423312215,59.01,35 +3423311909,53.27,19 +3423314269,45.62,25 +3423313837,53.16,53 +3423314291,50.91,51 +3423311881,33.53,42 +3423312020,46.92,33 +3423312610,59.24,35 +3423314333,48.37,32 +3423311877,50.26,21 +3423310736,38.14,30 +3423312193,63.42,26 +3423313751,57.46,14 +3423313882,63.79,33 +3423312229,58.13,30 +3423314081,56.35,41 +3423311721,52.95,3 +3423312718,55.65,27 +3423314210,29.44,34 +3423314427,49.58,39 +3423313361,31.3,33 +3423311301,43.48,29 +3423313773,48.94,34 +3423312093,43.93,32 +3423312612,53.83,36 +3423312474,53.69,21 +3423311693,60.33,8 +3423311450,40.97,42 +3423311351,49.29,21 +3423314032,56.06,34 +3423311710,66.16,32 +3423310578,32.61,23 +3423310713,37.37,25 +3423311812,46.04,38 +3423314051,57.17,33 +3423313543,41.06,31 +3423311594,47.21,36 +3423311290,55.74,44 +3423314242,56.34,25 +3423311609,61.53,11 +3423311665,61.4,38 +3423314223,59.03,29 +3423312735,26.76,22 +3423311754,48.24,34 +3423311799,50.84,33 +3423314167,64.25,28 +3423312455,23.06,15 +3423313298,47.25,23 +3423313402,59.19,33 +3423313266,32.71,17 +3423312626,48.87,25 +3423313671,58.68,24 +3423313459,51.16,25 
+3423311131,49.48,34 +3423313120,16.05,36 +3423312886,53.49,51 +3423312024,57.19,19 +3423312416,49.38,37 +3423311082,38.82,16 +3423311625,49.88,37 +3423311466,48.81,30 +3423313275,58.86,42 +3423313213,45.22,39 +3423312196,49.25,25 +3423312697,71.53,49 +3423312946,49.41,38 +3423312130,63.19,21 +3423310680,48.68,22 +3423313703,50.11,34 +3423313203,43.56,24 +3423312854,49.36,23 +3423312458,23.03,21 +3423313083,36.14,30 +3423311556,37.48,26 +3423313124,46.35,35 +3423313003,41.54,27 +3423312311,66.05,36 +3423312456,47.09,27 +3423310609,33.84,35 +3423313087,41.45,37 +3423312821,56.09,9 +3423311525,54.66,34 +3423310932,53.23,47 +3423314232,53.51,33 +3423311442,48.56,37 +3423310670,35.91,30 +3423311052,58.04,28 +3423312870,37.16,19 +3423311497,42.45,16 +3423310837,62.65,41 +3423312539,51.42,35 +3423311940,46.31,22 +3423312604,51.36,33 +3423313041,63.62,41 +3423312818,56.54,39 +3423310492,48.58,36 +3423311128,30.37,37 +3423311741,51.75,37 +3423311817,61.68,24 +3423314168,56.54,33 +3423313101,41.22,13 +3423314057,46.08,35 +3423312013,53.21,39 +3423311070,57.95,21 +3423311958,53.4,45 +3423313795,50.32,40 +3423311685,62.06,37 +3423313876,32.9,36 +3423312977,43.48,15 +3423312590,47.76,25 +3423313054,41.61,4 +3423313907,28.72,37 +3423311586,53.37,40 +3423313209,37.81,34 +3423310759,48.7,46 +3423314418,53.23,40 +3423312007,43.47,33 +3423311892,64.55,30 +3423314120,59.34,38 +3423311246,29.92,49 +3423312988,45.67,39 +3423314297,44.99,36 +3423311208,51.07,13 +3423311823,57.87,38 +3423312106,57.9,34 +3423314420,45.41,32 +3423311517,66.97,33 +3423311640,49.37,40 +3423311320,49.54,21 +3423312185,51.73,61 +3423312896,29.68,30 +3423314019,66.12,38 +3423311839,52.68,30 +3423311624,50.32,46 +3423312883,63.2,40 +3423311772,45.12,25 +3423313050,38.69,22 +3423312274,50.77,32 +3423311429,48,23 +3423310739,55.36,47 +3423311395,53.11,48 +3423311354,38.51,28 +3423311393,38.96,21 +3423312817,27.4,30 +3423312479,34.95,35 +3423310481,68.86,31 +3423311405,60.4,28 +3423311690,37.33,38 
+3423313373,59.63,24 +3423311282,33.92,43 +3423312839,43.98,6 +3423314319,53.74,31 +3423313884,45.51,33 +3423313972,55.09,19 +3423312249,68.87,18 +3423312335,58.44,19 +3423311494,43.75,19 +3423312701,47.07,33 +3423313515,47.89,28 +3423314282,53.3,41 +3423312619,57.02,37 +3423312412,43.44,31 +3423314324,38.49,21 +3423311966,61.08,24 +3423312519,62.52,41 +3423311365,60.04,34 +3423311004,53.63,20 +3423311801,44.96,46 +3423311838,43.37,50 +3423311337,31.88,30 +3423311381,41.01,37 +3423311788,53.72,13 +3423312168,48.05,18 +3423311734,63.38,41 +3423313610,55.92,22 +3423313259,51.54,22 +3423313763,39.27,34 +3423313915,59.22,14 +3423313443,57.77,14 +3423314388,60.84,9 +3423311771,63.59,21 +3423312064,53.12,30 +3423313947,58.89,9 +3423311377,61.69,36 +3423311879,49.56,24 +3423313273,45.44,29 +3423313260,50.28,21 +3423312737,47.42,35 +3423313200,45.8,28 +3423313121,38.07,23 +3423311258,44.13,24 +3423314365,51.91,59 +3423310478,51.7,36 +3423310833,39.66,19 +3423313185,49.73,26 +3423312421,52.86,24 +3423311571,55.81,29 +3423312578,42.86,24 +3423312982,52.31,25 +3423314281,62.35,28 +3423312574,32.9,16 +3423313596,48.51,23 +3423312692,45.33,12 +3423313089,49.09,12 +3423314364,37.89,14 +3423312830,67.15,17 +3423314422,48.01,32 +3423314257,45.44,28 +3423312259,52.54,22 +3423310550,59.23,40 +3423312359,31.85,37 +3423313930,49.4,54 +3423313282,51.11,25 +3423312938,64.34,47 +3423311073,58.32,42 +3423311930,46.99,13 +3423313641,65.75,34 +3423311511,62.34,12 +3423313352,51.33,38 +3423313889,50,27 +3423313542,63.06,27 +3423312858,64.66,24 +3423314389,54.99,36 +3423313004,40.95,35 +3423311375,52.29,40 +3423311023,40.47,30 +3423310823,59.56,42 +3423312540,58,46 +3423313927,55.12,34 +3423312262,59.21,26 +3423312663,65.54,39 +3423313830,55.64,24 +3423313165,62.4,40 +3423313852,51.01,22 +3423311996,48.83,24 +3423312388,57.58,35 +3423311174,47.21,34 +3423313624,49.26,26 +3423312679,59.51,30 +3423312640,46.12,33 +3423311984,51.7,37 +3423312793,85.36,17 +3423311513,40.04,35 +3423312297,34.77,5 
+3423310599,45.1,4 +3423311331,56,8 +3423312797,38.14,5 +3423310994,53.39,5 +3423314011,50.55,7 +3423311899,48.8,6 +3423312465,55.99,3 +3423312364,48.37,4 +3423310629,55.19,6 +3423312374,51.51,5 +3423311872,43.29,9 +3423311133,44.96,7 +3423314224,43.45,5 +3423311001,48.92,8 +3423312057,52.9,6 +3423310758,50.99,5 +3423311732,45.08,5 +3423311038,53.36,2 +3423312266,53.27,4 +3423313753,51.93,6 +3423313741,64.96,6 +3423313179,49.22,5 +3423313355,59.82,7 +3423312895,45.24,4 +3423314338,60.4,7 +3423312533,35.14,5 +3423312437,43.86,6 +3423313894,57.49,6 +3423310963,47.98,7 +3423311552,48.33,7 +3423310655,52.42,3 +3423313603,40.59,6 +3423310676,42.54,3 +3423310888,39.43,6 +3423311109,67.04,4 +3423313105,32.58,5 +3423310977,41.75,9 +3423313010,55.95,4 +3423312400,49.61,4 +3423313807,43.04,8 +3423312684,33.5,5 +3423312188,47.25,6 +3423313285,47.47,7 +3423310830,29.47,8 +3423310541,52.55,2 +3423313828,52.6,3 +3423310737,58.98,6 +3423310778,44.94,4 +3423311521,59.72,7 +3423314275,57.38,5 +3423310783,46.76,6 +3423311087,35.58,5 +3423312845,42.55,10 +3423313480,46.44,5 +3423312741,29.66,1 +3423310820,30.77,8 +3423310614,65.73,8 +3423311591,73.5,6 +3423313420,41.94,3 +3423313854,38.2,6 +3423313323,49.47,4 +3423312841,42.9,6 +3423311146,44.02,4 +3423311855,74.03,6 +3423313450,41.73,7 +3423313376,63.6,4 +3423311989,59.71,4 +3423311621,51.75,4 +3423312387,49.68,5 +3423311702,43.6,1 +3423311158,60.76,6 +3423312490,44.84,2 +3423313181,53.99,4 +3423312014,46.74,4 +3423311633,48.99,7 +3423311928,56.28,4 +3423312069,49.19,3 +3423312219,51.3,5 +3423312360,62.3,6 +3423312190,64.5,5 +3423314203,43.31,3 +3423311618,50.87,4 +3423314159,33.18,1 +3423313130,60.07,2 +3423312497,47.48,2 +3423311271,38.98,5 +3423314265,61.65,4 +3423313548,60.52,9 +3423312569,50.87,4 +3423312260,40.61,4 +3423313278,61.48,3 +3423310549,50.18,6 +3423311544,50.89,10 +3423313908,51.97,4 +3423312011,52.07,5 +3423310929,31.06,3 +3423311108,52.44,6 +3423314372,35.84,9 +3423311332,52.66,7 +3423312722,50.39,4 
+3423313104,54.05,1 +3423312214,25.78,3 +3423312499,51.53,8 +3423312406,36.51,7 +3423313492,48.2,6 +3423312460,47.23,5 +3423313299,60.59,6 +3423311261,59.46,8 +3423311635,54.7,4 +3423313994,45.16,4 +3423313870,44.92,3 +3423310781,53.44,5 +3423313895,32.54,9 +3423313229,44.51,5 +3423310465,62.26,6 +3423313134,39.05,3 +3423312454,37.68,4 +3423310770,42.33,10 +3423314185,66.06,4 +3423312672,32.94,7 +3423310554,39.67,4 +3423312348,63.08,9 +3423310824,73.78,4 +3423312602,45.24,5 +3423312717,59.71,9 +3423311747,39.79,6 +3423313025,35.93,6 +3423313790,49.1,5 +3423312402,50.22,8 +3423312144,68.63,8 +3423312289,62.03,6 +3423313758,55.64,7 +3423313921,55.4,8 +3423310454,52.28,8 +3423312081,39.84,1 +3423310510,35.52,5 +3423314262,48.79,3 +3423313860,70.7,2 +3423312516,61.92,4 +3423310785,59.86,4 +3423314276,68.37,5 +3423312309,47.65,4 +3423313959,46.7,0 +3423311509,31.24,1 +3423313448,49.47,10 +3423313335,50.58,8 +3423314117,38.61,6 +3423311619,42.83,6 +3423310706,59.72,7 +3423311537,50.21,2 +3423311089,38.04,7 +3423313621,66.87,6 +3423313346,31.07,3 +3423311660,57.66,6 +3423312382,50.89,5 +3423311689,35.99,5 +3423311423,62.2,4 +3423313349,56.52,2 +3423312111,63.05,1 +3423313822,46.99,3 +3423313537,30.45,2 +3423312774,58.23,6 +3423313452,37.05,5 +3423311569,31.14,4 +3423310482,28.91,4 +3423310645,58.82,4 +3423313386,34.34,3 +3423310799,32.49,3 +3423311807,57.76,3 +3423314421,73.89,4 +3423314387,38.4,8 +3423311188,44.13,5 +3423312263,42.26,6 +3423312804,45.89,7 +3423312572,62.56,8 +3423310978,53.54,6 +3423313497,55.4,4 +3423310975,48.81,3 +3423314045,51.01,7 +3423312813,36.88,7 +3423311114,56.42,6 +3423313164,39.94,4 +3423312433,38.7,8 +3423310703,47.39,6 +3423314355,64.67,3 +3423311349,67.47,5 +3423313726,49.39,6 +3423312733,43.1,6 +3423311774,34.03,5 +3423311203,32.72,0 +3423312485,60.65,5 +3423312560,57.26,6 +3423313976,55.1,4 +3423313424,61.06,4 +3423311644,38.91,4 +3423311240,51.35,3 +3423310854,56.87,6 +3423313832,50.2,5 +3423313864,55.34,6 +3423310936,47.95,3 
+3423310470,47.47,6 +3423312401,35.44,6 +3423310687,31.45,8 +3423310892,28.16,3 +3423311490,68.57,6 +3423312120,41.5,5 +3423313369,53.75,8 +3423312112,37.47,5 +3423313878,55.63,4 +3423310526,50.45,7 +3423310926,51.61,3 +3423311247,38.08,7 +3423312367,37.26,5 +3423314414,33.01,4 +3423312413,45.87,6 +3423311585,45.45,4 +3423314004,53.15,8 +3423312148,54.58,5 +3423312449,53.08,2 +3423310472,43.09,4 +3423312356,34.46,4 +3423313067,52.19,4 +3423310949,55.54,5 +3423313254,51.45,5 +3423314300,62.7,1 +3423312015,50.18,8 +3423311866,53.44,6 +3423311129,53.42,4 +3423311808,54.52,4 +3423311615,50.89,6 +3423311026,64.49,9 +3423311837,51.92,8 +3423310750,57.49,6 +3423314208,42.97,1 +3423312029,43.27,4 +3423314376,44.69,6 +3423311646,54.16,3 +3423312070,50.07,4 +3423314034,48.42,5 +3423313689,42.59,6 +3423314221,50.69,4 +3423312245,62.86,3 +3423310615,51.78,2 +3423312720,47.89,3 +3423314111,38.05,1 +3423311744,54.09,4 +3423311053,38.88,4 +3423314345,58,5 +3423311390,42.04,5 +3423312872,35.44,2 +3423313186,43.73,8 +3423312826,65.7,5 +3423310695,43.3,5 +3423312055,54.89,3 +3423314371,54.69,6 +3423310787,46.22,6 +3423311333,47.7,5 +3423313052,39.19,8 +3423313129,57.11,3 +3423313562,47.65,7 +3423312213,38.13,2 +3423312699,33.77,4 +3423312248,54.62,6 +3423314339,51.86,4 +3423311512,56.75,6 +3423314021,50.98,4 +3423313850,46.99,6 +3423314382,52.3,4 +3423311510,47.64,5 +3423311688,28.82,9 +3423311539,53.48,2 +3423310881,45.16,6 +3423313868,56.73,4 +3423310536,71.55,4 +3423313418,32.1,2 +3423310458,41.81,3 +3423312494,52.98,3 +3423310589,43.82,3 +3423311234,40.81,3 +3423311099,46.33,8 +3423311380,37.48,6 +3423313300,42.87,4 +3423313328,54.26,5 +3423313195,46.69,6 +3423312836,48.46,6 +3423310800,56.82,8 +3423313336,48.73,4 +3423313817,38.76,4 +3423312065,38.23,2 +3423313729,54.06,6 +3423311265,42.25,9 +3423310776,35.22,3 +3423312079,43.64,5 +3423314235,45.7,5 +3423311705,45.41,6 +3423314392,70.97,4 +3423311696,36.19,4 +3423311472,56.54,3 +3423313301,44.52,1 +3423313384,42.87,6 
+3423311703,33.86,5 +3423311197,65.17,10 +3423311656,42.75,4 +3423310696,42.59,6 +3423314437,45.15,3 +3423311419,49.7,3 +3423312537,55.39,2 +3423313631,32.12,6 +3423313117,60.07,6 +3423313966,62.34,3 +3423311283,75.84,3 +3423312186,49.46,8 +3423312963,58.15,7 +3423314215,49.1,9 +3423311568,45.97,4 +3423311782,37.1,6 +3423310664,55.74,6 +3423311421,42.81,6 +3423313735,53.28,5 +3423310606,43.15,3 +3423312350,54.53,4 +3423312885,69.29,4 +3423314075,42.99,4 +3423312319,66.21,4 +3423313840,50.94,5 +3423311059,53.75,6 +3423312828,45.17,5 +3423311763,44.64,7 +3423313655,39.62,7 +3423312566,54.55,7 +3423311554,53.05,4 +3423313128,54.35,1 +3423311830,48.64,6 +3423313787,49.34,5 +3423312940,46.83,2 +3423314039,31.81,4 +3423310512,34.46,3 +3423314109,27.3,5 +3423311610,53.95,3 +3423312550,46.57,6 +3423314273,33.36,6 +3423313318,36.42,7 +3423311833,65.73,5 +3423310622,49.9,5 +3423313486,56.31,3 +3423313417,33.04,3 +3423313760,48.62,10 +3423310959,47.42,5 +3423314026,68.57,4 +3423312527,45.07,4 +3423311975,56.07,7 +3423312164,47.01,6 +3423310547,51.53,6 +3423310648,51.81,7 +3423313553,66.83,6 +3423311998,50.79,4 +3423313444,46.33,2 +3423312617,30.52,11 +3423313274,41.67,1 +3423312788,53.38,7 +3423311815,60.45,6 +3423311318,41.56,3 +3423311056,28.94,4 +3423312790,43.5,8 +3423311187,56.31,6 +3423312471,67.77,4 +3423312924,41.93,2 +3423314280,71.73,5 +3423311852,57.04,0 +3423314135,38.44,7 +3423312276,55.64,5 +3423314325,57.39,2 +3423313592,60.07,3 +3423313395,46.69,2 +3423311523,46.59,3 +3423314106,55.64,3 +3423311452,46.09,4 +3423313046,46.25,2 +3423314403,61.75,8 +3423314303,66.17,4 +3423314233,58.07,6 +3423311653,56.07,4 +3423312383,49.81,6 +3423311778,49.75,5 +3423311916,35.17,9 +3423312658,55.71,5 +3423311903,49.71,4 +3423311853,48.97,3 +3423313111,61.79,5 +3423312175,42.77,7 +3423313709,47.63,8 +3423313297,67.19,4 +3423312072,52.03,3 +3423314173,55.64,5 +3423314028,44.09,5 +3423312891,49.52,7 +3423313045,49.78,4 +3423312073,39.56,5 +3423312508,50.54,8 +3423311818,39.46,9 
+3423314430,41.69,5 +3423312844,48.03,8 +3423310534,37.91,5 +3423311139,40.81,4 +3423312170,58.1,2 +3423311659,49.63,5 +3423312734,59.36,7 +3423313806,37.14,6 +3423313066,49.41,6 +3423313269,57.34,2 +3423311361,46.58,7 +3423313306,42.21,6 +3423311548,41.14,2 +3423313692,37.63,3 +3423311121,60.88,5 +3423310928,64.56,3 +3423314099,58.97,6 +3423313531,51.2,6 +3423311416,52.84,3 +3423313875,22.36,5 +3423313812,60.75,2 +3423314008,60.38,4 +3423310593,64.18,1 +3423311764,71.11,6 +3423310521,31.35,3 +3423312928,52.38,4 +3423311155,54.01,4 +3423310768,51.31,5 +3423313122,65.89,3 +3423313403,51.92,7 +3423312683,57.51,5 +3423311154,48.94,3 +3423313363,34.86,6 +3423310747,42.96,3 +3423311016,38.96,4 +3423311731,41.74,8 +3423313375,66.37,5 +3423311386,31.92,4 +3423313979,52.07,6 +3423312979,55.77,5 +3423312246,59.03,4 +3423311971,49.53,7 +3423314361,56.58,7 +3423314055,33.62,7 +3423312698,70.55,6 +3423311829,63.68,2 +3423310906,61.78,9 +3423311206,39.21,8 +3423310546,37.13,5 +3423312794,57.08,6 +3423312009,24.9,8 +3423312126,55.34,5 +3423310634,43.47,7 +3423314049,73.8,6 +3423311275,58.36,10 +3423313127,52.88,4 +3423311694,44.65,6 +3423313364,46.85,5 +3423312763,55.2,7 +3423311626,58.28,0 +3423313314,67.36,6 +3423312050,50.38,5 +3423311578,56.21,1 +3423314214,47.79,6 +3423312244,63.18,5 +3423314336,57.18,5 +3423310930,58.44,6 +3423313155,60.4,6 +3423313175,43.94,4 +3423310537,53.34,3 +3423314321,42.96,3 +3423312097,46.07,3 +3423311414,52.09,9 +3423311584,57.35,5 +3423312217,72.05,7 +3423312163,55.19,4 +3423312061,33.32,4 +3423314054,51.53,4 +3423313950,52.11,5 +3423313135,46.24,7 +3423314444,51.59,6 +3423312929,63.62,6 +3423313565,59.02,5 +3423311674,53.71,6 +3423314304,60.37,6 +3423310530,56,9 +3423310779,58.05,5 +3423313416,50.86,6 +3423313880,39.44,8 +3423313672,44.61,8 +3423314151,54.2,10 +3423310729,36.76,5 +3423312719,74.83,4 +3423313687,46.57,6 +3423310818,50.74,4 +3423312178,46.85,4 +3423311060,52.01,4 +3423311914,63.96,6 +3423314037,68.24,2 +3423312700,43.22,7 
+3423311825,51.73,6 +3423314432,44.04,5 +3423312232,53.66,6 +3423313926,54.99,4 +3423312127,59.09,5 +3423312103,64.15,6 +3423314189,28.36,7 +3423313891,57.13,4 +3423311456,54.29,9 +3423311253,70.87,7 +3423310794,46.55,5 +3423312031,64.77,5 +3423312407,48.84,5 +3423312514,51.37,2 +3423313064,75.54,7 +3423313366,44.68,7 +3423313208,46.59,4 +3423310467,46.66,8 +3423313929,50.36,3 +3423312191,51.39,4 +3423314228,53.95,2 +3423310692,57.78,7 +3423311920,45.77,4 +3423312323,44.64,6 +3423312158,42.6,2 +3423310710,63.59,3 +3423312605,53.75,6 +3423312843,55.88,3 +3423312880,31.35,7 +3423312507,42.2,5 +3423313096,50.1,7 +3423312316,40.18,4 +3423313210,43.8,3 +3423314052,41.84,7 +3423311163,57.03,4 +3423311520,70.83,5 +3423313545,54.96,3 +3423314212,72.8,1 +3423313053,45.58,5 +3423312744,61.79,3 +3423311212,39.81,4 +3423313654,60.9,3 +3423310448,53.76,9 +3423313473,55.85,5 +3423312980,63.36,6 +3423310613,63.15,6 +3423310731,58.98,6 +3423311773,44.21,5 +3423311759,42.09,2 +3423311312,39.78,5 +3423310673,70.77,5 +3423312509,54.82,3 +3423313839,54.07,5 +3423311874,56.74,7 +3423311249,42.78,5 +3423312534,22.27,7 +3423311263,43.07,4 +3423313849,43.5,4 +3423310786,58.48,8 +3423313756,40.4,6 +3423311443,54.84,5 +3423311534,51.83,6 +3423312046,53.08,2 +3423313578,51.08,7 +3423312827,63.84,1 +3423311243,40.95,6 +3423313462,48.07,4 +3423313547,63.67,7 +3423310565,56.59,2 +3423311107,39.37,5 +3423312239,46.7,6 +3423314073,50.53,6 +3423312083,54.02,4 +3423312807,47.21,4 +3423311298,48.05,9 +3423311904,52.01,7 +3423312736,65.61,4 +3423313404,58.38,5 +3423313126,51.45,5 +3423310912,51.74,3 +3423312724,54.57,3 +3423311322,53.41,8 +3423312759,46.4,5 +3423313598,39.22,3 +3423311943,56.13,5 +3423312448,48.63,7 +3423313752,40.75,4 +3423314205,50.55,5 +3423311905,45.64,3 +3423313499,59.35,4 +3423314024,50.47,1 +3423313187,36.82,3 +3423313211,46.86,4 +3423313995,43.53,2 +3423313745,48.16,6 +3423312860,48.21,2 +3423314194,59.44,7 +3423310682,42.9,6 +3423313534,57.77,2 +3423311686,46.41,5 
+3423313698,40.59,9 +3423311231,44.24,4 +3423314435,60.7,1 +3423311611,52.5,3 +3423312638,56.38,10 +3423314419,45.2,6 +3423312952,48.46,6 +3423313896,48.5,6 +3423313488,45.83,2 +3423313103,54.89,4 +3423313574,38.09,2 +3423312978,52.25,5 +3423313604,48.21,8 +3423311641,50.43,7 +3423310531,50.07,2 +3423312591,48.47,9 +3423312800,66.46,6 +3423311865,40.46,5 +3423311850,15.52,3 +3423311697,62.54,4 +3423312242,39.57,6 +3423313733,47.88,7 +3423312871,27.5,3 +3423311075,45.97,7 +3423310874,42.01,4 +3423312241,62.54,4 +3423312483,46.36,3 +3423312575,47.58,10 +3423313582,51.68,4 +3423311185,71.04,4 +3423312542,55.43,4 +3423311647,36.37,2 +3423312651,52.4,3 +3423313272,36.89,6 +3423311184,54.39,7 +3423313953,54.56,5 +3423310490,60.76,5 +3423314243,30.32,3 +3423310777,58.49,6 +3423312286,59.55,8 +3423311483,30.84,7 +3423311134,44.06,1 +3423313611,37.52,6 +3423311889,64.31,5 +3423313514,50.05,4 +3423313799,54.52,6 +3423312625,56.53,5 +3423311465,59.16,8 +3423313942,56.49,6 +3423314413,33.68,4 +3423311077,54.42,7 +3423313197,47.62,5 +3423310979,52.44,7 +3423313388,55.47,7 +3423313922,41.38,8 +3423312059,60.32,3 +3423313810,42.71,5 +3423313673,41.83,8 +3423313521,48.98,4 +3423312852,57.87,5 +3423311057,41.84,4 +3423313721,55.12,6 +3423314240,38.5,1 +3423311350,37.73,4 +3423312264,36.91,3 +3423312349,37.5,6 +3423312331,60.28,2 +3423313918,45.51,6 +3423310450,51.59,4 +3423314161,49.46,6 +3423313161,57.05,4 +3423313824,41.16,2 +3423311086,55.64,5 +3423310493,56.91,3 +3423313086,57.02,7 +3423310598,60.17,5 +3423310836,45.68,3 +3423312386,68.44,6 +3423310663,58.77,5 +3423311473,51.97,2 +3423313690,58.28,5 +3423312890,50.39,5 +3423311988,23.9,6 +3423313746,36.91,6 +3423314131,48.86,4 +3423313378,70.21,6 +3423311596,61.06,3 +3423313235,49.54,9 +3423313803,47.36,7 +3423313936,54.75,5 +3423310980,62.01,4 +3423314301,56.41,3 +3423310937,32.09,2 +3423311875,42.75,7 +3423312115,73.66,5 +3423311906,54.27,5 +3423314118,42.71,7 +3423311145,46.62,3 +3423314354,45.74,4 +3423312154,53.28,5 
+3423313711,41.83,5 +3423311309,48.76,4 +3423314146,47.89,4 +3423314247,49.28,2 +3423314337,56.76,6 +3423314061,42.92,6 +3423310723,44.74,8 +3423313207,51.19,5 +3423311670,41.78,5 +3423310506,47.95,9 +3423312373,58.91,6 +3423312835,52.6,7 +3423311978,53.59,2 +3423313201,48.09,6 +3423314353,39.06,1 +3423310811,57.59,4 +3423312778,44.44,3 +3423311500,59.68,4 +3423310491,57.78,4 +3423311985,27.86,8 +3423312044,48.62,9 +3423312049,54.09,5 +3423313576,39.3,2 +3423313458,38.32,5 +3423313326,43.68,1 +3423311970,53.08,5 +3423314154,55.26,5 +3423314184,51.24,2 +3423312292,50.03,3 +3423312032,49.38,3 +3423314431,68.26,5 +3423312491,44.97,8 +3423311055,43.36,3 +3423313771,42,8 +3423314306,56.86,6 +3423311267,36.67,7 +3423312265,34.28,2 +3423313319,57.2,5 +3423311303,61.23,7 +3423312337,41.21,5 +3423311285,43.36,4 +3423313887,43.16,6 +3423310878,55.7,5 +3423313772,46.44,5 +3423314335,55.43,3 +3423313954,55.96,8 +3423314362,60.51,2 +3423314107,55.29,6 +3423313405,64.11,5 +3423310610,47.18,7 +3423311949,49.05,5 +3423313407,41.91,5 +3423313160,39.94,6 +3423313230,52.03,5 +3423313647,47.6,4 +3423310576,57.05,9 +3423314014,57.45,5 +3423311237,29.65,3 +3423314122,44.09,4 +3423314375,61.38,9 +3423312078,39.04,4 +3423311859,48.86,10 +3423313727,42.99,8 +3423313716,53.51,8 +3423310675,63.68,6 +3423313546,35.51,7 +3423313383,38.84,4 +3423311406,52.64,6 +3423314002,65.57,6 +3423313740,47.85,7 +3423312942,57.64,4 +3423311191,62.54,7 +3423311259,52.2,7 +3423312628,35.54,7 +3423311245,41.49,6 +3423311334,49.31,6 +3423314003,33.9,6 +3423312489,41.13,4 +3423312473,57.57,1 +3423313794,40.22,6 +3423310514,39.79,4 +3423311589,56.45,4 +3423311293,51.2,3 +3423311634,54.34,6 +3423311704,75.52,0 +3423312300,37.02,2 +3423311844,57.63,8 +3423313519,37.91,7 +3423310947,36.49,5 +3423312655,44.97,9 +3423310843,49.35,10 +3423311474,53,4 +3423312573,56.63,5 +3423310535,66.45,4 +3423310605,41.42,7 +3423310616,59.14,8 +3423312884,36.8,4 +3423313044,55.03,2 +3423312787,51.24,5 +3423313031,45.52,4 
+3423310693,50.75,5 +3423311067,73.97,3 +3423312510,40.05,5 +3423313071,49.55,3 +3423311072,60.58,3 +3423310570,36.12,4 +3423312114,43.7,7 +3423311561,77.82,9 +3423312317,42.41,7 +3423311941,46.46,6 +3423313768,42.52,5 +3423312255,48.96,6 +3423313865,41.59,3 +3423312926,58.07,5 +3423313544,49.89,9 +3423314088,71.42,3 +3423312267,48.29,5 +3423311199,47.24,3 +3423313601,42,5 +3423311069,52.11,8 +3423310745,54.97,5 +3423313489,58.11,9 +3423312132,72.99,4 +3423313178,58.53,2 +3423311202,56.08,4 +3423313662,33.77,3 +3423312954,39.31,3 +3423310538,33.46,8 +3423313469,48.75,8 +3423311000,64.69,5 +3423312847,38.47,2 +3423313780,47.13,3 +3423311079,50.73,9 +3423314274,34.4,2 +3423311233,33.48,5 +3423312287,45.9,7 +3423310868,49.23,3 +3423313242,51.51,6 +3423312305,49.18,5 +3423313730,57.78,5 +3423311336,28.74,5 +3423312674,45.15,5 +3423314423,50.93,7 +3423314204,61.99,5 +3423314015,47.9,3 +3423311724,65.76,4 +3423310661,56.57,8 +3423314352,48.53,4 +3423312673,53.39,7 +3423313304,43.86,8 +3423311650,58.52,6 +3423314162,66.75,7 +3423311981,68.18,5 +3423312708,62.89,5 +3423311842,55.64,4 +3423312918,53.12,6 +3423310789,57.4,9 +3423313525,35.52,6 +3423314278,40,3 +3423313952,58.5,5 +3423312087,27.59,7 +3423313006,45.9,6 +3423310753,62.47,7 +3423313102,38.7,5 +3423313011,53.45,4 +3423310607,56.48,3 +3423310908,58.94,5 +3423311355,58.46,8 +3423312597,58.46,5 +3423311982,57.82,1 +3423311115,46.11,4 +3423312475,57.25,6 +3423311022,35.04,5 +3423314271,38.49,7 +3423311173,62.91,4 +3423312670,51.77,5 +3423312833,34.74,8 +3423311446,48.67,3 +3423313116,72.61,4 +3423314066,41.2,3 +3423310486,45.08,4 +3423313356,54.41,7 +3423313227,36.24,2 +3423314020,43.29,4 +3423311032,35.7,2 +3423313374,49.07,7 +3423312629,53.42,7 +3423310816,66.52,2 +3423312637,54.8,8 +3423314101,51.22,4 +3423314287,40.14,1 +3423312468,45.01,7 +3423312922,67.27,3 +3423310704,55.42,6 +3423313190,45,7 +3423310899,63.99,5 +3423311761,50.13,7 +3423311142,41.71,8 +3423313675,64.48,3 +3423310935,52.58,7 +3423310877,50.79,5 
+3423311912,60.41,8 +3423312705,44.97,6 +3423313140,43.83,3 +3423311031,41.84,7 +3423313778,60.82,5 +3423313287,46.26,7 +3423312002,44.7,4 +3423313494,42.04,3 +3423311983,38.93,1 +3423311180,57.13,5 +3423312351,37.52,1 +3423311348,59.66,3 +3423311125,49.6,4 +3423310477,40.66,7 +3423314211,48.05,7 +3423312279,55.67,3 +3423312010,43.35,3 +3423310807,46.51,6 +3423314270,70.91,7 +3423311150,44.03,7 +3423313294,35.2,6 +3423311953,43.3,5 +3423310726,44.78,6 +3423310523,52.82,5 +3423313305,37.21,5 +3423311563,42.7,6 +3423314025,33.32,8 +3423312696,58.67,6 +3423312639,58.13,7 +3423312209,47.11,6 +3423313502,45.39,6 +3423312231,39.61,6 +3423313619,63.94,4 +3423310825,45.15,3 +3423310812,59.59,6 +3423312687,64.19,4 +3423310886,45.63,7 +3423311613,47.64,3 +3423311637,63.75,8 +3423313983,54.16,5 +3423312346,66.42,4 +3423310665,45.96,4 +3423310690,37.21,6 +3423310612,70.05,7 +3423312034,54.32,3 +3423311923,49.09,5 +3423312052,47.47,7 +3423312882,56.14,6 +3423313802,47.57,5 +3423313174,55.15,9 +3423312118,52.99,9 +3423313225,68.57,4 +3423310946,35.46,5 +3423310773,27.32,6 +3423311118,44.74,2 +3423311330,46.55,5 +3423311447,36.59,7 +3423313851,45.58,0 +3423313202,68.64,9 +3423311277,36.22,5 +3423313723,43.94,6 +3423313238,39.21,5 +3423313334,32.36,5 +3423310964,70.25,6 +3423312381,51.06,7 +3423310752,46.92,6 +3423312092,44.49,7 +3423311495,62.01,2 +3423312815,32.33,7 +3423312452,54.26,3 +3423311205,52.53,6 +3423313762,50.69,8 +3423313819,53.79,5 +3423310567,36.19,2 +3423312487,42.01,4 +3423311182,31.12,5 +3423314094,56.16,6 +3423313657,62.38,2 +3423312635,52.32,6 +3423314206,34.81,3 +3423313099,42.03,4 +3423311341,43.61,8 +3423311270,63.2,7 +3423312747,41.61,4 +3423312730,33.45,5 +3423311880,39.06,8 +3423314108,53.51,1 +3423311003,19.62,3 +3423310788,41.26,7 +3423311274,51.31,3 +3423310623,32.51,2 +3423312941,53.18,7 +3423312675,51.24,5 +3423310466,48.97,4 +3423312480,68.7,3 +3423312258,74.14,5 +3423312784,38.76,6 +3423310581,46.39,6 +3423312802,56.55,2 +3423311120,50.93,10 
+3423312503,44.51,1 +3423314237,76.31,4 +3423311683,59.02,3 +3423313909,45.85,6 +3423311735,59.98,6 +3423312223,44.15,5 +3423313501,56.09,11 +3423314060,41.52,5 +3423312981,71.33,4 +3423313024,66.13,4 +3423313446,38.77,5 +3423312773,48.5,7 +3423311973,43.65,5 +3423313267,52.81,10 +3423310743,57.35,7 +3423313143,57.27,4 +3423311397,58.51,6 +3423310885,48.44,7 +3423314436,45.99,3 +3423313051,62.48,8 +3423314195,53.07,1 +3423313506,43.96,5 +3423311942,55.07,6 +3423310542,34.54,2 +3423310991,50.82,6 +3423312632,55.45,8 +3423313940,50.43,7 +3423314385,59.81,8 +3423311643,48.09,3 +3423313475,64.81,8 +3423312707,48.74,7 +3423311590,53.44,1 +3423310516,52.31,3 +3423311002,38.29,6 +3423313084,47.79,9 +3423311789,71.67,7 +3423313890,37.41,5 +3423314071,52.13,7 +3423310867,63.03,2 +3423312704,52.1,2 +3423313219,47.39,5 +3423312395,42.57,5 +3423314181,76.89,5 +3423312472,52.59,6 +3423313699,56.08,8 +3423311809,38.66,1 +3423311281,45.23,5 +3423312119,52.85,5 +3423312515,56.87,6 +3423313816,36.66,4 +3423310863,48.88,2 +3423313939,60.99,4 +3423313415,48.69,6 +3423311398,36.84,6 +3423311453,36.08,5 +3423313524,49.37,3 +3423313530,51.82,5 +3423313688,42.53,4 +3423311394,44.13,6 +3423313002,53.27,6 +3423310734,54.33,6 +3423313512,65.01,6 +3423313232,51.46,4 +3423310915,56.07,6 +3423310797,53.48,5 +3423312745,71.54,1 +3423314191,44.69,3 +3423313551,36.53,6 +3423313737,50.75,4 +3423313706,57.83,7 +3423312997,47.41,3 +3423310489,52.98,6 +3423313075,46.99,6 +3423313199,66.27,5 +3423314150,33.42,9 +3423311692,61.98,4 +3423313157,34.11,1 +3423313679,52.42,7 +3423314296,57.05,5 +3423313912,54.73,5 +3423311716,64.47,2 +3423313119,31.38,7 +3423314067,68.87,5 +3423311172,57.61,5 +3423311558,57.65,8 +3423312332,58.08,4 +3423312003,51.31,6 +3423312174,66.32,6 +3423311112,51.75,5 +3423312555,51.75,6 +3423311726,34.26,5 +3423312903,48.79,6 +3423313900,23.46,2 +3423312798,61.32,4 +3423311433,44.18,5 +3423314283,38.98,7 +3423311800,41.52,8 +3423311189,64.71,5 +3423313815,55.04,4 +3423312571,41.94,3 
+3423311680,46.88,6 +3423312467,73.97,3 +3423311506,35.55,5 +3423311411,51.54,9 +3423314038,53.32,4 +3423312996,63.86,2 +3423313030,33.69,8 +3423311242,47.34,6 +3423313658,35.38,4 +3423311668,61.31,4 +3423313668,55.71,1 +3423310856,58.87,3 +3423313214,43.3,5 +3423312091,36.19,8 +3423313463,47.62,5 +3423312825,39.87,5 +3423311781,51.76,9 +3423313460,46.82,5 +3423312321,59.73,8 +3423314110,60.88,7 +3423313091,55.99,4 +3423313518,46.93,3 +3423311413,62.48,2 +3423312492,59.47,5 +3423312461,49.09,5 +3423311977,42.49,5 +3423313725,36.63,5 +3423313783,57.03,6 +3423312545,40.71,6 +3423312824,42.1,6 +3423310630,53.8,3 +3423313049,53.5,2 +3423312517,37.27,6 +3423311843,44.17,6 +3423312145,39.32,5 +3423311530,48.39,6 +3423312823,57.14,5 +3423314089,47.26,3 +3423310475,38.03,5 +3423314425,31.36,5 +3423312271,68.26,6 +3423312706,54.23,2 +3423312755,52.52,1 +3423310603,52.19,3 +3423314129,78.26,4 +3423313239,78.56,4 +3423311792,58.48,5 +3423311214,47.36,7 +3423312968,50.25,6 +3423311062,76.26,2 +3423313061,34.12,4 +3423311006,49.99,7 +3423311420,60.89,6 +3423313426,51.83,6 +3423311968,40.4,2 +3423313062,64.86,8 +3423312486,51.32,3 +3423313533,55.83,4 +3423311198,61.88,2 +3423312644,45.35,4 +3423313666,55.7,1 +3423312816,40.17,2 +3423310983,61.46,7 +3423311403,41.67,4 +3423314256,44.85,3 +3423314402,53.25,8 +3423314390,41.92,5 +3423310909,63.38,4 +3423312769,55.52,8 +3423311730,59.16,7 +3423310847,62.14,6 +3423312290,33.4,9 +3423313493,41.01,3 +3423312296,39.59,3 +3423313434,48.31,6 +3423311220,47.96,4 +3423313379,40.43,5 +3423311749,50.35,4 +3423313999,46.08,3 +3423312096,48.83,5 +3423313584,50.29,4 +3423312543,56.23,4 +3423311498,36.93,4 +3423311700,51.13,8 +3423313387,45.21,3 +3423314426,47.56,8 +3423311840,49.5,9 +3423312315,41.22,8 +3423313844,65.02,3 +3423310501,61.22,9 +3423310720,39.6,7 +3423311546,59.64,6 +3423311058,50.75,8 +3423313464,55.84,3 +3423311090,51.12,6 +3423310694,54.09,6 +3423313948,45.71,7 +3423311885,64.43,5 +3423312299,51.49,2 +3423311412,42.73,5 
+3423312399,55.28,2 +3423312150,46.47,7 +3423311681,65.79,3 +3423313322,42.04,5 +3423313193,52.88,5 +3423311260,51.86,2 +3423311605,42.75,4 +3423313158,41.34,3 +3423314046,37.14,6 +3423313476,51.39,9 +3423313833,41.93,4 +3423313226,64.54,7 +3423313792,50.24,8 +3423311516,43.43,10 +3423312911,26.87,3 +3423314370,55.75,3 +3423313588,51.23,5 +3423311415,45.6,4 +3423314386,53.04,9 +3423312914,74.2,3 +3423311396,68.53,3 +3423313886,61.21,1 +3423313990,51.91,2 +3423311709,39.73,7 +3423313797,63.49,4 +3423314438,41.37,1 +3423313701,58.09,6 +3423311725,41.1,4 +3423312149,29.32,6 +3423313777,53.43,5 +3423311020,42.54,5 +3423312806,48.38,6 +3423313708,53.68,2 +3423311244,43.9,1 +3423311225,50.56,4 +3423312392,62.59,4 +3423311796,37.18,3 +3423313509,48.98,6 +3423314373,46.67,6 +3423313674,56.56,3 +3423310677,48.74,3 +3423312838,63.24,3 +3423311810,36.26,11 +3423310594,37.36,2 +3423312848,48.48,1 +3423312677,29.42,7 +3423310764,55.04,5 +3423313749,54.05,2 +3423310566,49.24,1 +3423312429,51.72,5 +3423312799,43.03,7 +3423312631,56.77,7 +3423312654,31.62,5 +3423314036,30.61,2 +3423311352,39.68,4 +3423314149,44.39,4 +3423311008,47.51,9 +3423314245,59.39,5 +3423313291,65.3,5 +3423312254,44.3,3 +3423313484,47.74,3 +3423311463,57.2,7 +3423312528,37.62,3 +3423311593,44.39,3 +3423312529,51.67,5 +3423311096,59.58,4 +3423312370,42.08,7 +3423310617,34.59,5 +3423313645,60.39,8 +3423312107,59.46,6 +3423311135,56.72,3 +3423312559,55.09,4 +3423313577,59.63,3 +3423312189,54.41,4 +3423312693,46.15,5 +3423310686,29.92,4 +3423311972,52.07,3 +3423313951,57.64,6 +3423312974,55.9,3 +3423310551,44.65,4 +3423314254,60.79,6 +3423314169,50.38,6 +3423313293,62.09,3 +3423314063,51.05,4 +3423313330,53.11,2 +3423314397,64.75,7 +3423310471,61.68,6 +3423312001,51.3,5 +3423314231,59.63,1 +3423312863,56.13,6 +3423310572,42.12,4 +3423312962,60.37,7 +3423311379,37.6,6 +3423313009,51.12,4 +3423313943,46.04,2 +3423313112,42.52,10 +3423314284,66.51,10 +3423312765,37.17,5 +3423312236,55.8,5 +3423313676,56.27,4 
+3423314349,65.04,8 +3423313968,43.77,6 +3423313607,63.01,7 +3423313670,42.77,6 +3423311777,44.26,8 +3423310831,55.84,9 +3423314238,52.42,7 +3423312967,29.67,5 +3423312614,46.95,8 +3423310744,46.28,5 +3423311153,55.74,5 +3423312099,56.12,3 +3423313196,54.26,4 +3423310958,56.04,1 +3423311148,60.79,8 +3423311604,50.19,5 +3423313139,50.72,6 +3423313431,52.72,2 +3423312777,61.67,5 +3423312685,47.17,5 +3423311272,51.93,5 +3423310557,38.38,6 +3423313786,57.32,9 +3423312592,39.57,4 +3423311857,68.99,4 +3423311106,35.39,7 +3423314097,56.91,8 +3423311238,48.66,4 +3423312089,57.91,4 +3423314164,41.25,6 +3423313732,46.26,8 +3423313474,27.99,2 +3423312992,40.4,5 +3423311802,45.54,3 +3423310453,34.68,6 +3423313022,48.75,6 +3423310495,46.79,7 +3423310810,53.83,6 +3423313635,51.13,5 +3423312908,61.03,7 +3423312714,32.3,5 +3423312652,55.84,5 +3423314391,29.73,4 +3423312368,58.43,5 +3423311886,45.56,3 +3423311891,56.99,7 +3423313490,42.04,4 +3423311878,54.03,3 +3423311278,49.17,5 +3423311769,55.12,6 +3423313722,39.28,6 +3423310639,39.88,4 +3423311085,34.49,8 +3423311913,50.52,7 +3423310880,40.7,2 +3423310483,53.33,3 +3423311540,34.04,5 +3423313182,45.54,11 +3423312660,52.09,8 +3423310716,45.01,8 +3423312576,52.06,3 +3423310555,58.46,5 +3423311488,37.58,7 +3423311565,46.91,2 +3423314294,48.57,8 +3423310796,25.39,7 +3423310684,53.29,8 +3423313063,55.97,3 +3423313667,39.14,7 +3423313092,48.74,4 +3423310945,52.26,2 +3423313077,67.57,7 +3423312842,38.43,4 +3423311256,29.59,4 +3423311455,39.81,8 +3423311617,37.84,3 +3423313223,53.56,2 +3423312633,48.79,2 +3423311947,37.66,2 +3423311599,52.05,7 +3423311931,65.27,6 +3423313587,38.74,6 +3423313321,29.28,3 +3423312496,21.6,3 +3423312889,28.7,0 +3423313467,46.08,7 +3423311868,42.02,5 +3423312820,49.96,1 +3423310950,48.05,9 +3423311171,61.36,5 +3423314400,52.83,7 +3423310865,69.19,4 +3423313510,33.2,3 +3423311555,51.27,5 +3423313396,54.48,2 +3423311501,42.47,7 +3423310730,46.2,5 +3423311581,47.92,2 +3423313265,49.43,5 +3423312039,38.26,5 
+3423310562,30.5,5 +3423311654,48.88,7 +3423314105,49.45,4 +3423310620,60.69,6 +3423312325,57.36,7 +3423313350,51.15,6 +3423313365,30.3,1 +3423313303,55.41,3 +3423313593,44.98,3 +3423313903,56.9,10 +3423314383,36.37,5 +3423312957,55.2,1 +3423311046,66.98,9 +3423312584,65.93,3 +3423311081,50.89,5 +3423314023,40.97,3 +3423312484,46.17,3 +3423312829,43.19,5 +3423310985,42.01,6 +3423313831,66.26,6 +3423314326,48.41,6 +3423311714,64.68,10 +3423311028,65.17,2 +3423310801,40.48,4 +3423311664,38.74,7 +3423312198,39.83,4 +3423310627,42.94,3 +3423311499,54.04,7 +3423310927,35.32,6 +3423311149,62.47,4 +3423311335,39.72,7 +3423310587,47.95,9 +3423311699,40.29,7 +3423313159,51.25,5 +3423311009,51.77,3 +3423311742,32.28,8 +3423311036,44.08,5 +3423310657,52.84,4 +3423313110,48.98,3 +3423310855,57,8 +3423313310,58.22,5 +3423310474,41.38,2 +3423310738,45.01,7 +3423311816,43.28,9 +3423313686,52.78,9 +3423311939,37.25,8 +3423313937,53.88,3 +3423313731,53.84,6 +3423311583,52.7,1 +3423313180,44.56,4 +3423314253,50.16,7 +3423313910,51.76,4 +3423312417,40.61,4 +3423311564,56.47,8 +3423312172,41.94,6 +3423313036,62.92,7 +3423314076,65.48,7 +3423310533,48.82,2 +3423311504,39.13,4 +3423313413,30.27,4 +3423314166,47.01,6 +3423310846,39.04,4 +3423313644,36.6,4 +3423311550,60.09,3 +3423312431,38.5,7 +3423314140,60.46,3 +3423313339,48.26,5 +3423311193,46.74,2 +3423310674,44.85,2 +3423310813,61.38,6 +3423313385,63.88,6 +3423312894,44.43,4 +3423312546,59.64,6 +3423312324,55.07,1 +3423310582,56.72,7 +3423313956,52.33,4 +3423312579,47.04,7 +3423313263,45.59,6 +3423311374,45.47,5 +3423314207,50.18,3 +3423310871,38.85,8 +3423312920,51.57,3 +3423310763,56.38,1 +3423314218,43.52,8 +3423310714,53.46,5 +3423310625,45.74,4 +3423311755,46.48,6 +3423313620,50.63,8 +3423313898,43.06,3 +3423312192,34.77,4 +3423310485,47.34,5 +3423311469,59.12,11 +3423313663,41.22,4 +3423311211,46.41,10 +3423313639,45.58,4 +3423312415,45.28,7 +3423314069,55.43,6 +3423313569,54.32,4 +3423311805,40.04,3 +3423312851,62.46,5 
+3423312423,60.24,4 +3423310887,54.78,6 +3423314192,52.11,4 +3423313163,46.1,6 +3423310700,46.13,9 +3423311304,45.62,5 +3423312122,56.74,3 +3423314310,42.62,4 +3423312810,74.88,7 +3423314096,33.22,3 +3423313978,39.93,4 +3423313243,60.35,2 +3423314127,54.66,4 +3423312463,34.96,6 +3423310631,58.1,8 +3423311753,46.64,2 +3423313430,63,6 +3423312330,48.1,9 +3423312904,38.66,3 +3423312018,48.91,0 +3423312641,63.5,4 +3423313056,44.26,4 +3423314329,36.23,3 +3423314174,45.16,5 +3423310513,49.35,8 +3423310662,49.73,11 +3423310883,54.63,5 +3423311575,54.04,7 +3423311410,60.63,6 +3423312147,40.05,0 +3423311757,61.61,1 +3423312910,45.63,8 +3423312017,40.1,4 +3423311739,35.24,7 +3423312620,49.02,7 +3423314022,50.07,8 +3423312715,54.38,4 +3423311378,56.85,3 +3423312760,42.39,4 +3423311719,49.53,4 +3423310918,37.42,8 +3423313148,57.2,6 +3423313770,38.4,4 +3423313465,50.1,5 +3423311803,53.42,1 +3423312728,50.55,4 +3423312568,45.7,7 +3423311595,48.56,3 +3423310529,59.31,7 +3423311921,46.6,4 +3423313145,49.49,3 +3423313637,40.67,5 +3423312661,62.34,3 +3423314360,56.89,7 +3423311323,52.19,4 +3423312532,39.67,5 +3423312354,49.69,5 +3423313371,56.33,3 +3423313800,44.21,5 +3423313618,43.54,6 +3423312142,33.63,5 +3423312764,58.27,1 +3423311566,40.18,6 +3423311043,50.38,5 +3423313897,37.68,5 +3423312207,50.65,3 +3423313960,44,7 +3423314157,50.05,8 +3423313575,56.99,7 +3423312618,66.04,5 +3423311698,33.17,2 +3423310520,45.31,6 +3423313919,64.43,8 +3423311835,52.05,4 +3423310996,45.12,4 +3423310509,55.8,8 +3423312284,55.43,4 +3423311084,43.25,4 +3423312549,55.56,4 +3423312124,47.95,8 +3423311019,66.13,5 +3423313559,50.75,2 +3423310804,64.75,4 +3423310848,36.52,7 +3423310866,60.46,4 +3423311217,49.37,9 +3423313432,50.69,7 +3423312849,70.29,8 +3423311140,50.64,2 +3423312912,38.68,4 +3423313757,56.9,3 +3423312161,43.42,8 +3423312385,40,6 +3423311994,66.08,4 +3423313944,62.75,6 +3423314408,50.25,10 +3423314263,41.55,3 +3423312976,51.98,3 +3423314112,56.02,5 +3423313296,47.55,3 
+3423311050,45.86,6 +3423313014,44.31,4 +3423310637,63.6,2 +3423313325,51.62,6 +3423310910,47.76,3 +3423312596,49.68,9 +3423311922,45.02,5 +3423311021,37.79,3 +3423312853,49.27,5 +3423310751,39.67,9 +3423312208,50.9,9 +3423313406,29.54,5 +3423311222,44.2,6 +3423313423,59.43,5 +3423313358,45.78,8 +3423310459,48.82,5 +3423311846,51.48,6 +3423311894,36.11,9 +3423312958,47.52,6 +3423310624,57.27,9 +3423312959,47.95,5 +3423312197,49.19,7 +3423312143,54,4 +3423310981,52.27,7 +3423312371,62.39,4 +3423311166,48.22,5 +3423313734,71.83,5 +3423310711,42.53,3 +3423313107,41.02,9 +3423312384,39.76,8 +3423313589,54.67,4 +3423312801,65.55,3 +3423314158,56.67,5 +3423311997,58.09,3 +3423313217,45.26,11 +3423313264,48.47,6 +3423313482,60.23,2 +3423312743,39.97,3 +3423313035,67.34,1 +3423311858,67.87,3 +3423312028,29.69,6 +3423310961,54.15,5 +3423310845,51.15,3 +3423313098,50.1,5 +3423313535,68.64,3 +3423311822,54.16,3 +3423312110,43.23,3 +3423310767,63.62,4 +3423311938,52.73,3 +3423313612,64.16,7 +3423313138,50.08,3 +3423314250,47.15,3 +3423313941,49.3,1 +3423310766,43.37,4 +3423312362,49.88,5 +3423312786,55.95,8 +3423312753,47.52,8 +3423312379,56.61,6 +3423314230,53.26,6 +3423314084,55.77,5 +3423311902,44.67,7 +3423314320,55.07,4 +3423313435,39.75,3 +3423313962,65.73,5 +3423312498,49.55,3 +3423314128,59.63,4 +3423314095,54.67,5 +3423312022,56.32,8 +3423312339,44.71,6 +3423314187,45.95,3 +3423314239,54.84,6 +3423311014,47.34,6 +3423313149,45.73,3 +3423312036,48.84,7 +3423314308,57.45,6 +3423312642,60.97,9 +3423314219,50.64,4 +3423311642,52.12,5 +3423311898,34.72,5 +3423312951,53.49,4 +3423310494,56.82,3 +3423312408,50.26,5 +3423311737,44.29,6 +3423313152,29.93,4 +3423314006,58.14,7 +3423313332,43.63,8 +3423313989,52.03,4 +3423313710,55.81,6 +3423312056,69.54,7 +3423314446,35.54,6 +3423312068,37.06,8 +3423313649,44.46,4 +3423313555,53.36,11 +3423312531,54.1,8 +3423313739,41.54,8 +3423314429,37.17,2 +3423313206,48.39,5 +3423311608,51.92,7 +3423311969,60.39,7 +3423314316,40.03,4 
+3423313564,32.36,5 +3423310894,65.55,2 +3423313246,58.83,3 +3423311662,59.37,6 +3423314170,74.91,5 +3423313917,51.97,5 +3423312187,35.67,5 +3423314217,41.93,4 +3423313037,48.01,6 +3423312155,29.85,6 +3423313627,57.22,7 +3423311098,52.76,4 +3423313228,71.84,3 +3423313862,57.11,5 +3423312877,45.24,3 +3423314322,65.49,6 +3423314378,58.16,9 +3423314175,30.44,3 +3423311489,58.54,7 +3423312662,55.85,5 +3423310611,54.94,7 +3423312688,66.46,6 +3423311775,44.42,6 +3423312611,63.96,5 +3423310602,44.38,5 +3423313560,67.31,5 +3423314393,54.63,2 +3423310527,50.98,0 +3423313532,66.8,6 +3423314366,52.43,6 +3423313188,39.4,8 +3423311368,50.63,3 +3423310931,49.28,6 +3423312094,72.16,5 +3423311538,36.74,7 +3423310553,52.48,8 +3423314367,59.95,7 +3423311507,65.88,8 +3423312203,49.83,9 +3423314056,60.85,5 +3423314068,59.7,10 +3423310585,66.76,2 +3423313205,50.68,4 +3423312523,44.67,5 +3423314268,53.88,4 +3423314327,48.34,4 +3423310870,46.08,3 +3423311061,58.53,8 +3423312897,53.43,6 +3423314225,51.34,8 +3423313329,74.11,6 +3423312033,55.79,5 +3423310604,71.3,8 +3423312742,59.34,9 +3423310838,73.13,3 +3423312269,38.97,4 +3423312211,31.88,7 +3423312306,26.14,5 +3423312547,35.39,8 +3423311300,60.21,6 +3423312502,49.17,0 +3423310502,35.29,3 +3423312740,51.57,5 +3423313769,56.01,4 +3423313347,59.08,5 +3423310859,56.78,6 +3423312859,56.02,5 +3423311071,31.3,5 +3423311597,56.1,9 +3423313541,60.55,7 +3423313013,64.6,3 +3423311738,32.09,6 +3423311766,52.49,6 +3423314350,61.01,7 +3423312278,57.26,6 +3423312751,34.42,6 +3423311871,44.44,7 +3423312250,45.53,5 +3423312320,44.78,6 +3423312086,31.13,6 +3423310971,39.02,4 +3423311669,53.34,2 +3423312615,51.01,5 +3423311529,48.81,6 +3423311268,50.16,4 +3423314080,66.56,4 +3423313277,51.05,4 +3423311918,51.44,5 +3423311029,49.22,5 +3423312216,44.82,7 +3423312949,55.23,8 +3423313156,45.51,3 +3423313409,44.53,4 +3423312822,25.9,3 +3423313829,61.28,6 +3423313392,30.89,5 +3423311482,41.6,6 +3423310717,69.79,6 +3423312659,47.48,3 +3423313167,50.64,3 
+3423310821,62.55,3 +3423313250,40.97,6 +3423311779,64.38,6 +3423313681,72.33,5 +3423311399,57.8,8 +3423311230,60.3,7 +3423313873,45.25,8 +3423310740,61.46,6 +3423314252,50.14,3 +3423311353,51.31,2 +3423313307,41.42,6 +3423314412,33.65,8 +3423312710,37.09,7 +3423311851,35.76,7 +3423314302,54.8,6 +3423314092,41.47,6 +3423313608,43.54,4 +3423311727,42.72,3 +3423312840,56.12,2 +3423311967,51.91,6 +3423312554,39.26,7 +3423313973,50.81,5 +3423311210,53,5 +3423312365,43.65,8 +3423310519,58.17,5 +3423312221,58.88,5 +3423311152,49.9,6 +3423311661,49.94,3 +3423312045,63.02,3 +3423314279,41.73,4 +3423310960,58.95,2 +3423311776,52.33,6 +3423314272,49.25,7 +3423311841,71.63,6 +3423314155,53.34,8 +3423313233,40.49,5 +3423312422,35.32,2 +3423311760,41.45,3 +3423312136,61.34,5 +3423311536,43.15,8 +3423313924,42.32,5 +3423311426,30.44,4 +3423313340,56.59,6 +3423313147,45.05,4 +3423312846,47.22,7 +3423313693,34.87,4 +3423312341,47.16,4 +3423312355,44.75,3 +3423310539,41.62,9 +3423312493,47.67,4 +3423312500,51.09,7 +3423311123,54.14,8 +3423310649,74.51,5 +3423311787,51.47,7 +3423312129,29.96,6 +3423312606,35.52,6 +3423311791,51.63,7 +3423313720,31.73,8 +3423312983,65.39,5 +3423311687,41.39,10 +3423311342,43.95,5 +3423312082,54.48,8 +3423312805,58.12,8 +3423311364,53.04,4 +3423313425,43.05,4 +3423314244,56.28,7 +3423312726,42.21,8 +3423311147,35.47,3 +3423311950,42.97,8 +3423313842,70.3,2 +3423310829,41.82,3 +3423312558,63.43,10 +3423313345,45.45,3 +3423311317,56.25,6 +3423313600,41.64,1 +3423312867,49.85,5 +3423310709,44.93,5 +3423310638,45.35,6 +3423311570,66.26,3 +3423311522,56.11,1 +3423311157,40.37,4 +3423311167,27.14,4 +3423313338,46.89,6 +3423311302,52.27,1 +3423314078,40.92,6 +3423312393,58.37,2 +3423311795,32.08,4 +3423313503,50.93,6 +3423313718,67.87,2 +3423310644,59.36,7 +3423313081,47.05,5 +3423312390,59.87,4 +3423313634,32.52,10 +3423313095,43.65,2 +3423312128,49.98,4 +3423313872,61.27,6 +3423313970,40.5,4 +3423312505,62.04,5 +3423311573,50.56,3 +3423313047,31.4,7 
+3423311449,41.91,6 +3423313360,38.34,4 +3423314182,42.44,3 +3423313088,45.94,6 +3423311907,60.7,3 +3423311827,55.02,3 +3423314156,50.91,5 +3423312565,54.92,2 +3423312727,65.84,6 +3423311076,55.84,4 +3423313400,51.61,2 +3423312680,45.94,3 +3423314410,31.39,5 +3423311707,41.02,7 +3423312716,34.61,6 +3423313933,34,6 +3423312966,34.34,0 +3423314227,53.94,4 +3423311493,53.75,3 +3423313311,69.02,5 +3423311900,60.75,5 +3423312725,50.41,3 +3423314005,49.24,5 +3423313315,65.23,5 +3423314133,44.8,7 +3423312603,48.77,6 +3423313859,48.43,3 +3423312746,56.71,8 +3423311454,53.06,5 +3423313397,26.95,8 +3423310852,46.54,4 +3423313782,43.23,4 +3423314404,59.07,1 +3423311227,35.88,4 +3423311946,49.41,7 +3423314305,45,7 +3423312866,58.96,6 +3423311359,60.01,6 +3423311040,43.02,7 +3423311636,43.2,7 +3423312657,62.91,7 +3423313359,39.21,6 +3423312785,40.92,7 +3423312048,41.31,5 +3423313289,69.15,3 +3423311080,69.44,8 +3423313508,48.14,3 +3423312240,53.95,3 +3423314213,62.97,2 +3423313344,74.19,9 +3423314445,51.73,3 +3423312234,28.32,3 +3423310456,46.75,4 +3423313685,47.98,8 +3423312561,43.98,4 +3423313255,41.47,6 +3423312462,77.94,8 +3423311607,53.72,6 +3423312051,60.71,6 +3423312151,52.3,4 +3423310728,56.09,6 +3423312671,25.69,8 +3423313623,45.3,5 +3423310591,58.88,6 +3423311831,59.43,6 +3423311712,57.76,5 +3423310997,47.29,3 +3423314398,28.69,6 +3423313570,42.88,5 +3423311505,57.96,2 +3423312041,46.72,3 +3423312861,60.59,8 +3423313236,36.03,6 +3423312183,64.94,1 +3423312272,53.9,4 +3423311993,49.08,5 +3423312439,38.58,3 +3423313108,38.13,4 +3423311345,46.65,3 +3423313848,46.88,6 +3423310891,49.9,4 +3423310668,48.73,8 +3423311729,31.88,6 +3423310632,62.58,5 +3423312593,61.6,3 +3423312180,52.03,4 +3423311430,28.98,5 +3423311162,64.43,3 +3423312865,48.26,3 +3423313661,53.25,3 +3423311236,45.32,4 +3423312544,49.93,11 +3423312783,50.28,6 +3423313038,48.58,8 +3423310919,67.82,6 +3423313516,59.91,6 +3423310672,64.55,7 +3423313438,58.33,3 +3423312302,32.88,6 +3423311910,57.98,6 
+3423312970,52.67,6 +3423311195,51.51,5 +3423310817,54.46,3 +3423312796,69.63,5 +3423310827,49.92,2 +3423314053,48.66,6 +3423311190,38.6,2 +3423314172,39.93,5 +3423312713,66.74,6 +3423310656,40.44,6 +3423313652,52.86,5 +3423314050,39.75,8 +3423310968,61.33,1 +3423312251,57.2,2 +3423310911,65.91,3 +3423313169,40,9 +3423314198,49.98,4 +3423311110,66,3 +3423312748,52.84,4 +3423312598,59.6,5 +3423310844,48.82,6 +3423311027,41.85,6 +3423310791,53.52,9 +3423311752,47.65,6 +3423313901,56.01,6 +3423310722,46.18,1 +3423313007,64.8,8 +3423310973,69.64,8 +3423313677,56.23,2 +3423314288,34.12,4 +3423311478,61.8,1 +3423310702,31.13,4 +3423312582,73.18,8 +3423313997,55.61,6 +3423311241,51.84,2 +3423312443,59.76,4 +3423313696,54.84,7 +3423312357,36.31,6 +3423313719,29.94,4 +3423312621,49.01,4 +3423311194,47.93,5 +3423313162,54.86,4 +3423310719,34.3,8 +3423313189,53.1,4 +3423310583,41.62,4 +3423314070,67.55,9 +3423311294,60.29,6 +3423313566,60.44,3 +3423314134,30.15,6 +3423313146,48.98,7 +3423312206,45.56,4 +3423313090,46.39,7 +3423312712,54.84,3 +3423311360,45.67,7 +3423313454,47,5 +3423314201,40.99,7 +3423312689,50.64,5 +3423311362,53.24,3 +3423313617,61.45,9 +3423313059,48.35,3 +3423314229,33.51,4 +3423313380,47.64,5 +3423313184,31.37,5 +3423313906,41.53,5 +3423314323,56.07,6 +3423311340,54.43,4 +3423313281,65.17,4 +3423311464,50.66,6 +3423311746,56.41,7 +3423311836,49.76,5 +3423311867,44,4 +3423312419,55.67,2 +3423310914,53.45,5 +3423312944,48.64,6 +3423311143,39.16,5 +3423313835,43.46,6 +3423313000,48.79,5 +3423310732,51.7,5 +3423313427,34.76,5 +3423311884,41.93,6 +3423310508,64.32,4 +3423313076,45.64,6 +3423312446,49.3,5 +3423313033,41.5,4 +3423313115,54.82,5 +3423310902,48.47,4 +3423311492,43.07,1 +3423313113,64.36,6 +3423311034,66.9,5 +3423313372,60.9,6 +3423311132,42.6,5 +3423312037,60.85,5 +3423312792,66.57,2 +3423314236,53.67,6 +3423311113,65.02,11 +3423313097,43.18,4 +3423313253,52,3 +3423312643,54.1,4 +3423313748,47.57,6 +3423311682,49.57,5 +3423312749,44.84,7 
+3423312779,55.73,1 +3423310921,37.3,5 +3423312218,55.44,5 +3423314104,47.04,4 +3423314171,51.36,9 +3423313808,70.59,2 +3423311226,56.02,6 +3423312874,46.99,7 +3423314115,53.64,3 +3423310907,39.17,4 +3423314369,39.19,8 +3423314193,52.8,7 +3423311508,55.62,3 +3423311553,43.95,3 +3423312105,39.31,6 +3423310990,42.46,5 +3423313362,47.59,4 +3423312085,40.71,5 +3423310771,31.62,2 +3423311606,31.14,9 +3423314047,39.21,6 +3423311526,50.7,7 +3423311339,47.91,4 +3423311138,54.47,4 +3423313529,47.52,2 +3423312934,65.2,7 +3423311657,59.48,4 +3423312789,62.58,5 +3423313511,50.47,6 +3423314083,53.41,3 +3423314093,83.42,5 +3423311116,58.78,3 +3423313290,41.65,7 +3423313023,53,3 +3423313826,56.12,0 +3423313172,37.54,5 +3423314027,46.08,7 +3423311961,34.55,7 +3423314295,46.45,4 +3423312811,66.17,6 +3423312869,44.1,2 +3423311860,56.24,5 +3423311652,53.48,5 +3423313085,70.32,4 +3423311181,46.94,6 +3423313899,36.98,6 +3423311793,49.3,3 +3423311035,27.04,6 +3423311048,45.28,6 +3423313244,50.03,2 +3423311876,68.13,2 +3423310986,26.78,2 +3423311305,58.24,5 +3423314062,46.98,6 +3423312616,34.04,9 +3423310808,53.49,4 +3423311991,44.63,2 +3423312238,51.27,6 +3423311460,62.51,4 +3423313642,42.25,5 +3423313616,39.53,6 +3423311957,46.59,4 +3423311767,27.8,6 +3423311255,52.12,7 +3423310601,49.26,7 +3423313447,55.56,5 +3423310756,40.53,4 +3423312294,53.96,1 +3423311691,44.44,7 +3423313755,61.23,3 +3423312095,32.67,3 +3423313916,36.99,3 +3423313312,49.41,4 +3423313286,57.89,6 +3423311291,50.86,5 +3423312353,57.64,8 +3423310943,51.43,5 +3423311213,37.73,3 +3423314183,58.56,3 +3423312428,45.26,4 +3423314399,37.1,6 +3423310952,44.41,3 +3423312511,50.62,7 +3423311701,55.14,1 +3423314222,39.01,5 +3423313931,52.11,7 +3423312522,64.48,5 +3423313074,48.69,6 +3423310691,57.18,5 +3423312520,59.04,6 +3423312589,48.86,7 +3423313528,59.12,7 +3423312414,45.86,7 +3423311161,60.78,0 +3423312541,56.91,5 +3423312451,34.95,0 +3423312140,47.84,4 +3423312275,48.14,2 +3423312694,44.89,1 +3423313536,65.76,9 
+3423312372,41.39,8 +3423311480,66.81,5 +3423313234,58.92,3 +3423313789,46.72,7 +3423312256,44.12,2 +3423312650,53.59,2 +3423311170,60.9,5 +3423312063,38.06,7 +3423311251,64.21,7 +3423312194,56.61,8 +3423310522,48.03,5 +3423312989,48.63,4 +3423312280,46.73,5 +3423312562,51.97,5 +3423312775,60.65,5 +3423313125,54.41,6 +3423311078,65.88,5 +3423313324,38.04,4 +3423311932,26.27,4 +3423313665,60.05,6 +3423313858,50.65,4 +3423312329,50.07,4 +3423311572,44.68,1 +3423310841,69.88,7 +3423313411,48.53,6 +3423312731,61.06,8 +3423313069,54.4,4 +3423312252,65.32,3 +3423313892,48.91,2 +3423311141,47.33,3 +3423313636,49.7,3 +3423312436,35.97,7 +3423314216,55.46,5 +3423312411,49.48,4 +3423310858,45.89,8 +3423311387,45.69,3 +3423312923,70.45,6 +3423312868,58.45,6 +3423311514,65.42,6 +3423311391,38.21,5 +3423314130,42.86,4 +3423312116,44.62,2 +3423311925,58.27,6 +3423312169,34.84,8 +3423314009,56.96,7 +3423312134,58.07,11 +3423311715,51.52,8 +3423313283,36.76,7 +3423310449,51.86,4 +3423313955,57.16,4 +3423313602,48.67,3 +3423310925,59,4 +3423311068,54.36,2 +3423313609,41.52,4 +3423311370,46.37,5 +3423310457,51.91,6 +3423310579,65.6,6 +3423310762,61.92,4 +3423313963,40.24,5 +3423311063,65.11,5 +3423311367,26.3,7 +3423310884,41.87,6 +3423311491,51.6,6 +3423310540,43.69,4 +3423311528,63.9,7 +3423314251,72.04,3 +3423313742,50.67,4 +3423312377,57.78,1 +3423313572,69.93,3 +3423312948,56.26,6 +3423311175,44.91,7 +3423313136,52.01,7 +3423311549,41.83,5 +3423310882,49.27,5 +3423312478,40.09,5 +3423314433,75.09,5 +3423310775,44.64,4 +3423313522,41.63,6 +3423314368,54.41,5 +3423311794,27.46,5 +3423312936,52.21,5 +3423311873,74.91,4 +3423314044,25.48,5 +3423314341,52.75,4 +3423312000,45.21,7 +3423310904,49.97,6 +3423311756,55.33,5 +3423314137,30.34,6 +3423314196,53.01,4 +3423314309,58.27,6 +3423311954,41.23,6 +3423311196,46.1,4 +3423313070,60.1,5 +3423311037,51.02,7 +3423313517,52.59,4 +3423312038,43.26,2 +3423310940,50.62,8 +3423312137,46.51,8 +3423310462,49.64,5 +3423313393,50.67,4 
+3423312664,65.59,9 +3423311813,45.58,7 +3423313825,55.72,5 +3423311487,44.47,4 +3423314267,55.88,5 +3423311658,47.4,6 +3423313904,64.15,8 +3423312881,61.37,1 +3423311315,35.09,4 +3423314332,47.59,5 +3423310957,53.96,4 +3423311176,63.61,5 +3423312004,45.05,4 +3423313980,43.89,5 +3423313080,57.82,7 +3423311518,50.96,3 +3423311870,42.94,5 +3423312955,67.08,1 +3423313728,49.88,6 +3423314087,37.35,5 +3423313845,53.29,6 +3423314292,37.04,5 +3423311515,50.28,4 +3423312613,50.96,6 +3423311229,46.38,6 +3423311990,34.79,4 +3423312925,38.92,7 +3423311579,46.54,4 +3423314141,62.24,10 +3423313295,50.96,1 +3423313846,53.23,6 +3423313429,53.89,2 +3423314277,74.41,5 +3423311280,50.85,5 +3423312361,62.67,1 +3423311092,36.33,8 +3423314246,50.48,8 +3423312053,43.32,4 +3423314266,50.78,2 +3423312440,49.62,4 +3423310993,52.6,4 +3423314132,45.29,9 +3423314415,64.98,3 +3423314311,41.82,4 +3423310924,55.3,4 +3423311088,52.57,7 +3423313068,51.53,2 +3423311169,46.71,4 +3423313885,52.3,3 +3423314374,43.69,6 +3423311297,29.7,6 +3423313705,53.59,5 +3423313809,48.51,8 +3423314136,29.01,3 +3423314417,56.14,9 +3423312179,56.83,5 +3423314188,59.93,7 +3423314178,52.72,4 +3423311307,59.63,3 +3423314261,32.63,4 +3423313483,26.62,5 +3423312430,66.95,5 +3423312782,35.24,2 +3423313218,66.96,4 +3423312702,54.87,6 +3423310956,70.04,1 +3423311083,45.11,6 +3423310518,44.77,2 +3423310905,75.68,7 +3423312261,53.45,5 +3423313171,55.01,6 +3423311632,34.24,4 +3423310873,52.42,5 +3423314048,37,5 +3423311100,49.14,6 +3423311025,49.35,6 +3423312389,32.54,8 +3423313964,52.15,4 +3423310640,47.49,5 +3423310761,54.5,5 +3423312548,31.58,6 +3423314180,59.91,7 +3423313595,34.02,6 +3423311119,57.14,2 +3423312513,38.27,8 +3423314226,33.56,7 +3423311012,57.74,4 +3423310563,51.29,5 +3423311329,51.87,4 +3423312580,41.31,5 +3423313650,16.39,8 +3423314007,56.55,5 +3423310953,41,6 +3423312901,31.72,2 +3423313977,63.75,4 +3423311392,67.15,7 +3423313881,33.66,8 +3423313248,40.32,6 +3423312295,52.41,3 +3423310792,56.56,4 
+3423313040,50.32,5 +3423312464,50.72,5 +3423311097,25.17,6 +3423313550,45.32,6 +3423312581,45.15,7 +3423312006,38.63,4 +3423313078,63.43,4 +3423311049,32.45,6 +3423312141,34.68,7 +3423314077,49.89,2 +3423312204,66.63,9 +3423313333,34.11,5 +3423312425,56.86,4 +3423310862,46.14,5 +3423312326,69.49,1 +3423311101,67.46,4 +3423313455,37.83,7 +3423310822,52.81,5 +3423311639,35.42,3 +3423313073,48.84,4 +3423313491,58.43,2 +3423314363,37.76,5 +3423313969,29.88,4 +3423313377,49.48,2 +3423312551,36.84,5 +3423310452,53.5,5 +3423312876,48.95,2 +3423311926,51.91,6 +3423311804,72.14,6 +3423311436,51.91,6 +3423312585,32.58,6 +3423310499,52.68,3 +3423310654,58.94,5 +3423313695,63.95,3 +3423313863,36.88,5 +3423310840,57.25,3 +3423313993,44.51,6 +3423312512,58.4,7 +3423314411,38.4,4 +3423311477,44.95,5 +3423312121,47.87,4 +3423314013,35.83,6 +3423310842,24.58,4 +3423312202,48.71,5 +3423310922,56.67,3 +3423312570,48.72,6 +3423311864,47.6,1 +3423313191,55.05,6 +3423311347,62.81,6 +3423314139,50.86,4 +3423311111,51.63,4 +3423313656,27.83,5 +3423314394,50.4,7 +3423310708,53.69,5 +3423313684,48.17,4 +3423314441,38.25,6 +3423313660,54.66,3 +3423314079,42.41,7 +3423314428,52.04,1 +3423313659,67.41,4 +3423313280,50.71,7 +3423310642,48.45,6 +3423311811,58.03,4 +3423311758,64.12,4 +3423312369,41.6,7 +3423310815,42.32,6 +3423313811,69.72,4 +3423310835,60.39,7 +3423311937,57.86,6 +3423311908,58.88,6 +3423312225,54.94,7 +3423312019,48.64,4 +3423310948,69.66,4 +3423311745,55.4,9 +3423313192,42.27,4 +3423313093,58.67,1 +3423310564,58.64,1 +3423310469,49.85,8 +3423312930,37.14,2 +3423314313,33.95,5 +3423312153,40.72,6 +3423311890,56.34,6 +3423313118,72.41,7 +3423310577,48.52,7 +3423312288,68.6,4 +3423313065,61.86,7 +3423312040,52.59,5 +3423313257,50.94,6 +3423313982,58.56,7 +3423311266,62.48,7 +3423311945,27.48,5 +3423312627,37.26,1 +3423313292,70.3,3 +3423312476,42.54,5 +3423312752,54.23,5 +3423313309,52.43,11 +3423312973,54.87,4 +3423313913,33.42,4 +3423314098,67.51,7 +3423312506,52.69,1 
+3423311024,55.16,7 +3423313055,52.11,4 +3423312552,54.27,6 +3423310972,56.07,5 +3423312403,46.64,5 +3423310984,39.64,4 +3423314113,43.58,7 +3423311407,54.64,5 +3423311363,39.87,4 +3423310679,52.7,7 +3423312609,45.53,4 +3423311933,50.34,6 +3423310619,53.39,4 +3423310741,44.5,8 +3423312588,62.6,6 +3423312347,53.76,4 +3423311279,54.1,7 +3423312993,56.2,6 +3423312809,49.33,7 +3423312447,72.9,7 +3423313414,45.83,2 +3423313351,57.25,0 +3423312667,66.67,5 +3423312781,34.86,3 +3423310724,51.34,6 +3423312344,49.93,5 +3423314059,49.43,7 +3423313678,49.56,4 +3423313585,64.63,8 +3423313820,68.18,5 +3423312875,56.45,6 +3423310733,35.82,5 +3423311979,51.35,6 +3423310861,48.34,5 +3423313382,45.94,6 +3423314116,42.37,5 +3423310669,60.81,6 +3423311288,36.82,5 +3423310558,47.77,6 +3423313766,59.02,3 +3423314090,60.99,4 +3423312521,63.68,4 +3423312768,40.54,5 +3423311065,47.81,6 +3423310872,49.72,4 +3423310559,58.11,7 +3423310895,60.73,5 +3423312108,47.39,7 +3423311321,63.15,0 +3423312101,71.55,4 +3423314264,55.78,7 +3423313680,44.77,3 +3423313368,36.18,5 +3423314012,55.18,5 +3423312646,67.74,1 +3423312434,59.16,8 +3423312308,55.51,8 +3423312343,40.01,1 +3423310574,49.71,6 +3423313410,56.49,4 +3423313697,55.05,7 +3423311366,57.91,7 +3423312181,50.49,3 +3423313561,56.18,6 +3423312935,51.35,2 +3423311117,48.29,5 +3423311519,30.16,5 +3423312906,57.01,7 +3423313869,42.9,6 +3423310580,64.85,6 +3423314347,61.21,7 +3423310463,31.6,5 +3423313348,43.61,3 +3423312586,35.6,1 +3423312109,55.47,9 +3423313470,37.15,7 +3423313579,48.09,5 +3423310544,49.65,11 +3423311672,31.25,4 +3423313717,54,6 +3423313905,47.68,4 +3423310712,48.19,6 +3423314200,37.64,7 +3423311614,63.66,1 +3423311948,34.94,5 +3423313468,34.41,4 +3423314258,47.46,2 +3423312016,60.78,6 +3423314152,55.58,4 +3423311376,44.49,4 +3423313879,36.07,5 +3423312873,49.26,7 +3423313591,64.9,6 +3423313925,53.25,3 +3423311358,54.25,7 +3423313279,48.47,3 +3423313539,50.56,5 +3423311439,45.67,7 +3423313251,59.67,3 +3423311629,47.77,5 
+3423311319,51.2,9 +3423311648,52.57,5 +3423313967,56.52,7 +3423313747,63.71,6 +3423312837,63.72,8 +3423313422,65.08,3 +3423312668,48.42,5 +3423313027,41.08,2 +3423312933,46.2,5 +3423312686,47.74,7 +3423312177,50.78,5 +3423313945,62.3,5 +3423310497,38.42,2 +3423310900,51.55,6 +3423312008,41.31,6 +3423311695,57.2,5 +3423311676,51.97,7 +3423314220,34.74,4 +3423310969,62.81,4 +3423311924,55.97,4 +3423311819,54.99,6 +3423312495,58.13,4 +3423312947,66.86,6 +3423310832,45.03,11 +3423310681,53.54,7 +3423311287,23.2,8 +3423313765,64.13,6 +3423313082,53.11,3 +3423311284,62.58,8 +3423311944,57.68,2 +3423314317,29.13,5 +3423311232,58.83,5 +3423313150,44.83,7 +3423313804,40.46,7 +3423310941,58.92,4 +3423311559,56.04,6 +3423312459,41.14,6 +3423312152,48.35,7 +3423313341,56.17,1 +3423313261,47.57,5 +3423312035,44.22,5 +3423311649,52.85,4 +3423313453,41.15,7 +3423311404,47.82,4 +3423310496,66.89,3 +3423311440,43.25,7 +3423313216,74.47,5 +3423312409,47.43,2 +3423313017,49.34,3 +3423311262,56.37,3 +3423313357,50.64,4 +3423314380,60.36,5 +3423313580,45.74,5 +3423313823,60.32,5 +3423313026,59.08,5 +3423312754,40.87,5 +3423311269,63.55,1 +3423312622,54.65,4 +3423311824,53.47,4 +3423314031,41.7,5 +3423312334,55.76,7 +3423311722,53.33,5 +3423313798,49.58,4 +3423312524,40.9,5 +3423311289,54.32,5 +3423311883,52.11,11 +3423312961,54.34,3 +3423312950,41.75,5 +3423310989,49.19,5 +3423313442,46.8,6 +3423313958,30.97,8 +3423310903,42.42,4 +3423313204,30.31,6 +3423313736,32.95,10 +3423313987,58.13,6 +3423312043,64.32,3 +3423310608,39.93,4 +3423311018,40.63,3 +3423313240,48.81,6 +3423311952,40.34,2 +3423314340,54.29,9 +3423311684,47.92,7 +3423311762,55.11,6 +3423312277,58.93,6 +3423311588,34.04,6 +3423311295,50.3,8 +3423312892,51.38,4 +3423311424,42.02,1 +3423311919,39.33,6 +3423310879,49.76,7 +3423311679,46.96,1 +3423313557,58.61,5 +3423310998,32.53,4 +3423313176,38.94,8 +3423312888,46.81,9 +3423310701,50.56,5 +3423313016,37.57,6 +3423312117,49.86,5 +3423312293,53.57,4 +3423313861,35.15,5 
+3423314035,36.91,6 +3423310954,60.04,5 +3423311733,41.25,3 +3423310584,49.53,7 +3423310511,44.54,7 +3423310876,53.61,3 +3423312771,62.81,7 +3423313743,49.22,8 +3423311130,52.34,7 +3423313183,55.11,3 +3423310571,50.03,9 +3423312518,54.39,3 +3423312761,47.56,5 +3423314346,39.39,8 +3423313451,46.85,7 +3423313552,37.68,7 +3423312998,50.56,5 +3423314190,179.22,95 +3423314144,192.34,69 +3423314442,140.25,92 +3423313001,184.28,70 +3423311047,200.58,50 +3423312780,197.99,95 +3423313932,199.81,32 +3423310689,168.45,70 +3423312919,197.87,74 +3423314356,172.08,87 +3423310586,164.31,45 +3423313042,162.56,56 +3423312098,165.2,74 +3423313466,211.36,5 +3423310545,157.51,59 +3423311847,208.47,26 +3423311620,180.95,85 +3423312577,167.83,70 +3423313198,144.23,49 +3423312363,163.82,79 +3423311045,186.28,4 +3423310933,182.21,72 +3423313498,149.08,78 +3423312553,171.83,50 +3423313412,212.8,51 +3423311630,185.3,49 +3423313019,170.11,49 +3423312418,175.82,80 +3423311974,234.52,88 +3423312470,164.03,86 +3423312832,200.77,79 +3423313399,168.47,56 +3423314334,184.79,89 +3423311956,158.61,71 +3423314293,192.11,68 +3423313331,176.49,82 +3423312723,190.97,76 +3423314086,175.93,53 +3423311402,157.99,44 +3423313142,197.56,57 +3423313018,140.32,56 +3423313048,162.56,23 +3423313646,181.12,58 +3423312691,197.26,76 +3423314248,156.85,58 +3423311901,164.13,99 +3423311468,187.37,86 +3423314100,174.32,28 +3423311601,145.21,96 +3423313433,178.05,29 +3423313784,183.7,74 +3423312054,121.07,57 +3423313479,171.16,51 +3423313821,148,66 +3423314285,162.93,75 +3423313505,163.17,91 +3423310597,184.96,65 +3423313168,178.28,71 +3423312770,190.13,52 +3423310962,167.55,70 +3423313779,176.01,70 +3423312312,157.02,21 +3423313902,173.72,84 +3423311160,224.47,52 +3423310942,219.46,75 +3423310464,156.79,46 +3423312665,193.01,43 +3423311784,191.2,70 +3423311425,183.98,61 +3423312435,181.04,58 +3423314343,192.37,99 +3423311627,185.03,41 +3423310923,202.94,80 +3423313428,178.47,99 +3423312913,213.93,47 
+3423311790,188.36,63 +3423310671,164.08,71 +3423311382,173.91,75 +3423310860,175.29,34 +3423313827,208.65,45 +3423310643,212.49,25 +3423310938,183.78,56 +3423312233,162.3,74 +3423312899,164.83,53 +3423311897,172.53,76 +3423314328,179.26,79 +3423312184,187.12,99 +3423314358,191.47,93 +3423312173,202.02,38 +3423314384,188.06,75 +3423312303,201.54,90 +3423314396,165.23,57 +3423311005,183.92,29 +3423313215,166.22,77 +3423313020,167.75,41 +3423311010,219.54,39 +3423312601,209.43,64 +3423313496,145.28,91 +3423312088,185.5,67 +3423311039,168.77,55 +3423312649,174.41,36 +3423313853,169.45,67 +3423311723,159.2,46 +3423314000,184.76,56 +3423311845,228.21,61 +3423314017,151.81,67 +3423311612,158.47,100 +3423312623,186.03,65 +3423312230,169.85,99 +3423313271,154.52,51 +3423310898,198.3,99 +3423311917,149.71,78 +3423313436,182.53,99 +3423313456,204.62,38 +3423313513,224.94,27 +3423313871,177.2,87 +3423310955,182.4,83 +3423314016,189.88,97 +3423310913,214.17,73 +3423311064,159.8,58 +3423313485,150.8,8 +3423312898,149.55,7 +3423314299,154.72,19 +3423311602,188.1,15 +3423313449,151.59,4 +3423312138,209.27,11 +3423311828,184.26,7 +3423314409,184.77,15 +3423311384,168.27,14 +3423311995,178.52,17 +3423312445,190.16,12 +3423312991,176.3,18 +3423312690,195.01,10 +3423310934,176.1,5 +3423313224,186.4,19 +3423310543,158.63,13 +3423312986,190.1,6 +3423312380,198.1,3 +3423311316,165.93,1 +3423311580,162.35,13 +3423311849,203.98,11 +3423313137,177.21,16 +3423310850,192.25,12 +3423312975,164.47,7 +3423312375,210.88,12 +3423311623,171.76,12 +3423312075,180.06,8 +3423311965,166.36,1 +3423312879,203.85,12 +3423311357,200.37,1 +3423311663,149.51,10 +3423313194,180.26,15 +3423312653,203.91,12 +3423313245,203.13,11 +3423310515,139.35,6 +3423311545,158.88,20 +3423312131,194.37,15 +3423312855,170.38,16 +3423310635,179.61,15 +3423311783,221.45,3 +3423313813,181.05,4 +3423311486,179.97,0 +3423311200,192.78,7 +3423310988,174.71,9 +3423314148,168.64,5 +3423311888,156.17,17 +3423312645,186.51,15 
+3423314091,173.21,14 +3423313153,166.02,12 +3423312021,159.14,8 +3423312378,207.13,13 +3423310621,183.67,14 +3423310705,171.29,14 +3423312030,200.64,10 +3423311252,179.25,12 +3423313583,177.43,10 +3423311314,180.8,9 +3423312909,166.45,17 +3423311094,156.41,10 +3423310951,172.5,5 +3423311093,158.02,8 +3423312298,179.76,8 +3423310982,163.26,13 +3423313461,177.75,12 +3423313500,184.44,5 +3423313141,187.98,15 +3423311960,146.8,18 +3423311582,196.9,1 +3423310774,171.24,1 +3423314424,220.23,2 +3423311718,200.41,7 +3423311882,207.52,10 +3423311826,183.1,14 +3423311708,175.04,10 +3423313928,173.26,19 +3423313714,159.7,11 +3423311235,175.37,18 +3423312857,198.44,3 +3423313231,168.76,5 +3423311179,165.87,3 +3423313268,145.73,9 +3423312291,179.68,0 +3423313761,184.94,11 +3423311325,227.52,7 +3423311144,149.36,9 +3423313981,182.32,15 +3423312338,200.6,18 +3423314124,183.21,13 +3423313613,195.43,9 +3423311223,185.17,4 +3423312352,179.33,5 +3423311861,196.17,12 +3423312333,183.14,9 +3423311557,156.71,17 +3423312682,174.79,10 +3423313256,174.48,4 +3423312469,220.39,7 +3423310987,189.07,4 +3423313628,152.64,16 +3423313622,185.33,8 +3423313421,176.88,7 +3423312307,171.65,15 +3423310727,166.33,8 +3423313664,163.94,0 +3423313221,192.77,14 +3423311531,206.13,9 +3423312066,173.2,16 +3423311751,171.41,12 +3423312900,202.04,7 +3423310839,168.84,7 +3423312453,178.48,7 +3423312420,188.51,13 +3423312647,187.13,19 +3423311044,192.89,15 +3423314307,163.5,9 +3423313669,174.78,14 +3423311854,182.67,9 +3423314163,193.88,9 +3423313507,147.55,12 +3423310917,182.63,12 +3423313276,169.78,15 +3423310809,225.68,2 +3423310901,206.08,15 +3423312410,154.75,13 +3423311385,165.07,8 +3423312530,197.07,14 +3423312964,183.97,11 +3423314114,165.07,4 +3423310995,213.82,7 +3423313754,155.84,13 +3423312084,188.57,9 +3423314119,199.02,9 +3423312984,189.61,16 +3423311962,178.34,9 +3423313988,187.46,7 +3423313877,193.3,7 +3423310707,169.02,4 +3423311963,165.71,6 +3423312960,169.48,1 +3423310864,174.91,12 
+3423313008,171.99,13 +3423312135,179.52,8 +3423312482,172.02,14 +3423313471,161.32,9 +3423312139,164.26,11 +3423310683,206.47,21 +3423313504,158.71,9 +3423311936,207.08,4 +3423310748,177.63,7 +3423313439,193.4,12 +3423312336,190.06,19 +3423311603,171.15,5 +3423314074,198.13,16 +3423311209,160.23,4 +3423313151,167.46,14 +3423310479,195.66,4 +3423312676,175.16,13 +3423312405,194.29,10 +3423311286,188.86,6 +3423313856,182.61,10 +3423313986,180.45,4 +3423310487,191.98,10 +3423313554,184.19,14 +3423312758,211.8,9 +3423310992,199.29,21 +3423313961,201.79,1 +3423312535,187.37,14 +3423311326,215.94,8 +3423311327,182.43,9 +3423313781,210.69,2 +3423311485,212.54,13 +3423311484,205.43,7 +3423311748,185.52,16 +3423313401,188.19,3 +3423313934,179.74,11 +3423313398,217.21,8 +3423311308,170.23,13 +3423313625,180.18,11 +3423310532,191.03,1 +3423312201,199.33,9 +3423313682,206.15,14 +3423312247,170.85,16 +3423310798,137.67,4 +3423311651,197.27,14 +3423312210,148.83,1 +3423312441,181.67,8 +3423312442,166.1,18 +3423311717,185.73,3 +3423314260,184.66,8 +3423314351,175.8,7 +3423314065,189.2,4 +3423313764,205.78,9 +3423312766,156.87,16 +3423314249,185.54,8 +3423311475,186.14,11 +3423311765,183.3,16 +3423310967,190.39,13 +3423310568,166.15,11 +3423314318,162.71,10 +3423314102,189.62,8 +3423311183,212.42,7 +3423311592,186.3,9 +3423310480,152.55,17 +3423312831,174.29,10 +3423313683,146.35,16 +3423311445,134.81,20 +3423314033,191.22,8 +3423311706,187.7,17 +3423312681,219.29,17 +3423313481,206.59,10 +3423314259,218.79,8 +3423311033,171.7,14 +3423310857,168.33,10 +3423313712,160.71,11 +3423312396,217.76,4 +3423312076,140.49,7 +3423311955,170.73,11 +3423313855,196.13,17 +3423311409,170.58,22 +3423310455,160.52,19 +3423312939,181.77,18 +3423310569,195.7,12 +3423311156,159.86,9 +3423312526,162.96,22 +3423314126,177.53,4 +3423313057,179.4,2 +3423314443,156.38,6 +3423314377,196.12,8 +3423311736,204.96,7 +3423313390,184.1,19 +3423314290,186.88,9 +3423311216,193.86,6 +3423313144,171.51,8 
+3423313888,169.25,9 +3423313495,155.86,6 +3423314001,176.54,3 +3423313568,168.36,13 +3423311964,149.91,5 +3423310795,189.38,6 +3423313170,194.15,12 +3423312905,186.47,10 +3423313796,185.5,13 +3423314314,186.69,15 +3423312304,179.23,6 +3423313586,158.05,8 +3423312607,189.13,6 +3423313005,181.82,6 +3423314041,150.31,14 +3423311054,177.95,8 +3423313836,160.49,6 +3423311228,184.62,6 +3423312917,197.02,8 +3423312878,184.32,11 +3423310966,195.47,14 +3423313957,191.27,8 +3423313526,208.19,17 +3423310659,178.37,6 +3423311893,169.24,4 +3423310826,166.62,14 +3423313626,189.78,17 +3423314142,145.39,15 +3423313998,164.47,10 +3423314344,163.39,6 +3423310660,159.11,15 +3423312457,169.56,17 +3423313984,188.25,18 +3423311547,184.12,7 +3423312987,199.37,7 +3423310600,175.81,8 +3423312222,175.06,9 +3423310650,190.85,9 +3423310556,188.89,1 +3423310592,192.77,14 +3423312237,179.24,7 +3423311437,170.86,7 +3423312060,171.29,19 +3423310451,168.9,5 +3423311896,161.76,5 +3423313094,169.54,12 +3423312243,173.72,10 +3423311562,180.98,8 +3423313520,181.89,17 +3423314010,172.85,13 +3423313028,244.79,9 +3423311091,176.79,8 +3423313938,148.44,7 +3423312195,216.66,9 +3423313540,161.14,10 +3423311343,183.38,3 +3423313354,175.29,11 +3423311273,167.02,10 +3423312090,234.09,7 +3423312630,168.98,14 +3423312656,242.37,15 +3423313614,160.78,11 +3423313744,190.56,11 +3423311207,195.33,14 +3423312318,196.23,4 +3423313440,209.76,15 +3423311667,172.78,5 +3423313419,201.99,15 +3423311675,162.56,13 +3423310772,191.81,7 +3423312376,170.72,1 +3423311577,189.45,21 +3423312199,189.77,16 +3423313874,201.69,14 +3423312563,191.59,12 +3423312125,154.93,14 +3423310468,149.19,16 +3423311927,165.07,12 +3423313367,156.57,7 +3423310782,162.34,9 +3423313563,177.82,12 +3423311895,205.07,15 +3423310802,155.83,10 +3423311711,181.53,13 +3423311124,195.87,3 +3423314160,170.3,6 +3423314439,167.89,15 +3423313394,187.17,7 +3423313043,153.81,18 +3423311524,164.53,7 +3423310488,174.81,13 +3423310976,152.1,10 +3423310636,157.26,18 
+3423313985,203.03,10 +3423312253,172.78,8 +3423310561,164.95,11 +3423312228,182.99,5 +3423311248,170.9,0 +3423314330,150.57,15 +3423313249,187.45,4 +3423313241,135.98,14 +3423311306,187.55,5 +3423311567,241.71,22 +3423312971,168.04,12 +3423312695,156.01,14 +3423313691,189.75,9 +3423311458,178.42,3 +3423313707,211.26,6 +3423312027,171.7,12 +3423310828,145.07,6 +3423312721,194.55,6 +3423313893,157.26,10 +3423311743,153.34,9 +3423314407,193.78,6 +3423313437,200.72,20 +3423310658,193.21,14 +3423312224,176.23,16 +3423313615,172.11,12 +3423312314,193.31,8 +3423310688,174.3,12 +3423313653,164.4,10 +3423313284,162.68,5 +3423311218,173.55,3 +3423311168,172.15,8 +3423311418,202.84,17 +3423314177,197.95,10 +3423312767,167.21,5 +3423313391,136.52,13 +3423312310,199.92,0 +3423311221,152.09,24 +3423311476,170.87,15 +3423312902,137.16,16 +3423313237,164.38,8 +3423313946,212.41,12 +3423312327,163.36,15 +3423311151,158.31,4 +3423310819,162.26,14 +3423313220,211.99,6 +3423311013,160.26,20 +3423313039,228.96,13 +3423311264,182.69,11 +3423310715,162.8,11 +3423311136,193.58,16 +3423314138,161.26,10 +3423314447,201.12,19 +3423312953,181.35,13 +3423311631,168.64,9 +3423310916,175.58,9 +3423311560,160.2,9 +3423313109,192.22,6 +3423311296,197.6,10 +3423311224,189.82,7 +3423311770,205.8,13 +3423311438,193.33,12 +3423311479,132.03,15 +3423313791,190.33,14 +3423310834,173.9,11 +3423313114,179.11,6 +3423312157,209.64,7 +3423313638,161.9,15 +3423313700,166.62,9 +3423313308,161.64,10 +3423312812,162.9,10 +3423311186,178.88,10 +3423314147,171.97,12 +3423311041,179.56,17 +3423312058,188.59,16 +3423313581,151.09,13 +3423312594,194.87,8 +3423312394,200.21,5 +3423310742,196.39,6 +3423313643,193.05,11 +3423312587,183.61,6 +3423310517,166.8,8 +3423311204,190.45,14 +3423311797,184.29,7 +3423313629,197.44,15 +3423313704,170.72,7 +3423312342,168.56,5 +3423312438,168.22,9 +3423313883,165.22,16 +3423314342,143.37,10 +3423311159,224.24,16 +3423310500,173.49,8 +3423314197,190.04,16 +3423312100,178.39,11 
+3423313573,194.68,2 +3423313694,170.72,1 +3423313317,181.64,17 +3423312856,207.34,9 +3423312624,189.85,9 +3423313640,142.83,5 +3423311798,168.14,1 +3423313975,161.94,9 +3423314286,151.65,1 +3423310721,177.29,10 +3423311324,173.02,3 +3423312972,182.6,10 +3423311856,208.78,11 +3423311344,160.5,14 +3423311104,170.45,1 +3423312762,164.37,7 +3423312756,180.1,14 +3423313337,192.31,12 +3423312937,167.88,6 +3423310698,231.57,13 +3423310626,189.68,10 +3423311481,192.61,10 +3423313606,218.21,11 +3423311435,168.88,14 +3423312599,150,8 +3423311638,178.17,10 +3423311178,190.84,1 +3423312636,211.34,15 +3423312634,161.98,9 +3423311786,208.42,7 +3423312042,185.7,17 +3423314072,165.15,9 +3423310498,166.68,14 +3423314312,213.12,3 +3423312850,217.91,3 +3423314082,159.47,22 +3423310749,176.44,5 +3423311239,173.35,6 +3423311740,194.1,4 +3423312281,176.01,16 +3423310507,193.08,9 +3423313935,180.84,21 +3423313478,191.52,9 +3423312282,141.96,15 +3423313549,154.33,3 +3423312945,185.15,10 +3423311356,185.38,7 +3423313715,208.96,18 +3423312893,176.97,4 +3423312750,162.58,1 +3423310667,184.14,9 +3423311587,154.74,1 +3423310746,180.91,11 +3423311388,184.74,1 +3423314145,186.24,4 +3423313177,181.3,11 +3423311371,180.36,13 +3423311276,163.72,6 +3423314241,177.04,6 +3423313302,154.59,19 +3423313605,196.37,17 +3423312167,219.84,15 +3423313370,189.97,3 +3423311444,168.09,5 +3423312424,190.94,20 +3423313571,192.12,16 +3423312171,218.67,1 +3423312025,188.26,14 +3423310725,200.32,9 +3423313100,188.22,12 +3423310651,207.01,18 +3423310641,173.38,11 +3423310806,161.51,5 +3423313651,169.75,5 +3423310653,169.66,11 +3423314298,150.46,14 +3423313949,176.96,23 +3423310560,200.62,8 +3423312538,197.08,14 +3423313034,196.95,11 +3423314125,116.58,4 +3423314042,174.58,5 +3423311122,187.16,11 +3423313801,184.46,4 +3423314186,170.72,15 +3423312739,168.24,10 +3423313222,187.67,17 +3423314143,218.58,9 +3423312738,155.02,9 +3423313996,153.84,9 +3423313599,180.65,16 +3423312450,168.76,7 +3423312864,153.5,1 
+3423311401,159.53,15 +3423312501,207.44,13 +3423313320,229.3,15 +3423313914,180.6,13 +3423311915,165.09,4 +3423311600,205.4,11 +3423311911,171.19,10 +3423311432,188.24,3 +3423313774,189.25,7 +3423311673,160.94,13 +3423313021,185.58,3 +3423312227,171.79,6 +3423312212,179.7,9 +3423311441,193.68,8 +3423311471,195.95,4 +3423311677,169.72,6 +3423312345,207.05,13 +3423312969,205.62,11 +3423312999,166.64,11 +3423313974,168.34,9 +3423310525,185.5,13 +3423311428,191.31,7 +3423314165,140.59,3 +3423311986,188.61,9 +3423312504,167.46,11 +3423312273,186.26,4 +3423311042,188.34,18 +3423312283,157.73,12 +3423311250,175.69,16 +3423312862,134.54,10 +3423311215,190.28,10 +3423314030,168.01,3 +3423311541,188.49,11 +3423314040,164.43,12 +3423310896,186.42,14 +3423310760,201.97,17 +3423312556,201.38,1 +3423311678,196.34,9 +3423314395,171.64,16 +3423313759,183.96,6 +3423311862,192.31,8 +3423310965,180.63,12 +3423312062,194.13,11 +3423313123,175.76,5 +3423312220,163.89,8 +3423313788,164.29,12 +3423310484,170.9,15 +3423312994,166.87,15 +3423311427,164.77,6 +3423313767,184.45,12 +3423312678,191.82,7 +3423311311,183.61,14 +3423313594,171.42,4 +3423313154,186.55,8 +3423312104,200.37,6 +3423311806,166.72,9 +3423312907,192.38,10 +3423312023,151.88,4 +3423312366,196.56,1 +3423312102,172.43,11 +3423313847,169.64,9 +3423311616,177.33,1 +3423312005,156.76,15 +3423314064,165.76,5 +3423314348,176.43,11 +3423313133,159.11,16 +3423313556,186.67,15 +3423314209,169.32,10 +3423311496,177.78,11 +3423311007,180.73,1 +3423312313,202.18,1 +3423312159,159.31,10 +3423311666,166.8,10 +3423312328,177.09,7 +3423310718,170.6,3 +3423311372,190.14,13 +3423310757,201.05,9 +3423312666,174.91,14 +3423310793,168.01,15 +3423313060,152.99,6 +3423310699,145.17,15 +3423313445,174.44,9 +3423311503,181.52,14 +3423312162,185.27,9 +3423313270,176.37,11 +3423313923,159.92,15 +3423311338,150.21,1 +3423311417,152.6,15 +3423313992,197.71,9 +3423311543,174.78,4 +3423312398,156.2,4 +3423310970,174.5,6 +3423312772,210.4,10 
+3423312071,200.05,9 +3423313523,164.96,5 +3423310790,176.5,11 +3423311535,192.86,21 +3423311671,188.22,8 +3423310528,178.57,17 +3423312026,203.38,8 +3423312426,192.32,14 +3423310897,176.15,9 +3423312080,156.61,20 +3423311502,176.51,14 +3423310875,190.66,7 +3423311299,154.62,5 +3423310784,162.97,15 +3423311645,207.96,7 +3423311750,176.65,11 +3423312557,164.56,0 +3423312711,203.41,7 +3423314405,182.19,15 +3423313785,201.68,16 +3423311720,135.38,8 +3423310849,153.45,1 +3423310939,182.71,11 +3423310505,177.44,4 +3423312205,208.52,5 +3423311422,175.58,11 +3423311929,161.6,6 +3423313965,168.09,12 +3423313072,168.77,10 +3423313258,158.86,14 +3423311999,182.16,6 +3423312965,199.42,8 +3423311467,172.99,10 +3423311074,182.44,10 +3423310814,166.96,8 +3423311459,153.58,12 +3423310920,185.68,18 +3423311219,214.8,20 +3423310460,185.39,20 +3423314058,169.35,19 +3423313313,181.75,10 +3423311165,159.89,17 +3423310652,196.52,12 +3423311383,169.65,14 +3423313327,192.21,8 +3423313079,180.29,20 +3423312932,222.08,11 +3423311728,165.1,13 +3423314434,212.75,9 +3423311292,170.64,13 +3423314315,196.2,14 +3423310889,189.99,12 +3423310851,208.96,10 +3423313381,219.39,1 +3423311448,188.25,10 +3423311551,187.13,12 +3423313834,187.28,8 +3423312123,192.74,13 +3423310590,211.2,8 +3423312146,189.88,9 +3423312648,165.58,6 +3423310473,191.88,7 +3423312226,194.22,12 +3423310647,167.22,2 +3423311832,185.37,14 +3423311103,203.8,22 +3423311192,167.05,10 +3423314043,177.48,19 +3423312391,170.22,16 +3423312567,209.76,18 +3423310685,160.04,10 +3423312600,176.17,5 +3423312921,170.91,12 +3423313630,176.14,5 +3423311533,168.03,9 diff --git a/011/exercise/readme.md b/011/exercise/readme.md new file mode 100644 index 00000000..75ee2e30 --- /dev/null +++ b/011/exercise/readme.md @@ -0,0 +1,79 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. 
Drivers typically rent a battery for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on the driver's driving history. As the life of a battery depends on factors such as overspeeding, distance driven per day, etc., you, as an ML expert, have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +Drivers will be incentivised based on the cluster, so grouping has to be accurate. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled datasets. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- The inputs to the Κ-means clustering algorithm are the number of clusters Κ and the data set. The algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. 
Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, and thus decrease this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 1 to 10), plot the K values against SSE (Sum of Squared Errors), and select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 2..n for n_clusters in KMeans (the Silhouette Coefficient is undefined for a single cluster) and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +The K-Means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity. + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/011/solution/k-means-clustering.ipynb b/011/solution/k-means-clustering.ipynb new file mode 100644 index 00000000..7ae43a78 --- /dev/null +++ b/011/solution/k-means-clustering.ipynb @@ -0,0 +1,835 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | mean_dist_day | \n", + "mean_over_speed_perc | \n", + "
|---|---|---|
| id | \n", + "\n", + " | \n", + " |
| 3423311935 | \n", + "71.24 | \n", + "28 | \n", + "
| 3423313212 | \n", + "52.53 | \n", + "25 | \n", + "
| 3423313724 | \n", + "64.54 | \n", + "27 | \n", + "
| 3423311373 | \n", + "55.69 | \n", + "22 | \n", + "
| 3423310999 | \n", + "54.58 | \n", + "25 | \n", + "
| \n", + " | mean_dist_day | \n", + "mean_over_speed_perc | \n", + "cluster | \n", + "
|---|---|---|---|
| id | \n", + "\n", + " | \n", + " | \n", + " |
| 3423311935 | \n", + "71.24 | \n", + "28 | \n", + "3 | \n", + "
| 3423313212 | \n", + "52.53 | \n", + "25 | \n", + "3 | \n", + "
| 3423313724 | \n", + "64.54 | \n", + "27 | \n", + "3 | \n", + "
| 3423311373 | \n", + "55.69 | \n", + "22 | \n", + "3 | \n", + "
| 3423310999 | \n", + "54.58 | \n", + "25 | \n", + "3 | \n", + "
| 3423313857 | \n", + "41.91 | \n", + "10 | \n", + "0 | \n", + "
| 3423312432 | \n", + "58.64 | \n", + "20 | \n", + "3 | \n", + "
| 3423311434 | \n", + "52.02 | \n", + "8 | \n", + "0 | \n", + "
| 3423311328 | \n", + "31.25 | \n", + "34 | \n", + "3 | \n", + "
| 3423312488 | \n", + "44.31 | \n", + "19 | \n", + "3 | \n", + "
| 3423311254 | \n", + "49.35 | \n", + "40 | \n", + "3 | \n", + "
| 3423312943 | \n", + "58.07 | \n", + "45 | \n", + "3 | \n", + "
| 3423312536 | \n", + "44.22 | \n", + "22 | \n", + "3 | \n", + "
| 3423311542 | \n", + "55.73 | \n", + "19 | \n", + "3 | \n", + "
| 3423312176 | \n", + "46.63 | \n", + "43 | \n", + "3 | \n", + "
| 3423314176 | \n", + "52.97 | \n", + "32 | \n", + "3 | \n", + "
| 3423314202 | \n", + "46.25 | \n", + "35 | \n", + "3 | \n", + "
| 3423311346 | \n", + "51.55 | \n", + "27 | \n", + "3 | \n", + "
| 3423310666 | \n", + "57.05 | \n", + "26 | \n", + "3 | \n", + "
| 3423313527 | \n", + "58.45 | \n", + "30 | \n", + "3 | \n", + "
| 3423312182 | \n", + "43.42 | \n", + "23 | \n", + "3 | \n", + "
| 3423313590 | \n", + "55.68 | \n", + "37 | \n", + "3 | \n", + "
| 3423312268 | \n", + "55.15 | \n", + "18 | \n", + "0 | \n", + "
| 3423314255 | \n", + "43.84 | \n", + "22 | \n", + "3 | \n", + "
| 3423311976 | \n", + "59.26 | \n", + "32 | \n", + "3 | \n", + "
| 3423312669 | \n", + "37.14 | \n", + "41 | \n", + "3 | \n", + "
| 3423310697 | \n", + "64.30 | \n", + "29 | \n", + "3 | \n", + "
| 3423312113 | \n", + "45.75 | \n", + "16 | \n", + "0 | \n", + "
| 3423313343 | \n", + "45.97 | \n", + "23 | \n", + "3 | \n", + "
| 3423311431 | \n", + "56.04 | \n", + "39 | \n", + "3 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 3423313079 | \n", + "180.29 | \n", + "20 | \n", + "1 | \n", + "
| 3423312932 | \n", + "222.08 | \n", + "11 | \n", + "1 | \n", + "
| 3423311728 | \n", + "165.10 | \n", + "13 | \n", + "1 | \n", + "
| 3423314434 | \n", + "212.75 | \n", + "9 | \n", + "1 | \n", + "
| 3423311292 | \n", + "170.64 | \n", + "13 | \n", + "1 | \n", + "
| 3423314315 | \n", + "196.20 | \n", + "14 | \n", + "1 | \n", + "
| 3423310889 | \n", + "189.99 | \n", + "12 | \n", + "1 | \n", + "
| 3423310851 | \n", + "208.96 | \n", + "10 | \n", + "1 | \n", + "
| 3423313381 | \n", + "219.39 | \n", + "1 | \n", + "1 | \n", + "
| 3423311448 | \n", + "188.25 | \n", + "10 | \n", + "1 | \n", + "
| 3423311551 | \n", + "187.13 | \n", + "12 | \n", + "1 | \n", + "
| 3423313834 | \n", + "187.28 | \n", + "8 | \n", + "1 | \n", + "
| 3423312123 | \n", + "192.74 | \n", + "13 | \n", + "1 | \n", + "
| 3423310590 | \n", + "211.20 | \n", + "8 | \n", + "1 | \n", + "
| 3423312146 | \n", + "189.88 | \n", + "9 | \n", + "1 | \n", + "
| 3423312648 | \n", + "165.58 | \n", + "6 | \n", + "1 | \n", + "
| 3423310473 | \n", + "191.88 | \n", + "7 | \n", + "1 | \n", + "
| 3423312226 | \n", + "194.22 | \n", + "12 | \n", + "1 | \n", + "
| 3423310647 | \n", + "167.22 | \n", + "2 | \n", + "1 | \n", + "
| 3423311832 | \n", + "185.37 | \n", + "14 | \n", + "1 | \n", + "
| 3423311103 | \n", + "203.80 | \n", + "22 | \n", + "1 | \n", + "
| 3423311192 | \n", + "167.05 | \n", + "10 | \n", + "1 | \n", + "
| 3423314043 | \n", + "177.48 | \n", + "19 | \n", + "1 | \n", + "
| 3423312391 | \n", + "170.22 | \n", + "16 | \n", + "1 | \n", + "
| 3423312567 | \n", + "209.76 | \n", + "18 | \n", + "1 | \n", + "
| 3423310685 | \n", + "160.04 | \n", + "10 | \n", + "1 | \n", + "
| 3423312600 | \n", + "176.17 | \n", + "5 | \n", + "1 | \n", + "
| 3423312921 | \n", + "170.91 | \n", + "12 | \n", + "1 | \n", + "
| 3423313630 | \n", + "176.14 | \n", + "5 | \n", + "1 | \n", + "
| 3423311533 | \n", + "168.03 | \n", + "9 | \n", + "1 | \n", + "
4000 rows × 3 columns
\n", + "