diff --git a/.history/011/k_means_beginner_20221007003622.pynb b/.history/011/k_means_beginner_20221007003622.pynb new file mode 100644 index 00000000..e69de29b diff --git a/.history/011/k_means_beginner_20221007003708.pynb b/.history/011/k_means_beginner_20221007003708.pynb new file mode 100644 index 00000000..08066618 --- /dev/null +++ b/.history/011/k_means_beginner_20221007003708.pynb @@ -0,0 +1,9 @@ +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) +import os +import matplotlib.pyplot as plt +from sklearn import cluster +from sklearn import preprocessing +import plotly.express as px +from sklearn.datasets import make_blobs +plt.style.use('dark_background') \ No newline at end of file diff --git a/.history/011/k_means_beginner_20221007003710.pynb b/.history/011/k_means_beginner_20221007003710.pynb new file mode 100644 index 00000000..283332ac --- /dev/null +++ b/.history/011/k_means_beginner_20221007003710.pynb @@ -0,0 +1,9 @@ +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) +import os +import matplotlib.pyplot as plt +from sklearn import cluster +from sklearn import preprocessing +import plotly.express as px +from sklearn.datasets import make_blobs +plt.style.use('dark_background') diff --git a/.history/011/k_means_beginner_20221007003736.pynb b/.history/011/k_means_beginner_20221007003736.pynb new file mode 100644 index 00000000..a5c5fe04 --- /dev/null +++ b/.history/011/k_means_beginner_20221007003736.pynb @@ -0,0 +1,10 @@ +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) +import os +import matplotlib.pyplot as plt +from sklearn import cluster +from sklearn import preprocessing +import plotly.express as px +from sklearn.datasets import make_blobs +plt.style.use('dark_background') + diff --git a/.history/011/readme_20221007002635.md b/.history/011/readme_20221007002635.md new file mode 100644 index 00000000..e69de29b diff --git a/.history/011/readme_20221007002656.md b/.history/011/readme_20221007002656.md new file mode 100644 index 00000000..704d8c12 --- /dev/null +++ b/.history/011/readme_20221007002656.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement k-NN from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how kNN works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. +- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. 
Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002718.md b/.history/011/readme_20221007002718.md new file mode 100644 index 00000000..19463c97 --- /dev/null +++ b/.history/011/readme_20221007002718.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement k-Means from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how kNN works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. +- Implementing KD trees to understand information retrieval. 
Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. 
If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002722.md b/.history/011/readme_20221007002722.md new file mode 100644 index 00000000..1fdda68e --- /dev/null +++ b/.history/011/readme_20221007002722.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement k-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how kNN works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. +- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. 
+- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002725.md b/.history/011/readme_20221007002725.md new file mode 100644 index 00000000..e993b549 --- /dev/null +++ b/.history/011/readme_20221007002725.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. 
+The basic implementation is done for understanding. + +# Objective + +To understand how kNN works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. +- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. 
If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002732.md b/.history/011/readme_20221007002732.md new file mode 100644 index 00000000..7775e8e5 --- /dev/null +++ b/.history/011/readme_20221007002732.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. +- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. 
+- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002734.md b/.history/011/readme_20221007002734.md new file mode 100644 index 00000000..4f73b357 --- /dev/null +++ b/.history/011/readme_20221007002734.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. 
+The basic implementation is done for understanding. + +# Objective + +To understand how kmeans works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. +- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. 
If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002736.md b/.history/011/readme_20221007002736.md new file mode 100644 index 00000000..222f93e5 --- /dev/null +++ b/.history/011/readme_20221007002736.md @@ -0,0 +1,45 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +- Extend the algorithm for Distance-weighted kNN classification using appropriate dataset. +- Extend the algorithm for regression using appropriate dataset. +- Extend the algorithm with appropriate dataset. +- Implementing KD trees to understand information retrieval. Visit [this](https://www.analyticsvidhya.com/blog/2017/11/information-retrieval-using-kdtree/) site for dataset and references. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. 
+- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007002824.md b/.history/011/readme_20221007002824.md new file mode 100644 index 00000000..4a6e2e37 --- /dev/null +++ b/.history/011/readme_20221007002824.md @@ -0,0 +1,41 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. 
+The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. 
If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003007.md b/.history/011/readme_20221007003007.md new file mode 100644 index 00000000..67f375ae --- /dev/null +++ b/.history/011/readme_20221007003007.md @@ -0,0 +1,41 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. 
+ +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003010.md b/.history/011/readme_20221007003010.md new file mode 100644 index 00000000..1277294b --- /dev/null +++ b/.history/011/readme_20221007003010.md @@ -0,0 +1,42 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. 
+ +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# k-NN Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. + +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. 
If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003020.md b/.history/011/readme_20221007003020.md new file mode 100644 index 00000000..6e69286e --- /dev/null +++ b/.history/011/readme_20221007003020.md @@ -0,0 +1,42 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-nearest neighbors (KNN) algorithm is a type of supervised ML algorithm which can be used for both classification as well as regression predictive problems. +The following two properties would define KNN +well − +- Lazy learning algorithm − KNN is a lazy learning algorithm because it does not have a specialized training phase and uses all the data for training while classification. +- Non-parametric learning algorithm − KNN is also a non-parametric learning algorithm because it doesn’t assume anything about the underlying data. 
+ +K-nearest neighbors (KNN) algorithm uses ‘feature similarity’ to predict the values of new datapoints which further means that the new data point will be assigned a value based on how closely it matches the points in the training set. + +1. Load the data +2. Initialize K to your chosen number of neighbors +3. For each example in the data + - Calculate the distance between the query example and the current example from the data. + - Add the distance and the index of the example to an ordered collection +4. Sort the ordered collection of distances and indices from smallest to largest (in ascending order) by the distances +5. Pick the first K entries from the sorted collection 6. Get the labels of the selected K entries +6. If regression, return the mean of the K labels +7. If classification, return the mode of the K labels + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003034.md b/.history/011/readme_20221007003034.md new file mode 100644 index 00000000..a6a428b7 --- /dev/null +++ b/.history/011/readme_20221007003034.md @@ -0,0 +1,24 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. 
+ +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003105.md b/.history/011/readme_20221007003105.md new file mode 100644 index 00000000..7ebbec21 --- /dev/null +++ b/.history/011/readme_20221007003105.md @@ -0,0 +1,94 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +The centroids of the K clusters, which can be used to label new data +Labels for the training data (each data point is assigned to a single cluster) +K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003116.md b/.history/011/readme_20221007003116.md new file mode 100644 index 00000000..c2b93697 --- /dev/null +++ b/.history/011/readme_20221007003116.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +The centroids of the K clusters, which can be used to label new data +Labels for the training data (each data point is assigned to a single cluster) +K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003119.md b/.history/011/readme_20221007003119.md new file mode 100644 index 00000000..c2b93697 --- /dev/null +++ b/.history/011/readme_20221007003119.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +The centroids of the K clusters, which can be used to label new data +Labels for the training data (each data point is assigned to a single cluster) +K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003122.md b/.history/011/readme_20221007003122.md new file mode 100644 index 00000000..4dd799e0 --- /dev/null +++ b/.history/011/readme_20221007003122.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +Labels for the training data (each data point is assigned to a single cluster) +K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003124.md b/.history/011/readme_20221007003124.md new file mode 100644 index 00000000..33694761 --- /dev/null +++ b/.history/011/readme_20221007003124.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003126.md b/.history/011/readme_20221007003126.md new file mode 100644 index 00000000..f5b50a5d --- /dev/null +++ b/.history/011/readme_20221007003126.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003130.md b/.history/011/readme_20221007003130.md new file mode 100644 index 00000000..76733a18 --- /dev/null +++ b/.history/011/readme_20221007003130.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003131.md b/.history/011/readme_20221007003131.md new file mode 100644 index 00000000..a4f6ffeb --- /dev/null +++ b/.history/011/readme_20221007003131.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value +converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003134.md b/.history/011/readme_20221007003134.md new file mode 100644 index 00000000..c2b6363d --- /dev/null +++ b/.history/011/readme_20221007003134.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003140.md b/.history/011/readme_20221007003140.md new file mode 100644 index 00000000..dda0c952 --- /dev/null +++ b/.history/011/readme_20221007003140.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +#Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003142.md b/.history/011/readme_20221007003142.md new file mode 100644 index 00000000..8c704952 --- /dev/null +++ b/.history/011/readme_20221007003142.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10), and for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 10 to 1) and plot the K values against SSE (Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +a: The mean distance between a sample and all other points in the same class. +b: The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient for a single sample is then given as: + +$s = \frac{b - a}{\max(a, b)}$ + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean Distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher is the Cosine Similarity: +$\cos\theta = \frac{\vec{a} \cdot \vec{b}}{\lVert\vec{a}\rVert \, \lVert\vec{b}\rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2} \, \sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a} \cdot \vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance : is the total sum of the differences between the x-coordinates and y-coordinates. 
+$\text{Manhattan distance} = |x_1 - x_2| + |y_1 - y_2|$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the $\ell_2$ norm (...) + +Computing the sum of absolutes (MAE) corresponds to the $\ell_1$ norm (...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003148.md b/.history/011/readme_20221007003148.md new file mode 100644 index 00000000..5b7f1851 --- /dev/null +++ b/.history/011/readme_20221007003148.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +-Document Classification +Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003150.md b/.history/011/readme_20221007003150.md new file mode 100644 index 00000000..8a676dd3 --- /dev/null +++ b/.history/011/readme_20221007003150.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +-Document Classification +-Delivery Store Optimization +Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003152.md b/.history/011/readme_20221007003152.md new file mode 100644 index 00000000..3bbf3d53 --- /dev/null +++ b/.history/011/readme_20221007003152.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +-Document Classification +-Delivery Store Optimization +- Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003153.md b/.history/011/readme_20221007003153.md new file mode 100644 index 00000000..10660327 --- /dev/null +++ b/.history/011/readme_20221007003153.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +-Document Classification + -Delivery Store Optimization +- Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003155.md b/.history/011/readme_20221007003155.md new file mode 100644 index 00000000..f6f9838f --- /dev/null +++ b/.history/011/readme_20221007003155.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +-The centroids of the K clusters, which can be used to label new data +-Labels for the training data (each data point is assigned to a single cluster) +-K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003158.md b/.history/011/readme_20221007003158.md new file mode 100644 index 00000000..d82ae2c7 --- /dev/null +++ b/.history/011/readme_20221007003158.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003202.md b/.history/011/readme_20221007003202.md new file mode 100644 index 00000000..2df83ad9 --- /dev/null +++ b/.history/011/readme_20221007003202.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. +Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003206.md b/.history/011/readme_20221007003206.md new file mode 100644 index 00000000..2cb2e75a --- /dev/null +++ b/.history/011/readme_20221007003206.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. +# Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003207.md b/.history/011/readme_20221007003207.md new file mode 100644 index 00000000..74d902a6 --- /dev/null +++ b/.history/011/readme_20221007003207.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003209.md b/.history/011/readme_20221007003209.md new file mode 100644 index 00000000..f6ca7c7b --- /dev/null +++ b/.history/011/readme_20221007003209.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003211.md b/.history/011/readme_20221007003211.md new file mode 100644 index 00000000..32e73b65 --- /dev/null +++ b/.history/011/readme_20221007003211.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003214.md b/.history/011/readme_20221007003214.md new file mode 100644 index 00000000..b69c631e --- /dev/null +++ b/.history/011/readme_20221007003214.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: +- +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003216.md b/.history/011/readme_20221007003216.md new file mode 100644 index 00000000..32e73b65 --- /dev/null +++ b/.history/011/readme_20221007003216.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assigment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003220.md b/.history/011/readme_20221007003220.md new file mode 100644 index 00000000..92cc61a2 --- /dev/null +++ b/.history/011/readme_20221007003220.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003227.md b/.history/011/readme_20221007003227.md new file mode 100644 index 00000000..92cc61a2 --- /dev/null +++ b/.history/011/readme_20221007003227.md @@ -0,0 +1,97 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on + +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003230.md b/.history/011/readme_20221007003230.md new file mode 100644 index 00000000..f8268982 --- /dev/null +++ b/.history/011/readme_20221007003230.md @@ -0,0 +1,96 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on +minci∈Cdist(ci,x)2 + +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003232.md b/.history/011/readme_20221007003232.md new file mode 100644 index 00000000..db9e1bd8 --- /dev/null +++ b/.history/011/readme_20221007003232.md @@ -0,0 +1,95 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. If ci is the collection of centroids in set C, then each data point x is assigned to a cluster based on +minci∈Cdist(ci,x)2 +where dist( · ) is the standard (L2) Euclidean distance. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. 
Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 
+ +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003250.md b/.history/011/readme_20221007003250.md new file mode 100644 index 00000000..65b55227 --- /dev/null +++ b/.history/011/readme_20221007003250.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. 
Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003255.md b/.history/011/readme_20221007003255.md new file mode 100644 index 00000000..df87f7e6 --- /dev/null +++ b/.history/011/readme_20221007003255.md @@ -0,0 +1,92 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003257.md b/.history/011/readme_20221007003257.md new file mode 100644 index 00000000..65b55227 --- /dev/null +++ b/.history/011/readme_20221007003257.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. 
+ +Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. 
Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003307.md b/.history/011/readme_20221007003307.md new file mode 100644 index 00000000..5c10e22f --- /dev/null +++ b/.history/011/readme_20221007003307.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. 
+ +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. 
+ +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+ +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003318.md b/.history/011/readme_20221007003318.md new file mode 100644 index 00000000..715da329 --- /dev/null +++ b/.history/011/readme_20221007003318.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +4. Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. 
Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003328.md b/.history/011/readme_20221007003328.md new file mode 100644 index 00000000..68ed1d62 --- /dev/null +++ b/.history/011/readme_20221007003328.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +3.2. Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003330.md b/.history/011/readme_20221007003330.md new file mode 100644 index 00000000..1f7312d3 --- /dev/null +++ b/.history/011/readme_20221007003330.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. 
Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +3.2 Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. 
Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003332.md b/.history/011/readme_20221007003332.md new file mode 100644 index 00000000..2fe086b5 --- /dev/null +++ b/.history/011/readme_20221007003332.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. 
+ +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. 
+ +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +3. Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+ +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003334.md b/.history/011/readme_20221007003334.md new file mode 100644 index 00000000..5bd7bf56 --- /dev/null +++ b/.history/011/readme_20221007003334.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + + Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. 
Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003336.md b/.history/011/readme_20221007003336.md new file mode 100644 index 00000000..5cbffc21 --- /dev/null +++ b/.history/011/readme_20221007003336.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 10 to 1) and plot the K values against SSE (Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient for a single sample is then given as: + +$s = \frac{b - a}{\max(a, b)}$ + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate the Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +The K-Means algorithm uses Euclidean Distance; other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. The closer the point vectors are by angle, the higher the Cosine Similarity +$\cos\theta = \frac{\vec{a}\cdot\vec{b}}{\lVert\vec{a}\rVert\,\lVert\vec{b}\rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2}\,\sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a}\cdot\vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates.
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003340.md b/.history/011/readme_20221007003340.md new file mode 100644 index 00000000..29b8eb26 --- /dev/null +++ b/.history/011/readme_20221007003340.md @@ -0,0 +1,93 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. 
Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +The Silhouette Coefficient is for a single sample is then given as: + +s=b−amax(a,b) + +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. + +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. 
The closer the point vectors are by angle, the higher the Cosine Similarity +$\cos\theta = \frac{\vec{a}\cdot\vec{b}}{\lVert\vec{a}\rVert\,\lVert\vec{b}\rVert} = \frac{\sum_{i=1}^{n} a_i b_i}{\sqrt{\sum_{i=1}^{n} a_i^2}\,\sqrt{\sum_{i=1}^{n} b_i^2}}$ + +where $\vec{a}\cdot\vec{b} = \sum_{i=1}^{n} a_i b_i = a_1 b_1 + a_2 b_2 + \dots + a_n b_n$ +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +$\text{ManhattanDistance} = |x_1 - x_2| + |y_1 - y_2|$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm (...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm (...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003349.md b/.history/011/readme_20221007003349.md new file mode 100644 index 00000000..24a1d754 --- /dev/null +++ b/.history/011/readme_20221007003349.md @@ -0,0 +1,89 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding.
+ +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. 
+ +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+ +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003351.md b/.history/011/readme_20221007003351.md new file mode 100644 index 00000000..f3cb192c --- /dev/null +++ b/.history/011/readme_20221007003351.md @@ -0,0 +1,88 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. + +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. 
Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003352.md b/.history/011/readme_20221007003352.md new file mode 100644 index 00000000..0b83c686 --- /dev/null +++ b/.history/011/readme_20221007003352.md @@ -0,0 +1,87 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. 
The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . 
+ +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. 
+ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003402.md b/.history/011/readme_20221007003402.md new file mode 100644 index 00000000..a1993708 --- /dev/null +++ b/.history/011/readme_20221007003402.md @@ -0,0 +1,87 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. 
+ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. 
Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. 
Closer the point vectors are by angle, the higher is the Cosine Similarity +cosθ=a→.b→∥a→∥∥b→∥=∑ni=1aibi∑ni=1a2i∑ni=1b2i−−−−−−√−−−−−−−−−−−−−−√ + +where a→.b→=∑ni=1aibi=a1b1+a2b2+...+anbn +Manhattan distance : is the total sum of the difference between the x-coordinates and y-coordinates. +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003416.md b/.history/011/readme_20221007003416.md new file mode 100644 index 00000000..8aece63e --- /dev/null +++ b/.history/011/readme_20221007003416.md @@ -0,0 +1,83 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. 
+ +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. 
+ +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +The K-Means algorithm uses Euclidean Distance; other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity +$\text{ManhattanDistance} = |x_1 - x_2| + |y_1 - y_2|$ + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003418.md b/.history/011/readme_20221007003418.md new file mode 100644 index 00000000..f5cbd817 --- /dev/null +++ b/.history/011/readme_20221007003418.md @@ -0,0 +1,84 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003419.md b/.history/011/readme_20221007003419.md new file mode 100644 index 00000000..3c0c8c19 --- /dev/null +++ b/.history/011/readme_20221007003419.md @@ -0,0 +1,84 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + +Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003423.md b/.history/011/readme_20221007003423.md new file mode 100644 index 00000000..8892a874 --- /dev/null +++ b/.history/011/readme_20221007003423.md @@ -0,0 +1,83 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: + +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003425.md b/.history/011/readme_20221007003425.md new file mode 100644 index 00000000..b5296fef --- /dev/null +++ b/.history/011/readme_20221007003425.md @@ -0,0 +1,82 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) + +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003427.md b/.history/011/readme_20221007003427.md new file mode 100644 index 00000000..f57c966c --- /dev/null +++ b/.history/011/readme_20221007003427.md @@ -0,0 +1,81 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003430.md b/.history/011/readme_20221007003430.md new file mode 100644 index 00000000..77bc7d22 --- /dev/null +++ b/.history/011/readme_20221007003430.md @@ -0,0 +1,82 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003506.md b/.history/011/readme_20221007003506.md new file mode 100644 index 00000000..feab2e8b --- /dev/null +++ b/.history/011/readme_20221007003506.md @@ -0,0 +1,82 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003510.md b/.history/011/readme_20221007003510.md new file mode 100644 index 00000000..4b98d3c4 --- /dev/null +++ b/.history/011/readme_20221007003510.md @@ -0,0 +1,83 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means +`knn_starter_exercise.ipynb` + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003513.md b/.history/011/readme_20221007003513.md new file mode 100644 index 00000000..d53fbb4c --- /dev/null +++ b/.history/011/readme_20221007003513.md @@ -0,0 +1,82 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) +[![View in nbviewer](https://github.com/jupyter/design/blob/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.jupyter.org/github/gimseng/99-ML-Learning-Projects/blob/master/010/exercise/knn_starter_exercise.ipynb) + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003531.md b/.history/011/readme_20221007003531.md new file mode 100644 index 00000000..01e37d56 --- /dev/null +++ b/.history/011/readme_20221007003531.md @@ -0,0 +1,79 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. 
The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). 
+ +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). 
It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.tutorialspoint.com/machine_learning_with_python/machine_learning_with_python_knn_algorithm_finding_nearest_neighbors.htm diff --git a/.history/011/readme_20221007003538.md b/.history/011/readme_20221007003538.md new file mode 100644 index 00000000..d84789a2 --- /dev/null +++ b/.history/011/readme_20221007003538.md @@ -0,0 +1,79 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. 
+ +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using the Elbow Criterion, Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k=1 to 10), and for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. 
+ +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+ +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- \ No newline at end of file diff --git a/.history/011/readme_20221007003554.md b/.history/011/readme_20221007003554.md new file mode 100644 index 00000000..d476960f --- /dev/null +++ b/.history/011/readme_20221007003554.md @@ -0,0 +1,79 @@ +# Problem Statement + +The aim of the exercise is to implement K-Means Clustering from scratch. +The basic implementation is done for understanding. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. 
The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. 
+The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005746.md b/.history/011/readme_20221007005746.md new file mode 100644 index 00000000..be06c35d --- /dev/null +++ b/.history/011/readme_20221007005746.md @@ -0,0 +1,79 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. 
Drivers rent battery typically for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on driver's driving history. As the life of a battery depends on factors such as overspeeding, distance driven per day etc.You as a ML expert have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. 
In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. 
+To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. +A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005749.md b/.history/011/readme_20221007005749.md new file mode 100644 index 00000000..16b1f172 --- /dev/null +++ b/.history/011/readme_20221007005749.md @@ -0,0 +1,78 @@ +# Problem Statement +Business challenge/requirement +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. Drivers rent battery typically for a day and then replace it with a charged battery from the company. 
Lithionpower has a variable pricing model based on driver's driving history. As the life of a battery depends on factors such as overspeeding, distance driven per day etc.You as a ML expert have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. 
Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005754.md b/.history/011/readme_20221007005754.md new file mode 100644 index 00000000..be06c35d --- /dev/null +++ b/.history/011/readme_20221007005754.md @@ -0,0 +1,79 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. Drivers rent battery typically for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on driver's driving history. 
As the life of a battery depends on factors such as overspeeding, distance driven per day etc.You as a ML expert have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +This input file contains the basic information (ID, age, gender, income, spending score) about the customers of a mall. Spending Score is something you assign to the customer based on your defined parameters like customer behavior and purchasing data. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. 
Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005813.md b/.history/011/readme_20221007005813.md new file mode 100644 index 00000000..e9efce1e --- /dev/null +++ b/.history/011/readme_20221007005813.md @@ -0,0 +1,78 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. Drivers rent battery typically for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on driver's driving history. 
As the life of a battery depends on factors such as overspeeding, distance driven per day etc.You as a ML expert have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +Drivers will be incentivized based on the cluster, so grouping has to be accurate +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. 
+ +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005815.md b/.history/011/readme_20221007005815.md new file mode 100644 index 00000000..e3d1322f --- /dev/null +++ b/.history/011/readme_20221007005815.md @@ -0,0 +1,78 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. Drivers rent battery typically for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on driver's driving history. 
As the life of a battery depends on factors such as overspeeding, distance driven per day etc.You as a ML expert have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +Drivers will be incentivized based on the cluster, so grouping has to be accurate/ +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. 
+ +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005817.md b/.history/011/readme_20221007005817.md new file mode 100644 index 00000000..8c239492 --- /dev/null +++ b/.history/011/readme_20221007005817.md @@ -0,0 +1,79 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. Drivers rent battery typically for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on driver's driving history. 
As the life of a battery depends on factors such as overspeeding, distance driven per day etc.You as a ML expert have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +Drivers will be incentivized based on the cluster, so grouping has to be accurate. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. 
+ +The algorithm iterates between step one and two until a stopping criteria is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion , Silhouette Coefficient , cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. . + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Eucledean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidian norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/.history/011/readme_20221007005822.md b/.history/011/readme_20221007005822.md new file mode 100644 index 00000000..75ee2e30 --- /dev/null +++ b/.history/011/readme_20221007005822.md @@ -0,0 +1,79 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. Drivers rent battery typically for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on driver's driving history. 
As the life of a battery depends on factors such as overspeeding, distance driven per day etc. You as an ML expert have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +Drivers will be incentivised based on the cluster, so grouping has to be accurate. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm : +- Κ-means clustering algorithm inputs are the number of clusters Κ and the data set. Algorithm starts with initial estimates for the Κ centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point based on the squared Euclidean distance is assigned to its nearest centroid. + +2. Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. 
+ +The algorithm iterates between step one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-Means clustering can be evaluated using Elbow Criterion, Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g k=1 to 10), for each value of k, calculate sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters(K) will always reduce the distance to data points, thus decrease this metric, to the extreme of reaching zero when K is as same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K(say K = 10 to 1) and plot the K values against SSE(Sum of Squared Errors). And select the value of K for the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +K-Means algorithm uses Euclidean Distance, other popular distance metrics in Machine Learning are: + +- Cosine distance : It determines the cosine of the angle between the point vectors of the two points in the n dimensional space. Closer the point vectors are by angle, the higher is the Cosine Similarity + +- ManhattanDistance=|x1–x2|+|y1–y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm(...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm,(...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. 
+ +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/006/solution/ensemble_techniques.ipynb b/006/solution/ensemble_techniques.ipynb index 076959ea..08da2d51 100644 --- a/006/solution/ensemble_techniques.ipynb +++ b/006/solution/ensemble_techniques.ipynb @@ -1,6 +1,6 @@ { "cells": [ - { + { "cell_type": "markdown", "metadata": {}, "source": [ @@ -730,7 +730,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.7.4 ('base')", "language": "python", "name": "python3" }, @@ -744,7 +744,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.2" + "version": "3.7.4" + }, + "vscode": { + "interpreter": { + "hash": "b1e6b76b6e736d29445d5c5f779c1dafb0f59893c5766b7198bc0a87a8e7acf4" + } } }, "nbformat": 4, diff --git a/008/solution/NaiveBayes Solution.ipynb b/008/solution/NaiveBayes Solution.ipynb index 4380749b..5edf37f4 100644 --- a/008/solution/NaiveBayes Solution.ipynb +++ b/008/solution/NaiveBayes Solution.ipynb @@ -543,7 +543,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.7.4 ('base')", "language": "python", "name": "python3" }, @@ -557,7 +557,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.3" + "version": "3.7.4" + }, + "vscode": { + "interpreter": { + "hash": "b1e6b76b6e736d29445d5c5f779c1dafb0f59893c5766b7198bc0a87a8e7acf4" + } } }, "nbformat": 4, diff --git a/011/data/driver-data.csv b/011/data/driver-data.csv new file mode 100644 index 00000000..2310c9f5 --- /dev/null +++ b/011/data/driver-data.csv @@ -0,0 +1,4001 @@ +id,mean_dist_day,mean_over_speed_perc +3423311935,71.24,28 +3423313212,52.53,25 +3423313724,64.54,27 +3423311373,55.69,22 +3423310999,54.58,25 
+3423313857,41.91,10 +3423312432,58.64,20 +3423311434,52.02,8 +3423311328,31.25,34 +3423312488,44.31,19 +3423311254,49.35,40 +3423312943,58.07,45 +3423312536,44.22,22 +3423311542,55.73,19 +3423312176,46.63,43 +3423314176,52.97,32 +3423314202,46.25,35 +3423311346,51.55,27 +3423310666,57.05,26 +3423313527,58.45,30 +3423312182,43.42,23 +3423313590,55.68,37 +3423312268,55.15,18 +3423314255,43.84,22 +3423311976,59.26,32 +3423312669,37.14,41 +3423310697,64.3,29 +3423312113,45.75,16 +3423313343,45.97,23 +3423311431,56.04,39 +3423310755,33.64,45 +3423311821,41.67,33 +3423314359,50.68,39 +3423313106,54.22,35 +3423310754,56.2,29 +3423310524,46.16,41 +3423311780,50.22,24 +3423312156,49.66,33 +3423312916,38.61,37 +3423310588,55.28,36 +3423312995,57.87,41 +3423313389,61.69,12 +3423311369,37.41,21 +3423311408,53.83,32 +3423311598,62.98,22 +3423312047,46.97,13 +3423312322,58.03,24 +3423313247,59.87,36 +3423310944,81.34,31 +3423312404,48.56,26 +3423313738,17.66,23 +3423311461,46.01,21 +3423313866,45.34,26 +3423312074,39.64,31 +3423312444,51.22,36 +3423311834,36.21,31 +3423311527,47.32,55 +3423310476,54.87,27 +3423310548,58.97,6 +3423311011,57.3,42 +3423310633,59.94,32 +3423310595,61.72,25 +3423312757,53.19,23 +3423313776,35.13,38 +3423312067,48.47,20 +3423312235,51.17,43 +3423310893,42.14,39 +3423314121,54.31,35 +3423313750,48.93,32 +3423312776,53.51,44 +3423312927,51.72,24 +3423310765,49.86,10 +3423311457,49.2,41 +3423310678,60.2,39 +3423312564,54.06,20 +3423313058,72.91,30 +3423310803,61.92,40 +3423312166,44.2,24 +3423312608,60.75,39 +3423310646,68.36,37 +3423314440,56.39,20 +3423312301,38.19,15 +3423311400,66.19,27 +3423313288,58.2,17 +3423314357,47.55,22 +3423311015,37.62,16 +3423312270,17.81,26 +3423313457,31.25,29 +3423311768,61,41 +3423314289,40.98,33 +3423310618,53.69,24 +3423311628,51.46,9 +3423313173,64.57,29 +3423310552,53.79,23 +3423314153,64.28,34 +3423313814,38.41,24 +3423312466,58.89,42 +3423314103,44.17,33 +3423311257,50.41,43 +3423311814,63.63,28 
+3423311017,45.78,24 +3423311127,55.61,20 +3423311066,67.23,5 +3423313316,49.69,38 +3423313648,54.6,30 +3423313558,40.94,59 +3423313353,43.84,36 +3423312285,56.36,19 +3423310853,69.53,23 +3423313991,51.48,19 +3423312791,52.93,41 +3423311030,55.52,8 +3423310575,52.14,23 +3423311959,62.86,21 +3423314029,37.53,33 +3423313843,63.47,25 +3423310628,62.11,33 +3423311869,43.52,25 +3423312133,47.97,27 +3423314085,42.27,37 +3423310504,56.05,30 +3423311462,62.08,37 +3423310974,35.25,53 +3423311980,39.81,25 +3423312931,60.27,33 +3423313867,53.41,22 +3423313838,56.28,27 +3423312956,53.31,25 +3423312985,55.24,30 +3423311863,44.97,29 +3423313131,40.3,43 +3423313166,47.18,42 +3423313841,46.4,34 +3423312477,35.11,15 +3423313132,54.33,38 +3423311934,54.59,64 +3423311470,61.67,50 +3423314234,52.39,19 +3423313633,52.37,6 +3423312165,40.84,25 +3423314381,27.42,27 +3423312732,44.79,31 +3423312525,59.84,37 +3423313793,36,45 +3423313029,47.64,10 +3423313920,51.85,37 +3423311655,59.73,15 +3423311576,42.96,37 +3423313408,51.84,27 +3423313342,59.62,23 +3423312729,58.82,30 +3423311987,50.93,30 +3423313012,41.35,23 +3423310573,24.58,45 +3423311451,57.74,22 +3423313032,38.14,33 +3423312160,51.82,20 +3423312397,41.37,17 +3423311095,55.01,24 +3423312803,64.56,53 +3423311951,46.09,35 +3423313818,57.81,20 +3423311313,46.31,23 +3423311389,66.62,37 +3423312703,42.89,17 +3423314199,40.82,23 +3423312834,50.55,41 +3423312795,40.43,40 +3423312077,53.36,25 +3423314379,44.83,27 +3423310461,35.87,35 +3423312709,46.95,23 +3423312808,47.75,20 +3423311713,57.12,26 +3423312819,38.03,43 +3423314401,54.31,45 +3423311992,50.29,39 +3423311164,33.6,24 +3423311785,45.13,44 +3423310769,31.68,31 +3423310596,72.36,4 +3423310503,48.43,55 +3423311126,44.48,36 +3423313487,56.54,20 +3423314331,54.11,50 +3423313472,49.51,29 +3423312200,50.66,36 +3423313015,61.82,23 +3423312340,50.45,15 +3423313441,67.7,43 +3423312358,41.61,12 +3423313702,39.63,31 +3423311137,36.08,31 +3423313567,45.18,27 +3423313538,49.33,28 
+3423312814,48.66,45 +3423310890,53.81,25 +3423312595,35.57,44 +3423311887,48.83,39 +3423314123,69.14,40 +3423312427,60.75,59 +3423310735,52.01,39 +3423312257,47.88,45 +3423314416,40.3,30 +3423311310,48.52,30 +3423311848,55.06,37 +3423311622,48.45,28 +3423311051,58.65,40 +3423313971,69.29,21 +3423314179,51.7,21 +3423312887,69.95,18 +3423312583,53.04,37 +3423312990,47.97,27 +3423311820,64.01,26 +3423314018,61.76,23 +3423313775,49.98,28 +3423310869,81.96,27 +3423313262,56.56,25 +3423311532,51.69,30 +3423313252,54.83,40 +3423311201,54.97,39 +3423313632,44.07,27 +3423311574,45.95,40 +3423311102,42.24,22 +3423310805,43.52,49 +3423313805,51.33,31 +3423311177,56.68,55 +3423313477,51.06,24 +3423310780,41.95,28 +3423313713,53.56,27 +3423313597,47.86,45 +3423314406,43.74,34 +3423312012,41.22,39 +3423312915,45.82,42 +3423312481,44.74,25 +3423313911,53.88,30 +3423311105,44.4,29 +3423312215,59.01,35 +3423311909,53.27,19 +3423314269,45.62,25 +3423313837,53.16,53 +3423314291,50.91,51 +3423311881,33.53,42 +3423312020,46.92,33 +3423312610,59.24,35 +3423314333,48.37,32 +3423311877,50.26,21 +3423310736,38.14,30 +3423312193,63.42,26 +3423313751,57.46,14 +3423313882,63.79,33 +3423312229,58.13,30 +3423314081,56.35,41 +3423311721,52.95,3 +3423312718,55.65,27 +3423314210,29.44,34 +3423314427,49.58,39 +3423313361,31.3,33 +3423311301,43.48,29 +3423313773,48.94,34 +3423312093,43.93,32 +3423312612,53.83,36 +3423312474,53.69,21 +3423311693,60.33,8 +3423311450,40.97,42 +3423311351,49.29,21 +3423314032,56.06,34 +3423311710,66.16,32 +3423310578,32.61,23 +3423310713,37.37,25 +3423311812,46.04,38 +3423314051,57.17,33 +3423313543,41.06,31 +3423311594,47.21,36 +3423311290,55.74,44 +3423314242,56.34,25 +3423311609,61.53,11 +3423311665,61.4,38 +3423314223,59.03,29 +3423312735,26.76,22 +3423311754,48.24,34 +3423311799,50.84,33 +3423314167,64.25,28 +3423312455,23.06,15 +3423313298,47.25,23 +3423313402,59.19,33 +3423313266,32.71,17 +3423312626,48.87,25 +3423313671,58.68,24 +3423313459,51.16,25 
+3423311131,49.48,34 +3423313120,16.05,36 +3423312886,53.49,51 +3423312024,57.19,19 +3423312416,49.38,37 +3423311082,38.82,16 +3423311625,49.88,37 +3423311466,48.81,30 +3423313275,58.86,42 +3423313213,45.22,39 +3423312196,49.25,25 +3423312697,71.53,49 +3423312946,49.41,38 +3423312130,63.19,21 +3423310680,48.68,22 +3423313703,50.11,34 +3423313203,43.56,24 +3423312854,49.36,23 +3423312458,23.03,21 +3423313083,36.14,30 +3423311556,37.48,26 +3423313124,46.35,35 +3423313003,41.54,27 +3423312311,66.05,36 +3423312456,47.09,27 +3423310609,33.84,35 +3423313087,41.45,37 +3423312821,56.09,9 +3423311525,54.66,34 +3423310932,53.23,47 +3423314232,53.51,33 +3423311442,48.56,37 +3423310670,35.91,30 +3423311052,58.04,28 +3423312870,37.16,19 +3423311497,42.45,16 +3423310837,62.65,41 +3423312539,51.42,35 +3423311940,46.31,22 +3423312604,51.36,33 +3423313041,63.62,41 +3423312818,56.54,39 +3423310492,48.58,36 +3423311128,30.37,37 +3423311741,51.75,37 +3423311817,61.68,24 +3423314168,56.54,33 +3423313101,41.22,13 +3423314057,46.08,35 +3423312013,53.21,39 +3423311070,57.95,21 +3423311958,53.4,45 +3423313795,50.32,40 +3423311685,62.06,37 +3423313876,32.9,36 +3423312977,43.48,15 +3423312590,47.76,25 +3423313054,41.61,4 +3423313907,28.72,37 +3423311586,53.37,40 +3423313209,37.81,34 +3423310759,48.7,46 +3423314418,53.23,40 +3423312007,43.47,33 +3423311892,64.55,30 +3423314120,59.34,38 +3423311246,29.92,49 +3423312988,45.67,39 +3423314297,44.99,36 +3423311208,51.07,13 +3423311823,57.87,38 +3423312106,57.9,34 +3423314420,45.41,32 +3423311517,66.97,33 +3423311640,49.37,40 +3423311320,49.54,21 +3423312185,51.73,61 +3423312896,29.68,30 +3423314019,66.12,38 +3423311839,52.68,30 +3423311624,50.32,46 +3423312883,63.2,40 +3423311772,45.12,25 +3423313050,38.69,22 +3423312274,50.77,32 +3423311429,48,23 +3423310739,55.36,47 +3423311395,53.11,48 +3423311354,38.51,28 +3423311393,38.96,21 +3423312817,27.4,30 +3423312479,34.95,35 +3423310481,68.86,31 +3423311405,60.4,28 +3423311690,37.33,38 
+3423313373,59.63,24 +3423311282,33.92,43 +3423312839,43.98,6 +3423314319,53.74,31 +3423313884,45.51,33 +3423313972,55.09,19 +3423312249,68.87,18 +3423312335,58.44,19 +3423311494,43.75,19 +3423312701,47.07,33 +3423313515,47.89,28 +3423314282,53.3,41 +3423312619,57.02,37 +3423312412,43.44,31 +3423314324,38.49,21 +3423311966,61.08,24 +3423312519,62.52,41 +3423311365,60.04,34 +3423311004,53.63,20 +3423311801,44.96,46 +3423311838,43.37,50 +3423311337,31.88,30 +3423311381,41.01,37 +3423311788,53.72,13 +3423312168,48.05,18 +3423311734,63.38,41 +3423313610,55.92,22 +3423313259,51.54,22 +3423313763,39.27,34 +3423313915,59.22,14 +3423313443,57.77,14 +3423314388,60.84,9 +3423311771,63.59,21 +3423312064,53.12,30 +3423313947,58.89,9 +3423311377,61.69,36 +3423311879,49.56,24 +3423313273,45.44,29 +3423313260,50.28,21 +3423312737,47.42,35 +3423313200,45.8,28 +3423313121,38.07,23 +3423311258,44.13,24 +3423314365,51.91,59 +3423310478,51.7,36 +3423310833,39.66,19 +3423313185,49.73,26 +3423312421,52.86,24 +3423311571,55.81,29 +3423312578,42.86,24 +3423312982,52.31,25 +3423314281,62.35,28 +3423312574,32.9,16 +3423313596,48.51,23 +3423312692,45.33,12 +3423313089,49.09,12 +3423314364,37.89,14 +3423312830,67.15,17 +3423314422,48.01,32 +3423314257,45.44,28 +3423312259,52.54,22 +3423310550,59.23,40 +3423312359,31.85,37 +3423313930,49.4,54 +3423313282,51.11,25 +3423312938,64.34,47 +3423311073,58.32,42 +3423311930,46.99,13 +3423313641,65.75,34 +3423311511,62.34,12 +3423313352,51.33,38 +3423313889,50,27 +3423313542,63.06,27 +3423312858,64.66,24 +3423314389,54.99,36 +3423313004,40.95,35 +3423311375,52.29,40 +3423311023,40.47,30 +3423310823,59.56,42 +3423312540,58,46 +3423313927,55.12,34 +3423312262,59.21,26 +3423312663,65.54,39 +3423313830,55.64,24 +3423313165,62.4,40 +3423313852,51.01,22 +3423311996,48.83,24 +3423312388,57.58,35 +3423311174,47.21,34 +3423313624,49.26,26 +3423312679,59.51,30 +3423312640,46.12,33 +3423311984,51.7,37 +3423312793,85.36,17 +3423311513,40.04,35 +3423312297,34.77,5 
+3423310599,45.1,4 +3423311331,56,8 +3423312797,38.14,5 +3423310994,53.39,5 +3423314011,50.55,7 +3423311899,48.8,6 +3423312465,55.99,3 +3423312364,48.37,4 +3423310629,55.19,6 +3423312374,51.51,5 +3423311872,43.29,9 +3423311133,44.96,7 +3423314224,43.45,5 +3423311001,48.92,8 +3423312057,52.9,6 +3423310758,50.99,5 +3423311732,45.08,5 +3423311038,53.36,2 +3423312266,53.27,4 +3423313753,51.93,6 +3423313741,64.96,6 +3423313179,49.22,5 +3423313355,59.82,7 +3423312895,45.24,4 +3423314338,60.4,7 +3423312533,35.14,5 +3423312437,43.86,6 +3423313894,57.49,6 +3423310963,47.98,7 +3423311552,48.33,7 +3423310655,52.42,3 +3423313603,40.59,6 +3423310676,42.54,3 +3423310888,39.43,6 +3423311109,67.04,4 +3423313105,32.58,5 +3423310977,41.75,9 +3423313010,55.95,4 +3423312400,49.61,4 +3423313807,43.04,8 +3423312684,33.5,5 +3423312188,47.25,6 +3423313285,47.47,7 +3423310830,29.47,8 +3423310541,52.55,2 +3423313828,52.6,3 +3423310737,58.98,6 +3423310778,44.94,4 +3423311521,59.72,7 +3423314275,57.38,5 +3423310783,46.76,6 +3423311087,35.58,5 +3423312845,42.55,10 +3423313480,46.44,5 +3423312741,29.66,1 +3423310820,30.77,8 +3423310614,65.73,8 +3423311591,73.5,6 +3423313420,41.94,3 +3423313854,38.2,6 +3423313323,49.47,4 +3423312841,42.9,6 +3423311146,44.02,4 +3423311855,74.03,6 +3423313450,41.73,7 +3423313376,63.6,4 +3423311989,59.71,4 +3423311621,51.75,4 +3423312387,49.68,5 +3423311702,43.6,1 +3423311158,60.76,6 +3423312490,44.84,2 +3423313181,53.99,4 +3423312014,46.74,4 +3423311633,48.99,7 +3423311928,56.28,4 +3423312069,49.19,3 +3423312219,51.3,5 +3423312360,62.3,6 +3423312190,64.5,5 +3423314203,43.31,3 +3423311618,50.87,4 +3423314159,33.18,1 +3423313130,60.07,2 +3423312497,47.48,2 +3423311271,38.98,5 +3423314265,61.65,4 +3423313548,60.52,9 +3423312569,50.87,4 +3423312260,40.61,4 +3423313278,61.48,3 +3423310549,50.18,6 +3423311544,50.89,10 +3423313908,51.97,4 +3423312011,52.07,5 +3423310929,31.06,3 +3423311108,52.44,6 +3423314372,35.84,9 +3423311332,52.66,7 +3423312722,50.39,4 
+3423313104,54.05,1 +3423312214,25.78,3 +3423312499,51.53,8 +3423312406,36.51,7 +3423313492,48.2,6 +3423312460,47.23,5 +3423313299,60.59,6 +3423311261,59.46,8 +3423311635,54.7,4 +3423313994,45.16,4 +3423313870,44.92,3 +3423310781,53.44,5 +3423313895,32.54,9 +3423313229,44.51,5 +3423310465,62.26,6 +3423313134,39.05,3 +3423312454,37.68,4 +3423310770,42.33,10 +3423314185,66.06,4 +3423312672,32.94,7 +3423310554,39.67,4 +3423312348,63.08,9 +3423310824,73.78,4 +3423312602,45.24,5 +3423312717,59.71,9 +3423311747,39.79,6 +3423313025,35.93,6 +3423313790,49.1,5 +3423312402,50.22,8 +3423312144,68.63,8 +3423312289,62.03,6 +3423313758,55.64,7 +3423313921,55.4,8 +3423310454,52.28,8 +3423312081,39.84,1 +3423310510,35.52,5 +3423314262,48.79,3 +3423313860,70.7,2 +3423312516,61.92,4 +3423310785,59.86,4 +3423314276,68.37,5 +3423312309,47.65,4 +3423313959,46.7,0 +3423311509,31.24,1 +3423313448,49.47,10 +3423313335,50.58,8 +3423314117,38.61,6 +3423311619,42.83,6 +3423310706,59.72,7 +3423311537,50.21,2 +3423311089,38.04,7 +3423313621,66.87,6 +3423313346,31.07,3 +3423311660,57.66,6 +3423312382,50.89,5 +3423311689,35.99,5 +3423311423,62.2,4 +3423313349,56.52,2 +3423312111,63.05,1 +3423313822,46.99,3 +3423313537,30.45,2 +3423312774,58.23,6 +3423313452,37.05,5 +3423311569,31.14,4 +3423310482,28.91,4 +3423310645,58.82,4 +3423313386,34.34,3 +3423310799,32.49,3 +3423311807,57.76,3 +3423314421,73.89,4 +3423314387,38.4,8 +3423311188,44.13,5 +3423312263,42.26,6 +3423312804,45.89,7 +3423312572,62.56,8 +3423310978,53.54,6 +3423313497,55.4,4 +3423310975,48.81,3 +3423314045,51.01,7 +3423312813,36.88,7 +3423311114,56.42,6 +3423313164,39.94,4 +3423312433,38.7,8 +3423310703,47.39,6 +3423314355,64.67,3 +3423311349,67.47,5 +3423313726,49.39,6 +3423312733,43.1,6 +3423311774,34.03,5 +3423311203,32.72,0 +3423312485,60.65,5 +3423312560,57.26,6 +3423313976,55.1,4 +3423313424,61.06,4 +3423311644,38.91,4 +3423311240,51.35,3 +3423310854,56.87,6 +3423313832,50.2,5 +3423313864,55.34,6 +3423310936,47.95,3 
+3423310470,47.47,6 +3423312401,35.44,6 +3423310687,31.45,8 +3423310892,28.16,3 +3423311490,68.57,6 +3423312120,41.5,5 +3423313369,53.75,8 +3423312112,37.47,5 +3423313878,55.63,4 +3423310526,50.45,7 +3423310926,51.61,3 +3423311247,38.08,7 +3423312367,37.26,5 +3423314414,33.01,4 +3423312413,45.87,6 +3423311585,45.45,4 +3423314004,53.15,8 +3423312148,54.58,5 +3423312449,53.08,2 +3423310472,43.09,4 +3423312356,34.46,4 +3423313067,52.19,4 +3423310949,55.54,5 +3423313254,51.45,5 +3423314300,62.7,1 +3423312015,50.18,8 +3423311866,53.44,6 +3423311129,53.42,4 +3423311808,54.52,4 +3423311615,50.89,6 +3423311026,64.49,9 +3423311837,51.92,8 +3423310750,57.49,6 +3423314208,42.97,1 +3423312029,43.27,4 +3423314376,44.69,6 +3423311646,54.16,3 +3423312070,50.07,4 +3423314034,48.42,5 +3423313689,42.59,6 +3423314221,50.69,4 +3423312245,62.86,3 +3423310615,51.78,2 +3423312720,47.89,3 +3423314111,38.05,1 +3423311744,54.09,4 +3423311053,38.88,4 +3423314345,58,5 +3423311390,42.04,5 +3423312872,35.44,2 +3423313186,43.73,8 +3423312826,65.7,5 +3423310695,43.3,5 +3423312055,54.89,3 +3423314371,54.69,6 +3423310787,46.22,6 +3423311333,47.7,5 +3423313052,39.19,8 +3423313129,57.11,3 +3423313562,47.65,7 +3423312213,38.13,2 +3423312699,33.77,4 +3423312248,54.62,6 +3423314339,51.86,4 +3423311512,56.75,6 +3423314021,50.98,4 +3423313850,46.99,6 +3423314382,52.3,4 +3423311510,47.64,5 +3423311688,28.82,9 +3423311539,53.48,2 +3423310881,45.16,6 +3423313868,56.73,4 +3423310536,71.55,4 +3423313418,32.1,2 +3423310458,41.81,3 +3423312494,52.98,3 +3423310589,43.82,3 +3423311234,40.81,3 +3423311099,46.33,8 +3423311380,37.48,6 +3423313300,42.87,4 +3423313328,54.26,5 +3423313195,46.69,6 +3423312836,48.46,6 +3423310800,56.82,8 +3423313336,48.73,4 +3423313817,38.76,4 +3423312065,38.23,2 +3423313729,54.06,6 +3423311265,42.25,9 +3423310776,35.22,3 +3423312079,43.64,5 +3423314235,45.7,5 +3423311705,45.41,6 +3423314392,70.97,4 +3423311696,36.19,4 +3423311472,56.54,3 +3423313301,44.52,1 +3423313384,42.87,6 
+3423311703,33.86,5 +3423311197,65.17,10 +3423311656,42.75,4 +3423310696,42.59,6 +3423314437,45.15,3 +3423311419,49.7,3 +3423312537,55.39,2 +3423313631,32.12,6 +3423313117,60.07,6 +3423313966,62.34,3 +3423311283,75.84,3 +3423312186,49.46,8 +3423312963,58.15,7 +3423314215,49.1,9 +3423311568,45.97,4 +3423311782,37.1,6 +3423310664,55.74,6 +3423311421,42.81,6 +3423313735,53.28,5 +3423310606,43.15,3 +3423312350,54.53,4 +3423312885,69.29,4 +3423314075,42.99,4 +3423312319,66.21,4 +3423313840,50.94,5 +3423311059,53.75,6 +3423312828,45.17,5 +3423311763,44.64,7 +3423313655,39.62,7 +3423312566,54.55,7 +3423311554,53.05,4 +3423313128,54.35,1 +3423311830,48.64,6 +3423313787,49.34,5 +3423312940,46.83,2 +3423314039,31.81,4 +3423310512,34.46,3 +3423314109,27.3,5 +3423311610,53.95,3 +3423312550,46.57,6 +3423314273,33.36,6 +3423313318,36.42,7 +3423311833,65.73,5 +3423310622,49.9,5 +3423313486,56.31,3 +3423313417,33.04,3 +3423313760,48.62,10 +3423310959,47.42,5 +3423314026,68.57,4 +3423312527,45.07,4 +3423311975,56.07,7 +3423312164,47.01,6 +3423310547,51.53,6 +3423310648,51.81,7 +3423313553,66.83,6 +3423311998,50.79,4 +3423313444,46.33,2 +3423312617,30.52,11 +3423313274,41.67,1 +3423312788,53.38,7 +3423311815,60.45,6 +3423311318,41.56,3 +3423311056,28.94,4 +3423312790,43.5,8 +3423311187,56.31,6 +3423312471,67.77,4 +3423312924,41.93,2 +3423314280,71.73,5 +3423311852,57.04,0 +3423314135,38.44,7 +3423312276,55.64,5 +3423314325,57.39,2 +3423313592,60.07,3 +3423313395,46.69,2 +3423311523,46.59,3 +3423314106,55.64,3 +3423311452,46.09,4 +3423313046,46.25,2 +3423314403,61.75,8 +3423314303,66.17,4 +3423314233,58.07,6 +3423311653,56.07,4 +3423312383,49.81,6 +3423311778,49.75,5 +3423311916,35.17,9 +3423312658,55.71,5 +3423311903,49.71,4 +3423311853,48.97,3 +3423313111,61.79,5 +3423312175,42.77,7 +3423313709,47.63,8 +3423313297,67.19,4 +3423312072,52.03,3 +3423314173,55.64,5 +3423314028,44.09,5 +3423312891,49.52,7 +3423313045,49.78,4 +3423312073,39.56,5 +3423312508,50.54,8 +3423311818,39.46,9 
+3423314430,41.69,5 +3423312844,48.03,8 +3423310534,37.91,5 +3423311139,40.81,4 +3423312170,58.1,2 +3423311659,49.63,5 +3423312734,59.36,7 +3423313806,37.14,6 +3423313066,49.41,6 +3423313269,57.34,2 +3423311361,46.58,7 +3423313306,42.21,6 +3423311548,41.14,2 +3423313692,37.63,3 +3423311121,60.88,5 +3423310928,64.56,3 +3423314099,58.97,6 +3423313531,51.2,6 +3423311416,52.84,3 +3423313875,22.36,5 +3423313812,60.75,2 +3423314008,60.38,4 +3423310593,64.18,1 +3423311764,71.11,6 +3423310521,31.35,3 +3423312928,52.38,4 +3423311155,54.01,4 +3423310768,51.31,5 +3423313122,65.89,3 +3423313403,51.92,7 +3423312683,57.51,5 +3423311154,48.94,3 +3423313363,34.86,6 +3423310747,42.96,3 +3423311016,38.96,4 +3423311731,41.74,8 +3423313375,66.37,5 +3423311386,31.92,4 +3423313979,52.07,6 +3423312979,55.77,5 +3423312246,59.03,4 +3423311971,49.53,7 +3423314361,56.58,7 +3423314055,33.62,7 +3423312698,70.55,6 +3423311829,63.68,2 +3423310906,61.78,9 +3423311206,39.21,8 +3423310546,37.13,5 +3423312794,57.08,6 +3423312009,24.9,8 +3423312126,55.34,5 +3423310634,43.47,7 +3423314049,73.8,6 +3423311275,58.36,10 +3423313127,52.88,4 +3423311694,44.65,6 +3423313364,46.85,5 +3423312763,55.2,7 +3423311626,58.28,0 +3423313314,67.36,6 +3423312050,50.38,5 +3423311578,56.21,1 +3423314214,47.79,6 +3423312244,63.18,5 +3423314336,57.18,5 +3423310930,58.44,6 +3423313155,60.4,6 +3423313175,43.94,4 +3423310537,53.34,3 +3423314321,42.96,3 +3423312097,46.07,3 +3423311414,52.09,9 +3423311584,57.35,5 +3423312217,72.05,7 +3423312163,55.19,4 +3423312061,33.32,4 +3423314054,51.53,4 +3423313950,52.11,5 +3423313135,46.24,7 +3423314444,51.59,6 +3423312929,63.62,6 +3423313565,59.02,5 +3423311674,53.71,6 +3423314304,60.37,6 +3423310530,56,9 +3423310779,58.05,5 +3423313416,50.86,6 +3423313880,39.44,8 +3423313672,44.61,8 +3423314151,54.2,10 +3423310729,36.76,5 +3423312719,74.83,4 +3423313687,46.57,6 +3423310818,50.74,4 +3423312178,46.85,4 +3423311060,52.01,4 +3423311914,63.96,6 +3423314037,68.24,2 +3423312700,43.22,7 
+3423311825,51.73,6 +3423314432,44.04,5 +3423312232,53.66,6 +3423313926,54.99,4 +3423312127,59.09,5 +3423312103,64.15,6 +3423314189,28.36,7 +3423313891,57.13,4 +3423311456,54.29,9 +3423311253,70.87,7 +3423310794,46.55,5 +3423312031,64.77,5 +3423312407,48.84,5 +3423312514,51.37,2 +3423313064,75.54,7 +3423313366,44.68,7 +3423313208,46.59,4 +3423310467,46.66,8 +3423313929,50.36,3 +3423312191,51.39,4 +3423314228,53.95,2 +3423310692,57.78,7 +3423311920,45.77,4 +3423312323,44.64,6 +3423312158,42.6,2 +3423310710,63.59,3 +3423312605,53.75,6 +3423312843,55.88,3 +3423312880,31.35,7 +3423312507,42.2,5 +3423313096,50.1,7 +3423312316,40.18,4 +3423313210,43.8,3 +3423314052,41.84,7 +3423311163,57.03,4 +3423311520,70.83,5 +3423313545,54.96,3 +3423314212,72.8,1 +3423313053,45.58,5 +3423312744,61.79,3 +3423311212,39.81,4 +3423313654,60.9,3 +3423310448,53.76,9 +3423313473,55.85,5 +3423312980,63.36,6 +3423310613,63.15,6 +3423310731,58.98,6 +3423311773,44.21,5 +3423311759,42.09,2 +3423311312,39.78,5 +3423310673,70.77,5 +3423312509,54.82,3 +3423313839,54.07,5 +3423311874,56.74,7 +3423311249,42.78,5 +3423312534,22.27,7 +3423311263,43.07,4 +3423313849,43.5,4 +3423310786,58.48,8 +3423313756,40.4,6 +3423311443,54.84,5 +3423311534,51.83,6 +3423312046,53.08,2 +3423313578,51.08,7 +3423312827,63.84,1 +3423311243,40.95,6 +3423313462,48.07,4 +3423313547,63.67,7 +3423310565,56.59,2 +3423311107,39.37,5 +3423312239,46.7,6 +3423314073,50.53,6 +3423312083,54.02,4 +3423312807,47.21,4 +3423311298,48.05,9 +3423311904,52.01,7 +3423312736,65.61,4 +3423313404,58.38,5 +3423313126,51.45,5 +3423310912,51.74,3 +3423312724,54.57,3 +3423311322,53.41,8 +3423312759,46.4,5 +3423313598,39.22,3 +3423311943,56.13,5 +3423312448,48.63,7 +3423313752,40.75,4 +3423314205,50.55,5 +3423311905,45.64,3 +3423313499,59.35,4 +3423314024,50.47,1 +3423313187,36.82,3 +3423313211,46.86,4 +3423313995,43.53,2 +3423313745,48.16,6 +3423312860,48.21,2 +3423314194,59.44,7 +3423310682,42.9,6 +3423313534,57.77,2 +3423311686,46.41,5 
+3423313698,40.59,9 +3423311231,44.24,4 +3423314435,60.7,1 +3423311611,52.5,3 +3423312638,56.38,10 +3423314419,45.2,6 +3423312952,48.46,6 +3423313896,48.5,6 +3423313488,45.83,2 +3423313103,54.89,4 +3423313574,38.09,2 +3423312978,52.25,5 +3423313604,48.21,8 +3423311641,50.43,7 +3423310531,50.07,2 +3423312591,48.47,9 +3423312800,66.46,6 +3423311865,40.46,5 +3423311850,15.52,3 +3423311697,62.54,4 +3423312242,39.57,6 +3423313733,47.88,7 +3423312871,27.5,3 +3423311075,45.97,7 +3423310874,42.01,4 +3423312241,62.54,4 +3423312483,46.36,3 +3423312575,47.58,10 +3423313582,51.68,4 +3423311185,71.04,4 +3423312542,55.43,4 +3423311647,36.37,2 +3423312651,52.4,3 +3423313272,36.89,6 +3423311184,54.39,7 +3423313953,54.56,5 +3423310490,60.76,5 +3423314243,30.32,3 +3423310777,58.49,6 +3423312286,59.55,8 +3423311483,30.84,7 +3423311134,44.06,1 +3423313611,37.52,6 +3423311889,64.31,5 +3423313514,50.05,4 +3423313799,54.52,6 +3423312625,56.53,5 +3423311465,59.16,8 +3423313942,56.49,6 +3423314413,33.68,4 +3423311077,54.42,7 +3423313197,47.62,5 +3423310979,52.44,7 +3423313388,55.47,7 +3423313922,41.38,8 +3423312059,60.32,3 +3423313810,42.71,5 +3423313673,41.83,8 +3423313521,48.98,4 +3423312852,57.87,5 +3423311057,41.84,4 +3423313721,55.12,6 +3423314240,38.5,1 +3423311350,37.73,4 +3423312264,36.91,3 +3423312349,37.5,6 +3423312331,60.28,2 +3423313918,45.51,6 +3423310450,51.59,4 +3423314161,49.46,6 +3423313161,57.05,4 +3423313824,41.16,2 +3423311086,55.64,5 +3423310493,56.91,3 +3423313086,57.02,7 +3423310598,60.17,5 +3423310836,45.68,3 +3423312386,68.44,6 +3423310663,58.77,5 +3423311473,51.97,2 +3423313690,58.28,5 +3423312890,50.39,5 +3423311988,23.9,6 +3423313746,36.91,6 +3423314131,48.86,4 +3423313378,70.21,6 +3423311596,61.06,3 +3423313235,49.54,9 +3423313803,47.36,7 +3423313936,54.75,5 +3423310980,62.01,4 +3423314301,56.41,3 +3423310937,32.09,2 +3423311875,42.75,7 +3423312115,73.66,5 +3423311906,54.27,5 +3423314118,42.71,7 +3423311145,46.62,3 +3423314354,45.74,4 +3423312154,53.28,5 
+3423313711,41.83,5 +3423311309,48.76,4 +3423314146,47.89,4 +3423314247,49.28,2 +3423314337,56.76,6 +3423314061,42.92,6 +3423310723,44.74,8 +3423313207,51.19,5 +3423311670,41.78,5 +3423310506,47.95,9 +3423312373,58.91,6 +3423312835,52.6,7 +3423311978,53.59,2 +3423313201,48.09,6 +3423314353,39.06,1 +3423310811,57.59,4 +3423312778,44.44,3 +3423311500,59.68,4 +3423310491,57.78,4 +3423311985,27.86,8 +3423312044,48.62,9 +3423312049,54.09,5 +3423313576,39.3,2 +3423313458,38.32,5 +3423313326,43.68,1 +3423311970,53.08,5 +3423314154,55.26,5 +3423314184,51.24,2 +3423312292,50.03,3 +3423312032,49.38,3 +3423314431,68.26,5 +3423312491,44.97,8 +3423311055,43.36,3 +3423313771,42,8 +3423314306,56.86,6 +3423311267,36.67,7 +3423312265,34.28,2 +3423313319,57.2,5 +3423311303,61.23,7 +3423312337,41.21,5 +3423311285,43.36,4 +3423313887,43.16,6 +3423310878,55.7,5 +3423313772,46.44,5 +3423314335,55.43,3 +3423313954,55.96,8 +3423314362,60.51,2 +3423314107,55.29,6 +3423313405,64.11,5 +3423310610,47.18,7 +3423311949,49.05,5 +3423313407,41.91,5 +3423313160,39.94,6 +3423313230,52.03,5 +3423313647,47.6,4 +3423310576,57.05,9 +3423314014,57.45,5 +3423311237,29.65,3 +3423314122,44.09,4 +3423314375,61.38,9 +3423312078,39.04,4 +3423311859,48.86,10 +3423313727,42.99,8 +3423313716,53.51,8 +3423310675,63.68,6 +3423313546,35.51,7 +3423313383,38.84,4 +3423311406,52.64,6 +3423314002,65.57,6 +3423313740,47.85,7 +3423312942,57.64,4 +3423311191,62.54,7 +3423311259,52.2,7 +3423312628,35.54,7 +3423311245,41.49,6 +3423311334,49.31,6 +3423314003,33.9,6 +3423312489,41.13,4 +3423312473,57.57,1 +3423313794,40.22,6 +3423310514,39.79,4 +3423311589,56.45,4 +3423311293,51.2,3 +3423311634,54.34,6 +3423311704,75.52,0 +3423312300,37.02,2 +3423311844,57.63,8 +3423313519,37.91,7 +3423310947,36.49,5 +3423312655,44.97,9 +3423310843,49.35,10 +3423311474,53,4 +3423312573,56.63,5 +3423310535,66.45,4 +3423310605,41.42,7 +3423310616,59.14,8 +3423312884,36.8,4 +3423313044,55.03,2 +3423312787,51.24,5 +3423313031,45.52,4 
+3423310693,50.75,5 +3423311067,73.97,3 +3423312510,40.05,5 +3423313071,49.55,3 +3423311072,60.58,3 +3423310570,36.12,4 +3423312114,43.7,7 +3423311561,77.82,9 +3423312317,42.41,7 +3423311941,46.46,6 +3423313768,42.52,5 +3423312255,48.96,6 +3423313865,41.59,3 +3423312926,58.07,5 +3423313544,49.89,9 +3423314088,71.42,3 +3423312267,48.29,5 +3423311199,47.24,3 +3423313601,42,5 +3423311069,52.11,8 +3423310745,54.97,5 +3423313489,58.11,9 +3423312132,72.99,4 +3423313178,58.53,2 +3423311202,56.08,4 +3423313662,33.77,3 +3423312954,39.31,3 +3423310538,33.46,8 +3423313469,48.75,8 +3423311000,64.69,5 +3423312847,38.47,2 +3423313780,47.13,3 +3423311079,50.73,9 +3423314274,34.4,2 +3423311233,33.48,5 +3423312287,45.9,7 +3423310868,49.23,3 +3423313242,51.51,6 +3423312305,49.18,5 +3423313730,57.78,5 +3423311336,28.74,5 +3423312674,45.15,5 +3423314423,50.93,7 +3423314204,61.99,5 +3423314015,47.9,3 +3423311724,65.76,4 +3423310661,56.57,8 +3423314352,48.53,4 +3423312673,53.39,7 +3423313304,43.86,8 +3423311650,58.52,6 +3423314162,66.75,7 +3423311981,68.18,5 +3423312708,62.89,5 +3423311842,55.64,4 +3423312918,53.12,6 +3423310789,57.4,9 +3423313525,35.52,6 +3423314278,40,3 +3423313952,58.5,5 +3423312087,27.59,7 +3423313006,45.9,6 +3423310753,62.47,7 +3423313102,38.7,5 +3423313011,53.45,4 +3423310607,56.48,3 +3423310908,58.94,5 +3423311355,58.46,8 +3423312597,58.46,5 +3423311982,57.82,1 +3423311115,46.11,4 +3423312475,57.25,6 +3423311022,35.04,5 +3423314271,38.49,7 +3423311173,62.91,4 +3423312670,51.77,5 +3423312833,34.74,8 +3423311446,48.67,3 +3423313116,72.61,4 +3423314066,41.2,3 +3423310486,45.08,4 +3423313356,54.41,7 +3423313227,36.24,2 +3423314020,43.29,4 +3423311032,35.7,2 +3423313374,49.07,7 +3423312629,53.42,7 +3423310816,66.52,2 +3423312637,54.8,8 +3423314101,51.22,4 +3423314287,40.14,1 +3423312468,45.01,7 +3423312922,67.27,3 +3423310704,55.42,6 +3423313190,45,7 +3423310899,63.99,5 +3423311761,50.13,7 +3423311142,41.71,8 +3423313675,64.48,3 +3423310935,52.58,7 +3423310877,50.79,5 
+3423311912,60.41,8 +3423312705,44.97,6 +3423313140,43.83,3 +3423311031,41.84,7 +3423313778,60.82,5 +3423313287,46.26,7 +3423312002,44.7,4 +3423313494,42.04,3 +3423311983,38.93,1 +3423311180,57.13,5 +3423312351,37.52,1 +3423311348,59.66,3 +3423311125,49.6,4 +3423310477,40.66,7 +3423314211,48.05,7 +3423312279,55.67,3 +3423312010,43.35,3 +3423310807,46.51,6 +3423314270,70.91,7 +3423311150,44.03,7 +3423313294,35.2,6 +3423311953,43.3,5 +3423310726,44.78,6 +3423310523,52.82,5 +3423313305,37.21,5 +3423311563,42.7,6 +3423314025,33.32,8 +3423312696,58.67,6 +3423312639,58.13,7 +3423312209,47.11,6 +3423313502,45.39,6 +3423312231,39.61,6 +3423313619,63.94,4 +3423310825,45.15,3 +3423310812,59.59,6 +3423312687,64.19,4 +3423310886,45.63,7 +3423311613,47.64,3 +3423311637,63.75,8 +3423313983,54.16,5 +3423312346,66.42,4 +3423310665,45.96,4 +3423310690,37.21,6 +3423310612,70.05,7 +3423312034,54.32,3 +3423311923,49.09,5 +3423312052,47.47,7 +3423312882,56.14,6 +3423313802,47.57,5 +3423313174,55.15,9 +3423312118,52.99,9 +3423313225,68.57,4 +3423310946,35.46,5 +3423310773,27.32,6 +3423311118,44.74,2 +3423311330,46.55,5 +3423311447,36.59,7 +3423313851,45.58,0 +3423313202,68.64,9 +3423311277,36.22,5 +3423313723,43.94,6 +3423313238,39.21,5 +3423313334,32.36,5 +3423310964,70.25,6 +3423312381,51.06,7 +3423310752,46.92,6 +3423312092,44.49,7 +3423311495,62.01,2 +3423312815,32.33,7 +3423312452,54.26,3 +3423311205,52.53,6 +3423313762,50.69,8 +3423313819,53.79,5 +3423310567,36.19,2 +3423312487,42.01,4 +3423311182,31.12,5 +3423314094,56.16,6 +3423313657,62.38,2 +3423312635,52.32,6 +3423314206,34.81,3 +3423313099,42.03,4 +3423311341,43.61,8 +3423311270,63.2,7 +3423312747,41.61,4 +3423312730,33.45,5 +3423311880,39.06,8 +3423314108,53.51,1 +3423311003,19.62,3 +3423310788,41.26,7 +3423311274,51.31,3 +3423310623,32.51,2 +3423312941,53.18,7 +3423312675,51.24,5 +3423310466,48.97,4 +3423312480,68.7,3 +3423312258,74.14,5 +3423312784,38.76,6 +3423310581,46.39,6 +3423312802,56.55,2 +3423311120,50.93,10 
+3423312503,44.51,1 +3423314237,76.31,4 +3423311683,59.02,3 +3423313909,45.85,6 +3423311735,59.98,6 +3423312223,44.15,5 +3423313501,56.09,11 +3423314060,41.52,5 +3423312981,71.33,4 +3423313024,66.13,4 +3423313446,38.77,5 +3423312773,48.5,7 +3423311973,43.65,5 +3423313267,52.81,10 +3423310743,57.35,7 +3423313143,57.27,4 +3423311397,58.51,6 +3423310885,48.44,7 +3423314436,45.99,3 +3423313051,62.48,8 +3423314195,53.07,1 +3423313506,43.96,5 +3423311942,55.07,6 +3423310542,34.54,2 +3423310991,50.82,6 +3423312632,55.45,8 +3423313940,50.43,7 +3423314385,59.81,8 +3423311643,48.09,3 +3423313475,64.81,8 +3423312707,48.74,7 +3423311590,53.44,1 +3423310516,52.31,3 +3423311002,38.29,6 +3423313084,47.79,9 +3423311789,71.67,7 +3423313890,37.41,5 +3423314071,52.13,7 +3423310867,63.03,2 +3423312704,52.1,2 +3423313219,47.39,5 +3423312395,42.57,5 +3423314181,76.89,5 +3423312472,52.59,6 +3423313699,56.08,8 +3423311809,38.66,1 +3423311281,45.23,5 +3423312119,52.85,5 +3423312515,56.87,6 +3423313816,36.66,4 +3423310863,48.88,2 +3423313939,60.99,4 +3423313415,48.69,6 +3423311398,36.84,6 +3423311453,36.08,5 +3423313524,49.37,3 +3423313530,51.82,5 +3423313688,42.53,4 +3423311394,44.13,6 +3423313002,53.27,6 +3423310734,54.33,6 +3423313512,65.01,6 +3423313232,51.46,4 +3423310915,56.07,6 +3423310797,53.48,5 +3423312745,71.54,1 +3423314191,44.69,3 +3423313551,36.53,6 +3423313737,50.75,4 +3423313706,57.83,7 +3423312997,47.41,3 +3423310489,52.98,6 +3423313075,46.99,6 +3423313199,66.27,5 +3423314150,33.42,9 +3423311692,61.98,4 +3423313157,34.11,1 +3423313679,52.42,7 +3423314296,57.05,5 +3423313912,54.73,5 +3423311716,64.47,2 +3423313119,31.38,7 +3423314067,68.87,5 +3423311172,57.61,5 +3423311558,57.65,8 +3423312332,58.08,4 +3423312003,51.31,6 +3423312174,66.32,6 +3423311112,51.75,5 +3423312555,51.75,6 +3423311726,34.26,5 +3423312903,48.79,6 +3423313900,23.46,2 +3423312798,61.32,4 +3423311433,44.18,5 +3423314283,38.98,7 +3423311800,41.52,8 +3423311189,64.71,5 +3423313815,55.04,4 +3423312571,41.94,3 
+3423311680,46.88,6 +3423312467,73.97,3 +3423311506,35.55,5 +3423311411,51.54,9 +3423314038,53.32,4 +3423312996,63.86,2 +3423313030,33.69,8 +3423311242,47.34,6 +3423313658,35.38,4 +3423311668,61.31,4 +3423313668,55.71,1 +3423310856,58.87,3 +3423313214,43.3,5 +3423312091,36.19,8 +3423313463,47.62,5 +3423312825,39.87,5 +3423311781,51.76,9 +3423313460,46.82,5 +3423312321,59.73,8 +3423314110,60.88,7 +3423313091,55.99,4 +3423313518,46.93,3 +3423311413,62.48,2 +3423312492,59.47,5 +3423312461,49.09,5 +3423311977,42.49,5 +3423313725,36.63,5 +3423313783,57.03,6 +3423312545,40.71,6 +3423312824,42.1,6 +3423310630,53.8,3 +3423313049,53.5,2 +3423312517,37.27,6 +3423311843,44.17,6 +3423312145,39.32,5 +3423311530,48.39,6 +3423312823,57.14,5 +3423314089,47.26,3 +3423310475,38.03,5 +3423314425,31.36,5 +3423312271,68.26,6 +3423312706,54.23,2 +3423312755,52.52,1 +3423310603,52.19,3 +3423314129,78.26,4 +3423313239,78.56,4 +3423311792,58.48,5 +3423311214,47.36,7 +3423312968,50.25,6 +3423311062,76.26,2 +3423313061,34.12,4 +3423311006,49.99,7 +3423311420,60.89,6 +3423313426,51.83,6 +3423311968,40.4,2 +3423313062,64.86,8 +3423312486,51.32,3 +3423313533,55.83,4 +3423311198,61.88,2 +3423312644,45.35,4 +3423313666,55.7,1 +3423312816,40.17,2 +3423310983,61.46,7 +3423311403,41.67,4 +3423314256,44.85,3 +3423314402,53.25,8 +3423314390,41.92,5 +3423310909,63.38,4 +3423312769,55.52,8 +3423311730,59.16,7 +3423310847,62.14,6 +3423312290,33.4,9 +3423313493,41.01,3 +3423312296,39.59,3 +3423313434,48.31,6 +3423311220,47.96,4 +3423313379,40.43,5 +3423311749,50.35,4 +3423313999,46.08,3 +3423312096,48.83,5 +3423313584,50.29,4 +3423312543,56.23,4 +3423311498,36.93,4 +3423311700,51.13,8 +3423313387,45.21,3 +3423314426,47.56,8 +3423311840,49.5,9 +3423312315,41.22,8 +3423313844,65.02,3 +3423310501,61.22,9 +3423310720,39.6,7 +3423311546,59.64,6 +3423311058,50.75,8 +3423313464,55.84,3 +3423311090,51.12,6 +3423310694,54.09,6 +3423313948,45.71,7 +3423311885,64.43,5 +3423312299,51.49,2 +3423311412,42.73,5 
+3423312399,55.28,2 +3423312150,46.47,7 +3423311681,65.79,3 +3423313322,42.04,5 +3423313193,52.88,5 +3423311260,51.86,2 +3423311605,42.75,4 +3423313158,41.34,3 +3423314046,37.14,6 +3423313476,51.39,9 +3423313833,41.93,4 +3423313226,64.54,7 +3423313792,50.24,8 +3423311516,43.43,10 +3423312911,26.87,3 +3423314370,55.75,3 +3423313588,51.23,5 +3423311415,45.6,4 +3423314386,53.04,9 +3423312914,74.2,3 +3423311396,68.53,3 +3423313886,61.21,1 +3423313990,51.91,2 +3423311709,39.73,7 +3423313797,63.49,4 +3423314438,41.37,1 +3423313701,58.09,6 +3423311725,41.1,4 +3423312149,29.32,6 +3423313777,53.43,5 +3423311020,42.54,5 +3423312806,48.38,6 +3423313708,53.68,2 +3423311244,43.9,1 +3423311225,50.56,4 +3423312392,62.59,4 +3423311796,37.18,3 +3423313509,48.98,6 +3423314373,46.67,6 +3423313674,56.56,3 +3423310677,48.74,3 +3423312838,63.24,3 +3423311810,36.26,11 +3423310594,37.36,2 +3423312848,48.48,1 +3423312677,29.42,7 +3423310764,55.04,5 +3423313749,54.05,2 +3423310566,49.24,1 +3423312429,51.72,5 +3423312799,43.03,7 +3423312631,56.77,7 +3423312654,31.62,5 +3423314036,30.61,2 +3423311352,39.68,4 +3423314149,44.39,4 +3423311008,47.51,9 +3423314245,59.39,5 +3423313291,65.3,5 +3423312254,44.3,3 +3423313484,47.74,3 +3423311463,57.2,7 +3423312528,37.62,3 +3423311593,44.39,3 +3423312529,51.67,5 +3423311096,59.58,4 +3423312370,42.08,7 +3423310617,34.59,5 +3423313645,60.39,8 +3423312107,59.46,6 +3423311135,56.72,3 +3423312559,55.09,4 +3423313577,59.63,3 +3423312189,54.41,4 +3423312693,46.15,5 +3423310686,29.92,4 +3423311972,52.07,3 +3423313951,57.64,6 +3423312974,55.9,3 +3423310551,44.65,4 +3423314254,60.79,6 +3423314169,50.38,6 +3423313293,62.09,3 +3423314063,51.05,4 +3423313330,53.11,2 +3423314397,64.75,7 +3423310471,61.68,6 +3423312001,51.3,5 +3423314231,59.63,1 +3423312863,56.13,6 +3423310572,42.12,4 +3423312962,60.37,7 +3423311379,37.6,6 +3423313009,51.12,4 +3423313943,46.04,2 +3423313112,42.52,10 +3423314284,66.51,10 +3423312765,37.17,5 +3423312236,55.8,5 +3423313676,56.27,4 
+3423314349,65.04,8 +3423313968,43.77,6 +3423313607,63.01,7 +3423313670,42.77,6 +3423311777,44.26,8 +3423310831,55.84,9 +3423314238,52.42,7 +3423312967,29.67,5 +3423312614,46.95,8 +3423310744,46.28,5 +3423311153,55.74,5 +3423312099,56.12,3 +3423313196,54.26,4 +3423310958,56.04,1 +3423311148,60.79,8 +3423311604,50.19,5 +3423313139,50.72,6 +3423313431,52.72,2 +3423312777,61.67,5 +3423312685,47.17,5 +3423311272,51.93,5 +3423310557,38.38,6 +3423313786,57.32,9 +3423312592,39.57,4 +3423311857,68.99,4 +3423311106,35.39,7 +3423314097,56.91,8 +3423311238,48.66,4 +3423312089,57.91,4 +3423314164,41.25,6 +3423313732,46.26,8 +3423313474,27.99,2 +3423312992,40.4,5 +3423311802,45.54,3 +3423310453,34.68,6 +3423313022,48.75,6 +3423310495,46.79,7 +3423310810,53.83,6 +3423313635,51.13,5 +3423312908,61.03,7 +3423312714,32.3,5 +3423312652,55.84,5 +3423314391,29.73,4 +3423312368,58.43,5 +3423311886,45.56,3 +3423311891,56.99,7 +3423313490,42.04,4 +3423311878,54.03,3 +3423311278,49.17,5 +3423311769,55.12,6 +3423313722,39.28,6 +3423310639,39.88,4 +3423311085,34.49,8 +3423311913,50.52,7 +3423310880,40.7,2 +3423310483,53.33,3 +3423311540,34.04,5 +3423313182,45.54,11 +3423312660,52.09,8 +3423310716,45.01,8 +3423312576,52.06,3 +3423310555,58.46,5 +3423311488,37.58,7 +3423311565,46.91,2 +3423314294,48.57,8 +3423310796,25.39,7 +3423310684,53.29,8 +3423313063,55.97,3 +3423313667,39.14,7 +3423313092,48.74,4 +3423310945,52.26,2 +3423313077,67.57,7 +3423312842,38.43,4 +3423311256,29.59,4 +3423311455,39.81,8 +3423311617,37.84,3 +3423313223,53.56,2 +3423312633,48.79,2 +3423311947,37.66,2 +3423311599,52.05,7 +3423311931,65.27,6 +3423313587,38.74,6 +3423313321,29.28,3 +3423312496,21.6,3 +3423312889,28.7,0 +3423313467,46.08,7 +3423311868,42.02,5 +3423312820,49.96,1 +3423310950,48.05,9 +3423311171,61.36,5 +3423314400,52.83,7 +3423310865,69.19,4 +3423313510,33.2,3 +3423311555,51.27,5 +3423313396,54.48,2 +3423311501,42.47,7 +3423310730,46.2,5 +3423311581,47.92,2 +3423313265,49.43,5 +3423312039,38.26,5 
+3423310562,30.5,5 +3423311654,48.88,7 +3423314105,49.45,4 +3423310620,60.69,6 +3423312325,57.36,7 +3423313350,51.15,6 +3423313365,30.3,1 +3423313303,55.41,3 +3423313593,44.98,3 +3423313903,56.9,10 +3423314383,36.37,5 +3423312957,55.2,1 +3423311046,66.98,9 +3423312584,65.93,3 +3423311081,50.89,5 +3423314023,40.97,3 +3423312484,46.17,3 +3423312829,43.19,5 +3423310985,42.01,6 +3423313831,66.26,6 +3423314326,48.41,6 +3423311714,64.68,10 +3423311028,65.17,2 +3423310801,40.48,4 +3423311664,38.74,7 +3423312198,39.83,4 +3423310627,42.94,3 +3423311499,54.04,7 +3423310927,35.32,6 +3423311149,62.47,4 +3423311335,39.72,7 +3423310587,47.95,9 +3423311699,40.29,7 +3423313159,51.25,5 +3423311009,51.77,3 +3423311742,32.28,8 +3423311036,44.08,5 +3423310657,52.84,4 +3423313110,48.98,3 +3423310855,57,8 +3423313310,58.22,5 +3423310474,41.38,2 +3423310738,45.01,7 +3423311816,43.28,9 +3423313686,52.78,9 +3423311939,37.25,8 +3423313937,53.88,3 +3423313731,53.84,6 +3423311583,52.7,1 +3423313180,44.56,4 +3423314253,50.16,7 +3423313910,51.76,4 +3423312417,40.61,4 +3423311564,56.47,8 +3423312172,41.94,6 +3423313036,62.92,7 +3423314076,65.48,7 +3423310533,48.82,2 +3423311504,39.13,4 +3423313413,30.27,4 +3423314166,47.01,6 +3423310846,39.04,4 +3423313644,36.6,4 +3423311550,60.09,3 +3423312431,38.5,7 +3423314140,60.46,3 +3423313339,48.26,5 +3423311193,46.74,2 +3423310674,44.85,2 +3423310813,61.38,6 +3423313385,63.88,6 +3423312894,44.43,4 +3423312546,59.64,6 +3423312324,55.07,1 +3423310582,56.72,7 +3423313956,52.33,4 +3423312579,47.04,7 +3423313263,45.59,6 +3423311374,45.47,5 +3423314207,50.18,3 +3423310871,38.85,8 +3423312920,51.57,3 +3423310763,56.38,1 +3423314218,43.52,8 +3423310714,53.46,5 +3423310625,45.74,4 +3423311755,46.48,6 +3423313620,50.63,8 +3423313898,43.06,3 +3423312192,34.77,4 +3423310485,47.34,5 +3423311469,59.12,11 +3423313663,41.22,4 +3423311211,46.41,10 +3423313639,45.58,4 +3423312415,45.28,7 +3423314069,55.43,6 +3423313569,54.32,4 +3423311805,40.04,3 +3423312851,62.46,5 
+3423312423,60.24,4 +3423310887,54.78,6 +3423314192,52.11,4 +3423313163,46.1,6 +3423310700,46.13,9 +3423311304,45.62,5 +3423312122,56.74,3 +3423314310,42.62,4 +3423312810,74.88,7 +3423314096,33.22,3 +3423313978,39.93,4 +3423313243,60.35,2 +3423314127,54.66,4 +3423312463,34.96,6 +3423310631,58.1,8 +3423311753,46.64,2 +3423313430,63,6 +3423312330,48.1,9 +3423312904,38.66,3 +3423312018,48.91,0 +3423312641,63.5,4 +3423313056,44.26,4 +3423314329,36.23,3 +3423314174,45.16,5 +3423310513,49.35,8 +3423310662,49.73,11 +3423310883,54.63,5 +3423311575,54.04,7 +3423311410,60.63,6 +3423312147,40.05,0 +3423311757,61.61,1 +3423312910,45.63,8 +3423312017,40.1,4 +3423311739,35.24,7 +3423312620,49.02,7 +3423314022,50.07,8 +3423312715,54.38,4 +3423311378,56.85,3 +3423312760,42.39,4 +3423311719,49.53,4 +3423310918,37.42,8 +3423313148,57.2,6 +3423313770,38.4,4 +3423313465,50.1,5 +3423311803,53.42,1 +3423312728,50.55,4 +3423312568,45.7,7 +3423311595,48.56,3 +3423310529,59.31,7 +3423311921,46.6,4 +3423313145,49.49,3 +3423313637,40.67,5 +3423312661,62.34,3 +3423314360,56.89,7 +3423311323,52.19,4 +3423312532,39.67,5 +3423312354,49.69,5 +3423313371,56.33,3 +3423313800,44.21,5 +3423313618,43.54,6 +3423312142,33.63,5 +3423312764,58.27,1 +3423311566,40.18,6 +3423311043,50.38,5 +3423313897,37.68,5 +3423312207,50.65,3 +3423313960,44,7 +3423314157,50.05,8 +3423313575,56.99,7 +3423312618,66.04,5 +3423311698,33.17,2 +3423310520,45.31,6 +3423313919,64.43,8 +3423311835,52.05,4 +3423310996,45.12,4 +3423310509,55.8,8 +3423312284,55.43,4 +3423311084,43.25,4 +3423312549,55.56,4 +3423312124,47.95,8 +3423311019,66.13,5 +3423313559,50.75,2 +3423310804,64.75,4 +3423310848,36.52,7 +3423310866,60.46,4 +3423311217,49.37,9 +3423313432,50.69,7 +3423312849,70.29,8 +3423311140,50.64,2 +3423312912,38.68,4 +3423313757,56.9,3 +3423312161,43.42,8 +3423312385,40,6 +3423311994,66.08,4 +3423313944,62.75,6 +3423314408,50.25,10 +3423314263,41.55,3 +3423312976,51.98,3 +3423314112,56.02,5 +3423313296,47.55,3 
+3423311050,45.86,6 +3423313014,44.31,4 +3423310637,63.6,2 +3423313325,51.62,6 +3423310910,47.76,3 +3423312596,49.68,9 +3423311922,45.02,5 +3423311021,37.79,3 +3423312853,49.27,5 +3423310751,39.67,9 +3423312208,50.9,9 +3423313406,29.54,5 +3423311222,44.2,6 +3423313423,59.43,5 +3423313358,45.78,8 +3423310459,48.82,5 +3423311846,51.48,6 +3423311894,36.11,9 +3423312958,47.52,6 +3423310624,57.27,9 +3423312959,47.95,5 +3423312197,49.19,7 +3423312143,54,4 +3423310981,52.27,7 +3423312371,62.39,4 +3423311166,48.22,5 +3423313734,71.83,5 +3423310711,42.53,3 +3423313107,41.02,9 +3423312384,39.76,8 +3423313589,54.67,4 +3423312801,65.55,3 +3423314158,56.67,5 +3423311997,58.09,3 +3423313217,45.26,11 +3423313264,48.47,6 +3423313482,60.23,2 +3423312743,39.97,3 +3423313035,67.34,1 +3423311858,67.87,3 +3423312028,29.69,6 +3423310961,54.15,5 +3423310845,51.15,3 +3423313098,50.1,5 +3423313535,68.64,3 +3423311822,54.16,3 +3423312110,43.23,3 +3423310767,63.62,4 +3423311938,52.73,3 +3423313612,64.16,7 +3423313138,50.08,3 +3423314250,47.15,3 +3423313941,49.3,1 +3423310766,43.37,4 +3423312362,49.88,5 +3423312786,55.95,8 +3423312753,47.52,8 +3423312379,56.61,6 +3423314230,53.26,6 +3423314084,55.77,5 +3423311902,44.67,7 +3423314320,55.07,4 +3423313435,39.75,3 +3423313962,65.73,5 +3423312498,49.55,3 +3423314128,59.63,4 +3423314095,54.67,5 +3423312022,56.32,8 +3423312339,44.71,6 +3423314187,45.95,3 +3423314239,54.84,6 +3423311014,47.34,6 +3423313149,45.73,3 +3423312036,48.84,7 +3423314308,57.45,6 +3423312642,60.97,9 +3423314219,50.64,4 +3423311642,52.12,5 +3423311898,34.72,5 +3423312951,53.49,4 +3423310494,56.82,3 +3423312408,50.26,5 +3423311737,44.29,6 +3423313152,29.93,4 +3423314006,58.14,7 +3423313332,43.63,8 +3423313989,52.03,4 +3423313710,55.81,6 +3423312056,69.54,7 +3423314446,35.54,6 +3423312068,37.06,8 +3423313649,44.46,4 +3423313555,53.36,11 +3423312531,54.1,8 +3423313739,41.54,8 +3423314429,37.17,2 +3423313206,48.39,5 +3423311608,51.92,7 +3423311969,60.39,7 +3423314316,40.03,4 
+3423313564,32.36,5 +3423310894,65.55,2 +3423313246,58.83,3 +3423311662,59.37,6 +3423314170,74.91,5 +3423313917,51.97,5 +3423312187,35.67,5 +3423314217,41.93,4 +3423313037,48.01,6 +3423312155,29.85,6 +3423313627,57.22,7 +3423311098,52.76,4 +3423313228,71.84,3 +3423313862,57.11,5 +3423312877,45.24,3 +3423314322,65.49,6 +3423314378,58.16,9 +3423314175,30.44,3 +3423311489,58.54,7 +3423312662,55.85,5 +3423310611,54.94,7 +3423312688,66.46,6 +3423311775,44.42,6 +3423312611,63.96,5 +3423310602,44.38,5 +3423313560,67.31,5 +3423314393,54.63,2 +3423310527,50.98,0 +3423313532,66.8,6 +3423314366,52.43,6 +3423313188,39.4,8 +3423311368,50.63,3 +3423310931,49.28,6 +3423312094,72.16,5 +3423311538,36.74,7 +3423310553,52.48,8 +3423314367,59.95,7 +3423311507,65.88,8 +3423312203,49.83,9 +3423314056,60.85,5 +3423314068,59.7,10 +3423310585,66.76,2 +3423313205,50.68,4 +3423312523,44.67,5 +3423314268,53.88,4 +3423314327,48.34,4 +3423310870,46.08,3 +3423311061,58.53,8 +3423312897,53.43,6 +3423314225,51.34,8 +3423313329,74.11,6 +3423312033,55.79,5 +3423310604,71.3,8 +3423312742,59.34,9 +3423310838,73.13,3 +3423312269,38.97,4 +3423312211,31.88,7 +3423312306,26.14,5 +3423312547,35.39,8 +3423311300,60.21,6 +3423312502,49.17,0 +3423310502,35.29,3 +3423312740,51.57,5 +3423313769,56.01,4 +3423313347,59.08,5 +3423310859,56.78,6 +3423312859,56.02,5 +3423311071,31.3,5 +3423311597,56.1,9 +3423313541,60.55,7 +3423313013,64.6,3 +3423311738,32.09,6 +3423311766,52.49,6 +3423314350,61.01,7 +3423312278,57.26,6 +3423312751,34.42,6 +3423311871,44.44,7 +3423312250,45.53,5 +3423312320,44.78,6 +3423312086,31.13,6 +3423310971,39.02,4 +3423311669,53.34,2 +3423312615,51.01,5 +3423311529,48.81,6 +3423311268,50.16,4 +3423314080,66.56,4 +3423313277,51.05,4 +3423311918,51.44,5 +3423311029,49.22,5 +3423312216,44.82,7 +3423312949,55.23,8 +3423313156,45.51,3 +3423313409,44.53,4 +3423312822,25.9,3 +3423313829,61.28,6 +3423313392,30.89,5 +3423311482,41.6,6 +3423310717,69.79,6 +3423312659,47.48,3 +3423313167,50.64,3 
+3423310821,62.55,3 +3423313250,40.97,6 +3423311779,64.38,6 +3423313681,72.33,5 +3423311399,57.8,8 +3423311230,60.3,7 +3423313873,45.25,8 +3423310740,61.46,6 +3423314252,50.14,3 +3423311353,51.31,2 +3423313307,41.42,6 +3423314412,33.65,8 +3423312710,37.09,7 +3423311851,35.76,7 +3423314302,54.8,6 +3423314092,41.47,6 +3423313608,43.54,4 +3423311727,42.72,3 +3423312840,56.12,2 +3423311967,51.91,6 +3423312554,39.26,7 +3423313973,50.81,5 +3423311210,53,5 +3423312365,43.65,8 +3423310519,58.17,5 +3423312221,58.88,5 +3423311152,49.9,6 +3423311661,49.94,3 +3423312045,63.02,3 +3423314279,41.73,4 +3423310960,58.95,2 +3423311776,52.33,6 +3423314272,49.25,7 +3423311841,71.63,6 +3423314155,53.34,8 +3423313233,40.49,5 +3423312422,35.32,2 +3423311760,41.45,3 +3423312136,61.34,5 +3423311536,43.15,8 +3423313924,42.32,5 +3423311426,30.44,4 +3423313340,56.59,6 +3423313147,45.05,4 +3423312846,47.22,7 +3423313693,34.87,4 +3423312341,47.16,4 +3423312355,44.75,3 +3423310539,41.62,9 +3423312493,47.67,4 +3423312500,51.09,7 +3423311123,54.14,8 +3423310649,74.51,5 +3423311787,51.47,7 +3423312129,29.96,6 +3423312606,35.52,6 +3423311791,51.63,7 +3423313720,31.73,8 +3423312983,65.39,5 +3423311687,41.39,10 +3423311342,43.95,5 +3423312082,54.48,8 +3423312805,58.12,8 +3423311364,53.04,4 +3423313425,43.05,4 +3423314244,56.28,7 +3423312726,42.21,8 +3423311147,35.47,3 +3423311950,42.97,8 +3423313842,70.3,2 +3423310829,41.82,3 +3423312558,63.43,10 +3423313345,45.45,3 +3423311317,56.25,6 +3423313600,41.64,1 +3423312867,49.85,5 +3423310709,44.93,5 +3423310638,45.35,6 +3423311570,66.26,3 +3423311522,56.11,1 +3423311157,40.37,4 +3423311167,27.14,4 +3423313338,46.89,6 +3423311302,52.27,1 +3423314078,40.92,6 +3423312393,58.37,2 +3423311795,32.08,4 +3423313503,50.93,6 +3423313718,67.87,2 +3423310644,59.36,7 +3423313081,47.05,5 +3423312390,59.87,4 +3423313634,32.52,10 +3423313095,43.65,2 +3423312128,49.98,4 +3423313872,61.27,6 +3423313970,40.5,4 +3423312505,62.04,5 +3423311573,50.56,3 +3423313047,31.4,7 
+3423311449,41.91,6 +3423313360,38.34,4 +3423314182,42.44,3 +3423313088,45.94,6 +3423311907,60.7,3 +3423311827,55.02,3 +3423314156,50.91,5 +3423312565,54.92,2 +3423312727,65.84,6 +3423311076,55.84,4 +3423313400,51.61,2 +3423312680,45.94,3 +3423314410,31.39,5 +3423311707,41.02,7 +3423312716,34.61,6 +3423313933,34,6 +3423312966,34.34,0 +3423314227,53.94,4 +3423311493,53.75,3 +3423313311,69.02,5 +3423311900,60.75,5 +3423312725,50.41,3 +3423314005,49.24,5 +3423313315,65.23,5 +3423314133,44.8,7 +3423312603,48.77,6 +3423313859,48.43,3 +3423312746,56.71,8 +3423311454,53.06,5 +3423313397,26.95,8 +3423310852,46.54,4 +3423313782,43.23,4 +3423314404,59.07,1 +3423311227,35.88,4 +3423311946,49.41,7 +3423314305,45,7 +3423312866,58.96,6 +3423311359,60.01,6 +3423311040,43.02,7 +3423311636,43.2,7 +3423312657,62.91,7 +3423313359,39.21,6 +3423312785,40.92,7 +3423312048,41.31,5 +3423313289,69.15,3 +3423311080,69.44,8 +3423313508,48.14,3 +3423312240,53.95,3 +3423314213,62.97,2 +3423313344,74.19,9 +3423314445,51.73,3 +3423312234,28.32,3 +3423310456,46.75,4 +3423313685,47.98,8 +3423312561,43.98,4 +3423313255,41.47,6 +3423312462,77.94,8 +3423311607,53.72,6 +3423312051,60.71,6 +3423312151,52.3,4 +3423310728,56.09,6 +3423312671,25.69,8 +3423313623,45.3,5 +3423310591,58.88,6 +3423311831,59.43,6 +3423311712,57.76,5 +3423310997,47.29,3 +3423314398,28.69,6 +3423313570,42.88,5 +3423311505,57.96,2 +3423312041,46.72,3 +3423312861,60.59,8 +3423313236,36.03,6 +3423312183,64.94,1 +3423312272,53.9,4 +3423311993,49.08,5 +3423312439,38.58,3 +3423313108,38.13,4 +3423311345,46.65,3 +3423313848,46.88,6 +3423310891,49.9,4 +3423310668,48.73,8 +3423311729,31.88,6 +3423310632,62.58,5 +3423312593,61.6,3 +3423312180,52.03,4 +3423311430,28.98,5 +3423311162,64.43,3 +3423312865,48.26,3 +3423313661,53.25,3 +3423311236,45.32,4 +3423312544,49.93,11 +3423312783,50.28,6 +3423313038,48.58,8 +3423310919,67.82,6 +3423313516,59.91,6 +3423310672,64.55,7 +3423313438,58.33,3 +3423312302,32.88,6 +3423311910,57.98,6 
+3423312970,52.67,6 +3423311195,51.51,5 +3423310817,54.46,3 +3423312796,69.63,5 +3423310827,49.92,2 +3423314053,48.66,6 +3423311190,38.6,2 +3423314172,39.93,5 +3423312713,66.74,6 +3423310656,40.44,6 +3423313652,52.86,5 +3423314050,39.75,8 +3423310968,61.33,1 +3423312251,57.2,2 +3423310911,65.91,3 +3423313169,40,9 +3423314198,49.98,4 +3423311110,66,3 +3423312748,52.84,4 +3423312598,59.6,5 +3423310844,48.82,6 +3423311027,41.85,6 +3423310791,53.52,9 +3423311752,47.65,6 +3423313901,56.01,6 +3423310722,46.18,1 +3423313007,64.8,8 +3423310973,69.64,8 +3423313677,56.23,2 +3423314288,34.12,4 +3423311478,61.8,1 +3423310702,31.13,4 +3423312582,73.18,8 +3423313997,55.61,6 +3423311241,51.84,2 +3423312443,59.76,4 +3423313696,54.84,7 +3423312357,36.31,6 +3423313719,29.94,4 +3423312621,49.01,4 +3423311194,47.93,5 +3423313162,54.86,4 +3423310719,34.3,8 +3423313189,53.1,4 +3423310583,41.62,4 +3423314070,67.55,9 +3423311294,60.29,6 +3423313566,60.44,3 +3423314134,30.15,6 +3423313146,48.98,7 +3423312206,45.56,4 +3423313090,46.39,7 +3423312712,54.84,3 +3423311360,45.67,7 +3423313454,47,5 +3423314201,40.99,7 +3423312689,50.64,5 +3423311362,53.24,3 +3423313617,61.45,9 +3423313059,48.35,3 +3423314229,33.51,4 +3423313380,47.64,5 +3423313184,31.37,5 +3423313906,41.53,5 +3423314323,56.07,6 +3423311340,54.43,4 +3423313281,65.17,4 +3423311464,50.66,6 +3423311746,56.41,7 +3423311836,49.76,5 +3423311867,44,4 +3423312419,55.67,2 +3423310914,53.45,5 +3423312944,48.64,6 +3423311143,39.16,5 +3423313835,43.46,6 +3423313000,48.79,5 +3423310732,51.7,5 +3423313427,34.76,5 +3423311884,41.93,6 +3423310508,64.32,4 +3423313076,45.64,6 +3423312446,49.3,5 +3423313033,41.5,4 +3423313115,54.82,5 +3423310902,48.47,4 +3423311492,43.07,1 +3423313113,64.36,6 +3423311034,66.9,5 +3423313372,60.9,6 +3423311132,42.6,5 +3423312037,60.85,5 +3423312792,66.57,2 +3423314236,53.67,6 +3423311113,65.02,11 +3423313097,43.18,4 +3423313253,52,3 +3423312643,54.1,4 +3423313748,47.57,6 +3423311682,49.57,5 +3423312749,44.84,7 
+3423312779,55.73,1 +3423310921,37.3,5 +3423312218,55.44,5 +3423314104,47.04,4 +3423314171,51.36,9 +3423313808,70.59,2 +3423311226,56.02,6 +3423312874,46.99,7 +3423314115,53.64,3 +3423310907,39.17,4 +3423314369,39.19,8 +3423314193,52.8,7 +3423311508,55.62,3 +3423311553,43.95,3 +3423312105,39.31,6 +3423310990,42.46,5 +3423313362,47.59,4 +3423312085,40.71,5 +3423310771,31.62,2 +3423311606,31.14,9 +3423314047,39.21,6 +3423311526,50.7,7 +3423311339,47.91,4 +3423311138,54.47,4 +3423313529,47.52,2 +3423312934,65.2,7 +3423311657,59.48,4 +3423312789,62.58,5 +3423313511,50.47,6 +3423314083,53.41,3 +3423314093,83.42,5 +3423311116,58.78,3 +3423313290,41.65,7 +3423313023,53,3 +3423313826,56.12,0 +3423313172,37.54,5 +3423314027,46.08,7 +3423311961,34.55,7 +3423314295,46.45,4 +3423312811,66.17,6 +3423312869,44.1,2 +3423311860,56.24,5 +3423311652,53.48,5 +3423313085,70.32,4 +3423311181,46.94,6 +3423313899,36.98,6 +3423311793,49.3,3 +3423311035,27.04,6 +3423311048,45.28,6 +3423313244,50.03,2 +3423311876,68.13,2 +3423310986,26.78,2 +3423311305,58.24,5 +3423314062,46.98,6 +3423312616,34.04,9 +3423310808,53.49,4 +3423311991,44.63,2 +3423312238,51.27,6 +3423311460,62.51,4 +3423313642,42.25,5 +3423313616,39.53,6 +3423311957,46.59,4 +3423311767,27.8,6 +3423311255,52.12,7 +3423310601,49.26,7 +3423313447,55.56,5 +3423310756,40.53,4 +3423312294,53.96,1 +3423311691,44.44,7 +3423313755,61.23,3 +3423312095,32.67,3 +3423313916,36.99,3 +3423313312,49.41,4 +3423313286,57.89,6 +3423311291,50.86,5 +3423312353,57.64,8 +3423310943,51.43,5 +3423311213,37.73,3 +3423314183,58.56,3 +3423312428,45.26,4 +3423314399,37.1,6 +3423310952,44.41,3 +3423312511,50.62,7 +3423311701,55.14,1 +3423314222,39.01,5 +3423313931,52.11,7 +3423312522,64.48,5 +3423313074,48.69,6 +3423310691,57.18,5 +3423312520,59.04,6 +3423312589,48.86,7 +3423313528,59.12,7 +3423312414,45.86,7 +3423311161,60.78,0 +3423312541,56.91,5 +3423312451,34.95,0 +3423312140,47.84,4 +3423312275,48.14,2 +3423312694,44.89,1 +3423313536,65.76,9 
+3423312372,41.39,8 +3423311480,66.81,5 +3423313234,58.92,3 +3423313789,46.72,7 +3423312256,44.12,2 +3423312650,53.59,2 +3423311170,60.9,5 +3423312063,38.06,7 +3423311251,64.21,7 +3423312194,56.61,8 +3423310522,48.03,5 +3423312989,48.63,4 +3423312280,46.73,5 +3423312562,51.97,5 +3423312775,60.65,5 +3423313125,54.41,6 +3423311078,65.88,5 +3423313324,38.04,4 +3423311932,26.27,4 +3423313665,60.05,6 +3423313858,50.65,4 +3423312329,50.07,4 +3423311572,44.68,1 +3423310841,69.88,7 +3423313411,48.53,6 +3423312731,61.06,8 +3423313069,54.4,4 +3423312252,65.32,3 +3423313892,48.91,2 +3423311141,47.33,3 +3423313636,49.7,3 +3423312436,35.97,7 +3423314216,55.46,5 +3423312411,49.48,4 +3423310858,45.89,8 +3423311387,45.69,3 +3423312923,70.45,6 +3423312868,58.45,6 +3423311514,65.42,6 +3423311391,38.21,5 +3423314130,42.86,4 +3423312116,44.62,2 +3423311925,58.27,6 +3423312169,34.84,8 +3423314009,56.96,7 +3423312134,58.07,11 +3423311715,51.52,8 +3423313283,36.76,7 +3423310449,51.86,4 +3423313955,57.16,4 +3423313602,48.67,3 +3423310925,59,4 +3423311068,54.36,2 +3423313609,41.52,4 +3423311370,46.37,5 +3423310457,51.91,6 +3423310579,65.6,6 +3423310762,61.92,4 +3423313963,40.24,5 +3423311063,65.11,5 +3423311367,26.3,7 +3423310884,41.87,6 +3423311491,51.6,6 +3423310540,43.69,4 +3423311528,63.9,7 +3423314251,72.04,3 +3423313742,50.67,4 +3423312377,57.78,1 +3423313572,69.93,3 +3423312948,56.26,6 +3423311175,44.91,7 +3423313136,52.01,7 +3423311549,41.83,5 +3423310882,49.27,5 +3423312478,40.09,5 +3423314433,75.09,5 +3423310775,44.64,4 +3423313522,41.63,6 +3423314368,54.41,5 +3423311794,27.46,5 +3423312936,52.21,5 +3423311873,74.91,4 +3423314044,25.48,5 +3423314341,52.75,4 +3423312000,45.21,7 +3423310904,49.97,6 +3423311756,55.33,5 +3423314137,30.34,6 +3423314196,53.01,4 +3423314309,58.27,6 +3423311954,41.23,6 +3423311196,46.1,4 +3423313070,60.1,5 +3423311037,51.02,7 +3423313517,52.59,4 +3423312038,43.26,2 +3423310940,50.62,8 +3423312137,46.51,8 +3423310462,49.64,5 +3423313393,50.67,4 
+3423312664,65.59,9 +3423311813,45.58,7 +3423313825,55.72,5 +3423311487,44.47,4 +3423314267,55.88,5 +3423311658,47.4,6 +3423313904,64.15,8 +3423312881,61.37,1 +3423311315,35.09,4 +3423314332,47.59,5 +3423310957,53.96,4 +3423311176,63.61,5 +3423312004,45.05,4 +3423313980,43.89,5 +3423313080,57.82,7 +3423311518,50.96,3 +3423311870,42.94,5 +3423312955,67.08,1 +3423313728,49.88,6 +3423314087,37.35,5 +3423313845,53.29,6 +3423314292,37.04,5 +3423311515,50.28,4 +3423312613,50.96,6 +3423311229,46.38,6 +3423311990,34.79,4 +3423312925,38.92,7 +3423311579,46.54,4 +3423314141,62.24,10 +3423313295,50.96,1 +3423313846,53.23,6 +3423313429,53.89,2 +3423314277,74.41,5 +3423311280,50.85,5 +3423312361,62.67,1 +3423311092,36.33,8 +3423314246,50.48,8 +3423312053,43.32,4 +3423314266,50.78,2 +3423312440,49.62,4 +3423310993,52.6,4 +3423314132,45.29,9 +3423314415,64.98,3 +3423314311,41.82,4 +3423310924,55.3,4 +3423311088,52.57,7 +3423313068,51.53,2 +3423311169,46.71,4 +3423313885,52.3,3 +3423314374,43.69,6 +3423311297,29.7,6 +3423313705,53.59,5 +3423313809,48.51,8 +3423314136,29.01,3 +3423314417,56.14,9 +3423312179,56.83,5 +3423314188,59.93,7 +3423314178,52.72,4 +3423311307,59.63,3 +3423314261,32.63,4 +3423313483,26.62,5 +3423312430,66.95,5 +3423312782,35.24,2 +3423313218,66.96,4 +3423312702,54.87,6 +3423310956,70.04,1 +3423311083,45.11,6 +3423310518,44.77,2 +3423310905,75.68,7 +3423312261,53.45,5 +3423313171,55.01,6 +3423311632,34.24,4 +3423310873,52.42,5 +3423314048,37,5 +3423311100,49.14,6 +3423311025,49.35,6 +3423312389,32.54,8 +3423313964,52.15,4 +3423310640,47.49,5 +3423310761,54.5,5 +3423312548,31.58,6 +3423314180,59.91,7 +3423313595,34.02,6 +3423311119,57.14,2 +3423312513,38.27,8 +3423314226,33.56,7 +3423311012,57.74,4 +3423310563,51.29,5 +3423311329,51.87,4 +3423312580,41.31,5 +3423313650,16.39,8 +3423314007,56.55,5 +3423310953,41,6 +3423312901,31.72,2 +3423313977,63.75,4 +3423311392,67.15,7 +3423313881,33.66,8 +3423313248,40.32,6 +3423312295,52.41,3 +3423310792,56.56,4 
+3423313040,50.32,5 +3423312464,50.72,5 +3423311097,25.17,6 +3423313550,45.32,6 +3423312581,45.15,7 +3423312006,38.63,4 +3423313078,63.43,4 +3423311049,32.45,6 +3423312141,34.68,7 +3423314077,49.89,2 +3423312204,66.63,9 +3423313333,34.11,5 +3423312425,56.86,4 +3423310862,46.14,5 +3423312326,69.49,1 +3423311101,67.46,4 +3423313455,37.83,7 +3423310822,52.81,5 +3423311639,35.42,3 +3423313073,48.84,4 +3423313491,58.43,2 +3423314363,37.76,5 +3423313969,29.88,4 +3423313377,49.48,2 +3423312551,36.84,5 +3423310452,53.5,5 +3423312876,48.95,2 +3423311926,51.91,6 +3423311804,72.14,6 +3423311436,51.91,6 +3423312585,32.58,6 +3423310499,52.68,3 +3423310654,58.94,5 +3423313695,63.95,3 +3423313863,36.88,5 +3423310840,57.25,3 +3423313993,44.51,6 +3423312512,58.4,7 +3423314411,38.4,4 +3423311477,44.95,5 +3423312121,47.87,4 +3423314013,35.83,6 +3423310842,24.58,4 +3423312202,48.71,5 +3423310922,56.67,3 +3423312570,48.72,6 +3423311864,47.6,1 +3423313191,55.05,6 +3423311347,62.81,6 +3423314139,50.86,4 +3423311111,51.63,4 +3423313656,27.83,5 +3423314394,50.4,7 +3423310708,53.69,5 +3423313684,48.17,4 +3423314441,38.25,6 +3423313660,54.66,3 +3423314079,42.41,7 +3423314428,52.04,1 +3423313659,67.41,4 +3423313280,50.71,7 +3423310642,48.45,6 +3423311811,58.03,4 +3423311758,64.12,4 +3423312369,41.6,7 +3423310815,42.32,6 +3423313811,69.72,4 +3423310835,60.39,7 +3423311937,57.86,6 +3423311908,58.88,6 +3423312225,54.94,7 +3423312019,48.64,4 +3423310948,69.66,4 +3423311745,55.4,9 +3423313192,42.27,4 +3423313093,58.67,1 +3423310564,58.64,1 +3423310469,49.85,8 +3423312930,37.14,2 +3423314313,33.95,5 +3423312153,40.72,6 +3423311890,56.34,6 +3423313118,72.41,7 +3423310577,48.52,7 +3423312288,68.6,4 +3423313065,61.86,7 +3423312040,52.59,5 +3423313257,50.94,6 +3423313982,58.56,7 +3423311266,62.48,7 +3423311945,27.48,5 +3423312627,37.26,1 +3423313292,70.3,3 +3423312476,42.54,5 +3423312752,54.23,5 +3423313309,52.43,11 +3423312973,54.87,4 +3423313913,33.42,4 +3423314098,67.51,7 +3423312506,52.69,1 
+3423311024,55.16,7 +3423313055,52.11,4 +3423312552,54.27,6 +3423310972,56.07,5 +3423312403,46.64,5 +3423310984,39.64,4 +3423314113,43.58,7 +3423311407,54.64,5 +3423311363,39.87,4 +3423310679,52.7,7 +3423312609,45.53,4 +3423311933,50.34,6 +3423310619,53.39,4 +3423310741,44.5,8 +3423312588,62.6,6 +3423312347,53.76,4 +3423311279,54.1,7 +3423312993,56.2,6 +3423312809,49.33,7 +3423312447,72.9,7 +3423313414,45.83,2 +3423313351,57.25,0 +3423312667,66.67,5 +3423312781,34.86,3 +3423310724,51.34,6 +3423312344,49.93,5 +3423314059,49.43,7 +3423313678,49.56,4 +3423313585,64.63,8 +3423313820,68.18,5 +3423312875,56.45,6 +3423310733,35.82,5 +3423311979,51.35,6 +3423310861,48.34,5 +3423313382,45.94,6 +3423314116,42.37,5 +3423310669,60.81,6 +3423311288,36.82,5 +3423310558,47.77,6 +3423313766,59.02,3 +3423314090,60.99,4 +3423312521,63.68,4 +3423312768,40.54,5 +3423311065,47.81,6 +3423310872,49.72,4 +3423310559,58.11,7 +3423310895,60.73,5 +3423312108,47.39,7 +3423311321,63.15,0 +3423312101,71.55,4 +3423314264,55.78,7 +3423313680,44.77,3 +3423313368,36.18,5 +3423314012,55.18,5 +3423312646,67.74,1 +3423312434,59.16,8 +3423312308,55.51,8 +3423312343,40.01,1 +3423310574,49.71,6 +3423313410,56.49,4 +3423313697,55.05,7 +3423311366,57.91,7 +3423312181,50.49,3 +3423313561,56.18,6 +3423312935,51.35,2 +3423311117,48.29,5 +3423311519,30.16,5 +3423312906,57.01,7 +3423313869,42.9,6 +3423310580,64.85,6 +3423314347,61.21,7 +3423310463,31.6,5 +3423313348,43.61,3 +3423312586,35.6,1 +3423312109,55.47,9 +3423313470,37.15,7 +3423313579,48.09,5 +3423310544,49.65,11 +3423311672,31.25,4 +3423313717,54,6 +3423313905,47.68,4 +3423310712,48.19,6 +3423314200,37.64,7 +3423311614,63.66,1 +3423311948,34.94,5 +3423313468,34.41,4 +3423314258,47.46,2 +3423312016,60.78,6 +3423314152,55.58,4 +3423311376,44.49,4 +3423313879,36.07,5 +3423312873,49.26,7 +3423313591,64.9,6 +3423313925,53.25,3 +3423311358,54.25,7 +3423313279,48.47,3 +3423313539,50.56,5 +3423311439,45.67,7 +3423313251,59.67,3 +3423311629,47.77,5 
+3423311319,51.2,9 +3423311648,52.57,5 +3423313967,56.52,7 +3423313747,63.71,6 +3423312837,63.72,8 +3423313422,65.08,3 +3423312668,48.42,5 +3423313027,41.08,2 +3423312933,46.2,5 +3423312686,47.74,7 +3423312177,50.78,5 +3423313945,62.3,5 +3423310497,38.42,2 +3423310900,51.55,6 +3423312008,41.31,6 +3423311695,57.2,5 +3423311676,51.97,7 +3423314220,34.74,4 +3423310969,62.81,4 +3423311924,55.97,4 +3423311819,54.99,6 +3423312495,58.13,4 +3423312947,66.86,6 +3423310832,45.03,11 +3423310681,53.54,7 +3423311287,23.2,8 +3423313765,64.13,6 +3423313082,53.11,3 +3423311284,62.58,8 +3423311944,57.68,2 +3423314317,29.13,5 +3423311232,58.83,5 +3423313150,44.83,7 +3423313804,40.46,7 +3423310941,58.92,4 +3423311559,56.04,6 +3423312459,41.14,6 +3423312152,48.35,7 +3423313341,56.17,1 +3423313261,47.57,5 +3423312035,44.22,5 +3423311649,52.85,4 +3423313453,41.15,7 +3423311404,47.82,4 +3423310496,66.89,3 +3423311440,43.25,7 +3423313216,74.47,5 +3423312409,47.43,2 +3423313017,49.34,3 +3423311262,56.37,3 +3423313357,50.64,4 +3423314380,60.36,5 +3423313580,45.74,5 +3423313823,60.32,5 +3423313026,59.08,5 +3423312754,40.87,5 +3423311269,63.55,1 +3423312622,54.65,4 +3423311824,53.47,4 +3423314031,41.7,5 +3423312334,55.76,7 +3423311722,53.33,5 +3423313798,49.58,4 +3423312524,40.9,5 +3423311289,54.32,5 +3423311883,52.11,11 +3423312961,54.34,3 +3423312950,41.75,5 +3423310989,49.19,5 +3423313442,46.8,6 +3423313958,30.97,8 +3423310903,42.42,4 +3423313204,30.31,6 +3423313736,32.95,10 +3423313987,58.13,6 +3423312043,64.32,3 +3423310608,39.93,4 +3423311018,40.63,3 +3423313240,48.81,6 +3423311952,40.34,2 +3423314340,54.29,9 +3423311684,47.92,7 +3423311762,55.11,6 +3423312277,58.93,6 +3423311588,34.04,6 +3423311295,50.3,8 +3423312892,51.38,4 +3423311424,42.02,1 +3423311919,39.33,6 +3423310879,49.76,7 +3423311679,46.96,1 +3423313557,58.61,5 +3423310998,32.53,4 +3423313176,38.94,8 +3423312888,46.81,9 +3423310701,50.56,5 +3423313016,37.57,6 +3423312117,49.86,5 +3423312293,53.57,4 +3423313861,35.15,5 
+3423314035,36.91,6 +3423310954,60.04,5 +3423311733,41.25,3 +3423310584,49.53,7 +3423310511,44.54,7 +3423310876,53.61,3 +3423312771,62.81,7 +3423313743,49.22,8 +3423311130,52.34,7 +3423313183,55.11,3 +3423310571,50.03,9 +3423312518,54.39,3 +3423312761,47.56,5 +3423314346,39.39,8 +3423313451,46.85,7 +3423313552,37.68,7 +3423312998,50.56,5 +3423314190,179.22,95 +3423314144,192.34,69 +3423314442,140.25,92 +3423313001,184.28,70 +3423311047,200.58,50 +3423312780,197.99,95 +3423313932,199.81,32 +3423310689,168.45,70 +3423312919,197.87,74 +3423314356,172.08,87 +3423310586,164.31,45 +3423313042,162.56,56 +3423312098,165.2,74 +3423313466,211.36,5 +3423310545,157.51,59 +3423311847,208.47,26 +3423311620,180.95,85 +3423312577,167.83,70 +3423313198,144.23,49 +3423312363,163.82,79 +3423311045,186.28,4 +3423310933,182.21,72 +3423313498,149.08,78 +3423312553,171.83,50 +3423313412,212.8,51 +3423311630,185.3,49 +3423313019,170.11,49 +3423312418,175.82,80 +3423311974,234.52,88 +3423312470,164.03,86 +3423312832,200.77,79 +3423313399,168.47,56 +3423314334,184.79,89 +3423311956,158.61,71 +3423314293,192.11,68 +3423313331,176.49,82 +3423312723,190.97,76 +3423314086,175.93,53 +3423311402,157.99,44 +3423313142,197.56,57 +3423313018,140.32,56 +3423313048,162.56,23 +3423313646,181.12,58 +3423312691,197.26,76 +3423314248,156.85,58 +3423311901,164.13,99 +3423311468,187.37,86 +3423314100,174.32,28 +3423311601,145.21,96 +3423313433,178.05,29 +3423313784,183.7,74 +3423312054,121.07,57 +3423313479,171.16,51 +3423313821,148,66 +3423314285,162.93,75 +3423313505,163.17,91 +3423310597,184.96,65 +3423313168,178.28,71 +3423312770,190.13,52 +3423310962,167.55,70 +3423313779,176.01,70 +3423312312,157.02,21 +3423313902,173.72,84 +3423311160,224.47,52 +3423310942,219.46,75 +3423310464,156.79,46 +3423312665,193.01,43 +3423311784,191.2,70 +3423311425,183.98,61 +3423312435,181.04,58 +3423314343,192.37,99 +3423311627,185.03,41 +3423310923,202.94,80 +3423313428,178.47,99 +3423312913,213.93,47 
+3423311790,188.36,63 +3423310671,164.08,71 +3423311382,173.91,75 +3423310860,175.29,34 +3423313827,208.65,45 +3423310643,212.49,25 +3423310938,183.78,56 +3423312233,162.3,74 +3423312899,164.83,53 +3423311897,172.53,76 +3423314328,179.26,79 +3423312184,187.12,99 +3423314358,191.47,93 +3423312173,202.02,38 +3423314384,188.06,75 +3423312303,201.54,90 +3423314396,165.23,57 +3423311005,183.92,29 +3423313215,166.22,77 +3423313020,167.75,41 +3423311010,219.54,39 +3423312601,209.43,64 +3423313496,145.28,91 +3423312088,185.5,67 +3423311039,168.77,55 +3423312649,174.41,36 +3423313853,169.45,67 +3423311723,159.2,46 +3423314000,184.76,56 +3423311845,228.21,61 +3423314017,151.81,67 +3423311612,158.47,100 +3423312623,186.03,65 +3423312230,169.85,99 +3423313271,154.52,51 +3423310898,198.3,99 +3423311917,149.71,78 +3423313436,182.53,99 +3423313456,204.62,38 +3423313513,224.94,27 +3423313871,177.2,87 +3423310955,182.4,83 +3423314016,189.88,97 +3423310913,214.17,73 +3423311064,159.8,58 +3423313485,150.8,8 +3423312898,149.55,7 +3423314299,154.72,19 +3423311602,188.1,15 +3423313449,151.59,4 +3423312138,209.27,11 +3423311828,184.26,7 +3423314409,184.77,15 +3423311384,168.27,14 +3423311995,178.52,17 +3423312445,190.16,12 +3423312991,176.3,18 +3423312690,195.01,10 +3423310934,176.1,5 +3423313224,186.4,19 +3423310543,158.63,13 +3423312986,190.1,6 +3423312380,198.1,3 +3423311316,165.93,1 +3423311580,162.35,13 +3423311849,203.98,11 +3423313137,177.21,16 +3423310850,192.25,12 +3423312975,164.47,7 +3423312375,210.88,12 +3423311623,171.76,12 +3423312075,180.06,8 +3423311965,166.36,1 +3423312879,203.85,12 +3423311357,200.37,1 +3423311663,149.51,10 +3423313194,180.26,15 +3423312653,203.91,12 +3423313245,203.13,11 +3423310515,139.35,6 +3423311545,158.88,20 +3423312131,194.37,15 +3423312855,170.38,16 +3423310635,179.61,15 +3423311783,221.45,3 +3423313813,181.05,4 +3423311486,179.97,0 +3423311200,192.78,7 +3423310988,174.71,9 +3423314148,168.64,5 +3423311888,156.17,17 +3423312645,186.51,15 
+3423314091,173.21,14 +3423313153,166.02,12 +3423312021,159.14,8 +3423312378,207.13,13 +3423310621,183.67,14 +3423310705,171.29,14 +3423312030,200.64,10 +3423311252,179.25,12 +3423313583,177.43,10 +3423311314,180.8,9 +3423312909,166.45,17 +3423311094,156.41,10 +3423310951,172.5,5 +3423311093,158.02,8 +3423312298,179.76,8 +3423310982,163.26,13 +3423313461,177.75,12 +3423313500,184.44,5 +3423313141,187.98,15 +3423311960,146.8,18 +3423311582,196.9,1 +3423310774,171.24,1 +3423314424,220.23,2 +3423311718,200.41,7 +3423311882,207.52,10 +3423311826,183.1,14 +3423311708,175.04,10 +3423313928,173.26,19 +3423313714,159.7,11 +3423311235,175.37,18 +3423312857,198.44,3 +3423313231,168.76,5 +3423311179,165.87,3 +3423313268,145.73,9 +3423312291,179.68,0 +3423313761,184.94,11 +3423311325,227.52,7 +3423311144,149.36,9 +3423313981,182.32,15 +3423312338,200.6,18 +3423314124,183.21,13 +3423313613,195.43,9 +3423311223,185.17,4 +3423312352,179.33,5 +3423311861,196.17,12 +3423312333,183.14,9 +3423311557,156.71,17 +3423312682,174.79,10 +3423313256,174.48,4 +3423312469,220.39,7 +3423310987,189.07,4 +3423313628,152.64,16 +3423313622,185.33,8 +3423313421,176.88,7 +3423312307,171.65,15 +3423310727,166.33,8 +3423313664,163.94,0 +3423313221,192.77,14 +3423311531,206.13,9 +3423312066,173.2,16 +3423311751,171.41,12 +3423312900,202.04,7 +3423310839,168.84,7 +3423312453,178.48,7 +3423312420,188.51,13 +3423312647,187.13,19 +3423311044,192.89,15 +3423314307,163.5,9 +3423313669,174.78,14 +3423311854,182.67,9 +3423314163,193.88,9 +3423313507,147.55,12 +3423310917,182.63,12 +3423313276,169.78,15 +3423310809,225.68,2 +3423310901,206.08,15 +3423312410,154.75,13 +3423311385,165.07,8 +3423312530,197.07,14 +3423312964,183.97,11 +3423314114,165.07,4 +3423310995,213.82,7 +3423313754,155.84,13 +3423312084,188.57,9 +3423314119,199.02,9 +3423312984,189.61,16 +3423311962,178.34,9 +3423313988,187.46,7 +3423313877,193.3,7 +3423310707,169.02,4 +3423311963,165.71,6 +3423312960,169.48,1 +3423310864,174.91,12 
+3423313008,171.99,13 +3423312135,179.52,8 +3423312482,172.02,14 +3423313471,161.32,9 +3423312139,164.26,11 +3423310683,206.47,21 +3423313504,158.71,9 +3423311936,207.08,4 +3423310748,177.63,7 +3423313439,193.4,12 +3423312336,190.06,19 +3423311603,171.15,5 +3423314074,198.13,16 +3423311209,160.23,4 +3423313151,167.46,14 +3423310479,195.66,4 +3423312676,175.16,13 +3423312405,194.29,10 +3423311286,188.86,6 +3423313856,182.61,10 +3423313986,180.45,4 +3423310487,191.98,10 +3423313554,184.19,14 +3423312758,211.8,9 +3423310992,199.29,21 +3423313961,201.79,1 +3423312535,187.37,14 +3423311326,215.94,8 +3423311327,182.43,9 +3423313781,210.69,2 +3423311485,212.54,13 +3423311484,205.43,7 +3423311748,185.52,16 +3423313401,188.19,3 +3423313934,179.74,11 +3423313398,217.21,8 +3423311308,170.23,13 +3423313625,180.18,11 +3423310532,191.03,1 +3423312201,199.33,9 +3423313682,206.15,14 +3423312247,170.85,16 +3423310798,137.67,4 +3423311651,197.27,14 +3423312210,148.83,1 +3423312441,181.67,8 +3423312442,166.1,18 +3423311717,185.73,3 +3423314260,184.66,8 +3423314351,175.8,7 +3423314065,189.2,4 +3423313764,205.78,9 +3423312766,156.87,16 +3423314249,185.54,8 +3423311475,186.14,11 +3423311765,183.3,16 +3423310967,190.39,13 +3423310568,166.15,11 +3423314318,162.71,10 +3423314102,189.62,8 +3423311183,212.42,7 +3423311592,186.3,9 +3423310480,152.55,17 +3423312831,174.29,10 +3423313683,146.35,16 +3423311445,134.81,20 +3423314033,191.22,8 +3423311706,187.7,17 +3423312681,219.29,17 +3423313481,206.59,10 +3423314259,218.79,8 +3423311033,171.7,14 +3423310857,168.33,10 +3423313712,160.71,11 +3423312396,217.76,4 +3423312076,140.49,7 +3423311955,170.73,11 +3423313855,196.13,17 +3423311409,170.58,22 +3423310455,160.52,19 +3423312939,181.77,18 +3423310569,195.7,12 +3423311156,159.86,9 +3423312526,162.96,22 +3423314126,177.53,4 +3423313057,179.4,2 +3423314443,156.38,6 +3423314377,196.12,8 +3423311736,204.96,7 +3423313390,184.1,19 +3423314290,186.88,9 +3423311216,193.86,6 +3423313144,171.51,8 
+3423313888,169.25,9 +3423313495,155.86,6 +3423314001,176.54,3 +3423313568,168.36,13 +3423311964,149.91,5 +3423310795,189.38,6 +3423313170,194.15,12 +3423312905,186.47,10 +3423313796,185.5,13 +3423314314,186.69,15 +3423312304,179.23,6 +3423313586,158.05,8 +3423312607,189.13,6 +3423313005,181.82,6 +3423314041,150.31,14 +3423311054,177.95,8 +3423313836,160.49,6 +3423311228,184.62,6 +3423312917,197.02,8 +3423312878,184.32,11 +3423310966,195.47,14 +3423313957,191.27,8 +3423313526,208.19,17 +3423310659,178.37,6 +3423311893,169.24,4 +3423310826,166.62,14 +3423313626,189.78,17 +3423314142,145.39,15 +3423313998,164.47,10 +3423314344,163.39,6 +3423310660,159.11,15 +3423312457,169.56,17 +3423313984,188.25,18 +3423311547,184.12,7 +3423312987,199.37,7 +3423310600,175.81,8 +3423312222,175.06,9 +3423310650,190.85,9 +3423310556,188.89,1 +3423310592,192.77,14 +3423312237,179.24,7 +3423311437,170.86,7 +3423312060,171.29,19 +3423310451,168.9,5 +3423311896,161.76,5 +3423313094,169.54,12 +3423312243,173.72,10 +3423311562,180.98,8 +3423313520,181.89,17 +3423314010,172.85,13 +3423313028,244.79,9 +3423311091,176.79,8 +3423313938,148.44,7 +3423312195,216.66,9 +3423313540,161.14,10 +3423311343,183.38,3 +3423313354,175.29,11 +3423311273,167.02,10 +3423312090,234.09,7 +3423312630,168.98,14 +3423312656,242.37,15 +3423313614,160.78,11 +3423313744,190.56,11 +3423311207,195.33,14 +3423312318,196.23,4 +3423313440,209.76,15 +3423311667,172.78,5 +3423313419,201.99,15 +3423311675,162.56,13 +3423310772,191.81,7 +3423312376,170.72,1 +3423311577,189.45,21 +3423312199,189.77,16 +3423313874,201.69,14 +3423312563,191.59,12 +3423312125,154.93,14 +3423310468,149.19,16 +3423311927,165.07,12 +3423313367,156.57,7 +3423310782,162.34,9 +3423313563,177.82,12 +3423311895,205.07,15 +3423310802,155.83,10 +3423311711,181.53,13 +3423311124,195.87,3 +3423314160,170.3,6 +3423314439,167.89,15 +3423313394,187.17,7 +3423313043,153.81,18 +3423311524,164.53,7 +3423310488,174.81,13 +3423310976,152.1,10 +3423310636,157.26,18 
+3423313985,203.03,10 +3423312253,172.78,8 +3423310561,164.95,11 +3423312228,182.99,5 +3423311248,170.9,0 +3423314330,150.57,15 +3423313249,187.45,4 +3423313241,135.98,14 +3423311306,187.55,5 +3423311567,241.71,22 +3423312971,168.04,12 +3423312695,156.01,14 +3423313691,189.75,9 +3423311458,178.42,3 +3423313707,211.26,6 +3423312027,171.7,12 +3423310828,145.07,6 +3423312721,194.55,6 +3423313893,157.26,10 +3423311743,153.34,9 +3423314407,193.78,6 +3423313437,200.72,20 +3423310658,193.21,14 +3423312224,176.23,16 +3423313615,172.11,12 +3423312314,193.31,8 +3423310688,174.3,12 +3423313653,164.4,10 +3423313284,162.68,5 +3423311218,173.55,3 +3423311168,172.15,8 +3423311418,202.84,17 +3423314177,197.95,10 +3423312767,167.21,5 +3423313391,136.52,13 +3423312310,199.92,0 +3423311221,152.09,24 +3423311476,170.87,15 +3423312902,137.16,16 +3423313237,164.38,8 +3423313946,212.41,12 +3423312327,163.36,15 +3423311151,158.31,4 +3423310819,162.26,14 +3423313220,211.99,6 +3423311013,160.26,20 +3423313039,228.96,13 +3423311264,182.69,11 +3423310715,162.8,11 +3423311136,193.58,16 +3423314138,161.26,10 +3423314447,201.12,19 +3423312953,181.35,13 +3423311631,168.64,9 +3423310916,175.58,9 +3423311560,160.2,9 +3423313109,192.22,6 +3423311296,197.6,10 +3423311224,189.82,7 +3423311770,205.8,13 +3423311438,193.33,12 +3423311479,132.03,15 +3423313791,190.33,14 +3423310834,173.9,11 +3423313114,179.11,6 +3423312157,209.64,7 +3423313638,161.9,15 +3423313700,166.62,9 +3423313308,161.64,10 +3423312812,162.9,10 +3423311186,178.88,10 +3423314147,171.97,12 +3423311041,179.56,17 +3423312058,188.59,16 +3423313581,151.09,13 +3423312594,194.87,8 +3423312394,200.21,5 +3423310742,196.39,6 +3423313643,193.05,11 +3423312587,183.61,6 +3423310517,166.8,8 +3423311204,190.45,14 +3423311797,184.29,7 +3423313629,197.44,15 +3423313704,170.72,7 +3423312342,168.56,5 +3423312438,168.22,9 +3423313883,165.22,16 +3423314342,143.37,10 +3423311159,224.24,16 +3423310500,173.49,8 +3423314197,190.04,16 +3423312100,178.39,11 
+3423313573,194.68,2 +3423313694,170.72,1 +3423313317,181.64,17 +3423312856,207.34,9 +3423312624,189.85,9 +3423313640,142.83,5 +3423311798,168.14,1 +3423313975,161.94,9 +3423314286,151.65,1 +3423310721,177.29,10 +3423311324,173.02,3 +3423312972,182.6,10 +3423311856,208.78,11 +3423311344,160.5,14 +3423311104,170.45,1 +3423312762,164.37,7 +3423312756,180.1,14 +3423313337,192.31,12 +3423312937,167.88,6 +3423310698,231.57,13 +3423310626,189.68,10 +3423311481,192.61,10 +3423313606,218.21,11 +3423311435,168.88,14 +3423312599,150,8 +3423311638,178.17,10 +3423311178,190.84,1 +3423312636,211.34,15 +3423312634,161.98,9 +3423311786,208.42,7 +3423312042,185.7,17 +3423314072,165.15,9 +3423310498,166.68,14 +3423314312,213.12,3 +3423312850,217.91,3 +3423314082,159.47,22 +3423310749,176.44,5 +3423311239,173.35,6 +3423311740,194.1,4 +3423312281,176.01,16 +3423310507,193.08,9 +3423313935,180.84,21 +3423313478,191.52,9 +3423312282,141.96,15 +3423313549,154.33,3 +3423312945,185.15,10 +3423311356,185.38,7 +3423313715,208.96,18 +3423312893,176.97,4 +3423312750,162.58,1 +3423310667,184.14,9 +3423311587,154.74,1 +3423310746,180.91,11 +3423311388,184.74,1 +3423314145,186.24,4 +3423313177,181.3,11 +3423311371,180.36,13 +3423311276,163.72,6 +3423314241,177.04,6 +3423313302,154.59,19 +3423313605,196.37,17 +3423312167,219.84,15 +3423313370,189.97,3 +3423311444,168.09,5 +3423312424,190.94,20 +3423313571,192.12,16 +3423312171,218.67,1 +3423312025,188.26,14 +3423310725,200.32,9 +3423313100,188.22,12 +3423310651,207.01,18 +3423310641,173.38,11 +3423310806,161.51,5 +3423313651,169.75,5 +3423310653,169.66,11 +3423314298,150.46,14 +3423313949,176.96,23 +3423310560,200.62,8 +3423312538,197.08,14 +3423313034,196.95,11 +3423314125,116.58,4 +3423314042,174.58,5 +3423311122,187.16,11 +3423313801,184.46,4 +3423314186,170.72,15 +3423312739,168.24,10 +3423313222,187.67,17 +3423314143,218.58,9 +3423312738,155.02,9 +3423313996,153.84,9 +3423313599,180.65,16 +3423312450,168.76,7 +3423312864,153.5,1 
+3423311401,159.53,15 +3423312501,207.44,13 +3423313320,229.3,15 +3423313914,180.6,13 +3423311915,165.09,4 +3423311600,205.4,11 +3423311911,171.19,10 +3423311432,188.24,3 +3423313774,189.25,7 +3423311673,160.94,13 +3423313021,185.58,3 +3423312227,171.79,6 +3423312212,179.7,9 +3423311441,193.68,8 +3423311471,195.95,4 +3423311677,169.72,6 +3423312345,207.05,13 +3423312969,205.62,11 +3423312999,166.64,11 +3423313974,168.34,9 +3423310525,185.5,13 +3423311428,191.31,7 +3423314165,140.59,3 +3423311986,188.61,9 +3423312504,167.46,11 +3423312273,186.26,4 +3423311042,188.34,18 +3423312283,157.73,12 +3423311250,175.69,16 +3423312862,134.54,10 +3423311215,190.28,10 +3423314030,168.01,3 +3423311541,188.49,11 +3423314040,164.43,12 +3423310896,186.42,14 +3423310760,201.97,17 +3423312556,201.38,1 +3423311678,196.34,9 +3423314395,171.64,16 +3423313759,183.96,6 +3423311862,192.31,8 +3423310965,180.63,12 +3423312062,194.13,11 +3423313123,175.76,5 +3423312220,163.89,8 +3423313788,164.29,12 +3423310484,170.9,15 +3423312994,166.87,15 +3423311427,164.77,6 +3423313767,184.45,12 +3423312678,191.82,7 +3423311311,183.61,14 +3423313594,171.42,4 +3423313154,186.55,8 +3423312104,200.37,6 +3423311806,166.72,9 +3423312907,192.38,10 +3423312023,151.88,4 +3423312366,196.56,1 +3423312102,172.43,11 +3423313847,169.64,9 +3423311616,177.33,1 +3423312005,156.76,15 +3423314064,165.76,5 +3423314348,176.43,11 +3423313133,159.11,16 +3423313556,186.67,15 +3423314209,169.32,10 +3423311496,177.78,11 +3423311007,180.73,1 +3423312313,202.18,1 +3423312159,159.31,10 +3423311666,166.8,10 +3423312328,177.09,7 +3423310718,170.6,3 +3423311372,190.14,13 +3423310757,201.05,9 +3423312666,174.91,14 +3423310793,168.01,15 +3423313060,152.99,6 +3423310699,145.17,15 +3423313445,174.44,9 +3423311503,181.52,14 +3423312162,185.27,9 +3423313270,176.37,11 +3423313923,159.92,15 +3423311338,150.21,1 +3423311417,152.6,15 +3423313992,197.71,9 +3423311543,174.78,4 +3423312398,156.2,4 +3423310970,174.5,6 +3423312772,210.4,10 
+3423312071,200.05,9 +3423313523,164.96,5 +3423310790,176.5,11 +3423311535,192.86,21 +3423311671,188.22,8 +3423310528,178.57,17 +3423312026,203.38,8 +3423312426,192.32,14 +3423310897,176.15,9 +3423312080,156.61,20 +3423311502,176.51,14 +3423310875,190.66,7 +3423311299,154.62,5 +3423310784,162.97,15 +3423311645,207.96,7 +3423311750,176.65,11 +3423312557,164.56,0 +3423312711,203.41,7 +3423314405,182.19,15 +3423313785,201.68,16 +3423311720,135.38,8 +3423310849,153.45,1 +3423310939,182.71,11 +3423310505,177.44,4 +3423312205,208.52,5 +3423311422,175.58,11 +3423311929,161.6,6 +3423313965,168.09,12 +3423313072,168.77,10 +3423313258,158.86,14 +3423311999,182.16,6 +3423312965,199.42,8 +3423311467,172.99,10 +3423311074,182.44,10 +3423310814,166.96,8 +3423311459,153.58,12 +3423310920,185.68,18 +3423311219,214.8,20 +3423310460,185.39,20 +3423314058,169.35,19 +3423313313,181.75,10 +3423311165,159.89,17 +3423310652,196.52,12 +3423311383,169.65,14 +3423313327,192.21,8 +3423313079,180.29,20 +3423312932,222.08,11 +3423311728,165.1,13 +3423314434,212.75,9 +3423311292,170.64,13 +3423314315,196.2,14 +3423310889,189.99,12 +3423310851,208.96,10 +3423313381,219.39,1 +3423311448,188.25,10 +3423311551,187.13,12 +3423313834,187.28,8 +3423312123,192.74,13 +3423310590,211.2,8 +3423312146,189.88,9 +3423312648,165.58,6 +3423310473,191.88,7 +3423312226,194.22,12 +3423310647,167.22,2 +3423311832,185.37,14 +3423311103,203.8,22 +3423311192,167.05,10 +3423314043,177.48,19 +3423312391,170.22,16 +3423312567,209.76,18 +3423310685,160.04,10 +3423312600,176.17,5 +3423312921,170.91,12 +3423313630,176.14,5 +3423311533,168.03,9 diff --git a/011/exercise/readme.md b/011/exercise/readme.md new file mode 100644 index 00000000..75ee2e30 --- /dev/null +++ b/011/exercise/readme.md @@ -0,0 +1,79 @@ +# Problem Statement +Business challenge/requirement + +Lithionpower is the largest provider of electric vehicle(e-vehicle) batteries. It provides battery on a rental model to e-vehicle drivers. 
Drivers typically rent a battery for a day and then replace it with a charged battery from the company. Lithionpower has a variable pricing model based on the driver's driving history, as the life of a battery depends on factors such as overspeeding and distance driven per day. You, as an ML expert, have to create a cluster model where drivers can be grouped together based on the driving data. + +# Objective + +To understand how k-means works internally. + +# Task + +Drivers will be incentivised based on the cluster, so grouping has to be accurate. + +# K-Means Algorithm + +K-means clustering is a type of unsupervised learning, which is used with an unlabeled dataset. The goal of this algorithm is to find K groups in the data. The algorithm works iteratively to assign each data point to one of K groups based on the features that are provided. Data points are clustered based on feature similarity. The results of the K-means clustering algorithm are: + +- The centroids of the K clusters, which can be used to label new data +- Labels for the training data (each data point is assigned to a single cluster) +- K-means works by defining spherical clusters that are separable in a way so that the mean value + converges towards the cluster center. Because of this, K-Means may underperform sometimes. + +# Use Cases: + +- Document Classification +- Delivery Store Optimization +- Customer Segmentation +- Insurance Fraud Detection etc. + +# Algorithm: +- The K-means clustering algorithm's inputs are the number of clusters K and the data set. The algorithm starts with initial estimates for the K centroids, which can either be randomly generated or randomly selected from the data set. The algorithm then iterates between two steps: + +1. Data assignment step: + +Each centroid defines one of the clusters. In this step, each data point is assigned to its nearest centroid, based on the squared Euclidean distance. + +2. 
Centroid update step: + +Centroids are recomputed by taking the mean of all data points assigned to that centroid's cluster. + +The algorithm iterates between step one and two until a stopping criterion is met (no data points change clusters, the sum of the distances is minimized, or some maximum number of iterations is reached). + +This algorithm may converge on a local optimum. Assessing more than one run of the algorithm with randomized starting centroids may give a better outcome. + +3. Choosing K + +If the true label is not known in advance, then K-means clustering can be evaluated using the Elbow Criterion, the Silhouette Coefficient, cross-validation, information criteria, the information theoretic jump method, and the G-means algorithm. + +# Elbow Criterion Method: + +The idea behind the elbow method is to run k-means clustering on a given dataset for a range of values of k (e.g. k = 1 to 10) and, for each value of k, calculate the sum of squared errors (SSE). + +Calculate the mean distance between data points and their cluster centroid. Increasing the number of clusters (K) will always reduce the distance to data points, thus decreasing this metric, to the extreme of reaching zero when K is the same as the number of data points. So the goal is to choose a small value of k that still has a low SSE. + +We run the algorithm for different values of K (say K = 10 to 1), plot the K values against SSE (Sum of Squared Errors), and select the value of K at the elbow point. + +# Silhouette Coefficient Method: + +A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: + +The mean distance between a sample and all other points in the same class. +The mean distance between a sample and all other points in the next nearest cluster. +To find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate the Silhouette Coefficient for each sample. 
+A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. +The K-means algorithm uses Euclidean distance; other popular distance metrics in Machine Learning are: + +- Cosine distance: It determines the cosine of the angle between the point vectors of the two points in the n-dimensional space. The closer the point vectors are by angle, the higher the cosine similarity. + +- Manhattan distance = |x1 – x2| + |y1 – y2| + Both the RMSE and the MAE are ways to measure the distance between two vectors: the vector of predictions and the vector of target values. Various distance measures, or norms, are possible: +Computing the root of a sum of squares (RMSE) corresponds to the Euclidean norm: it is the notion of distance you are familiar with. It is also called the ℓ2 norm (...) +Computing the sum of absolutes (MAE) corresponds to the ℓ1 norm (...). It is sometimes called the Manhattan norm because it measures the distance between two points in a city if you can only travel along orthogonal city blocks. + +Here is a template notebook to get you started: + +https://www.kaggle.com/code/ryanholbrook/clustering-with-k-means + +### References +- https://www.javatpoint.com/k-means-clustering-algorithm-in-machine-learning \ No newline at end of file diff --git a/011/solution/k-means-clustering.ipynb b/011/solution/k-means-clustering.ipynb new file mode 100644 index 00000000..7ae43a78 --- /dev/null +++ b/011/solution/k-means-clustering.ipynb @@ -0,0 +1,835 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean_dist_daymean_over_speed_perc
id
342331193571.2428
342331321252.5325
342331372464.5427
342331137355.6922
342331099954.5825
\n", + "
" + ], + "text/plain": [ + " mean_dist_day mean_over_speed_perc\n", + "id \n", + "3423311935 71.24 28\n", + "3423313212 52.53 25\n", + "3423313724 64.54 27\n", + "3423311373 55.69 22\n", + "3423310999 54.58 25" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_csv(\"driver-data.csv\", index_col=\"id\")\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "kmeans = KMeans(n_clusters=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,\n", + " n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',\n", + " random_state=None, tol=0.0001, verbose=0)" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kmeans.fit(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 49.98428468, 5.21441441],\n", + " [180.34311782, 10.52011494],\n", + " [177.83509615, 70.28846154],\n", + " [ 50.46127059, 32.42823529]])" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kmeans.cluster_centers_" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([3, 3, 3, ..., 1, 1, 1])" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kmeans.labels_" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "unique, counts = np.unique(kmeans.labels_, return_counts=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 2775, 1: 696, 2: 104, 3: 425}" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dict_data = dict(zip(unique, counts))\n", + "dict_data" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "data[\"cluster\"] = kmeans.labels_" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\sjangir\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\seaborn\\regression.py:546: UserWarning: The `size` paramter has been renamed to `height`; please update your code.\n", + " warnings.warn(msg, UserWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAdMAAAGoCAYAAAAdGw+vAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzsvXmYZGlV5/8598aSmZVZlVXZ1dVNF00XTdNO0wI6NUCLjAi2TyuruC+AyIgzg8ug46gj83N5YMRBZVRwYUQFUUFaFARlYBrZFGgahaZoLOimF6uprq4ts3KJjOXe8/vjvTfiRmREZNyIuBGREefzPPlkxo27vDcyM0683/Oe7xFVxTAMwzCM/vHGPQDDMAzD2OtYMDUMwzCMAbFgahiGYRgDYsHUMAzDMAbEgqlhGIZhDIgFU8MwDMMYEAumhmEYhjEgFkwNwzAMY0AsmBqGYRjGgOTGPYBBueWWW/R973vfuIdhGIZhDIaMewCDsOdnpufOnRv3EAzDMIwZZ88HU8MwDMMYNxZMDcMwDGNALJgahmEYxoBYMDUMwzCMAbFgahiGYRgDYsHUMAzDMAbEgqlhGIZhDIgFU8MwDMMYEAumhmEYhjEgFkwNwzAMY0AsmBqGYRjGgFgwNQzDMIwBsWBqGIZhGAOSaTAVkT8UkYdF5ERi2yER+YCIfCn6fjDaLiLyWyJyt4jcKSJfm+XYDMMYnOrqWbZO3s7GnR9m6+TtVFfPjntIhjEWsp6Z/jFwS8u2nwVuU9XrgNuixwDfAlwXfb0M+N2Mx2YYxgBUV89SfuAuwkoZ/BxhpUz5gbssoBozSabBVFU/Alxo2fw84M3Rz28Gnp/Y/hZ1fAJYFpErsxyfYRj9Uz1zL4iH+D4igvg+iOe2G8aMkRvDNY+o6mkAVT0tIpdH268C/jWx36lo2+nWE4jIy3CzV66++upsR2sYRlvCcgn8lrcQz3PbI6qrZ6meuZewXMIrzpM/coz88uGmQ3rZp5f9u52n3+f6HaMxe0zSAiRps03b7aiqb1TV46p6/PBh+4M2jHHgFechDJs3hqHbTm8ycFqpuNP+2w/e3fE83a6RxRiN2WQcwfRMLN9G3x+Otp8CHpnY7yjwlRGPzTCMHskfOQYaokGAqqJBABq67fQmA6eVijvu//B9Hc/T7RpZjNGYTcYRTN8NvDj6+cXAuxLbXxSt6n0KsBbLwYZhTB755cMUr74Br1CEoIZXKFK8+oa6/BmWS+C1vMW0yMC97JOk0/4EQcfzdLtGFmM0ZpNMc6Yi8ufA04HLROQU8AvAa4C/EJGXAg8A3xnt/rfAtwJ3A1vAS7Icm2EYg5NfPtwxd+gV5yNp1G9sTMjAve7Tyznx/cb3Nufpdo1hj9GYTTINpqr6vR2eemabfRV4eZbjMQxjdOSPHKP8wF1ogJvZhWGTDNzrPj2d8/JrqJ1/sON5ul1j2GM0ZpNxrOY1DGMGcDPWG7qugu1ln17PWd13oMt5ul1juGM0ZhNxE8K9y/Hjx/WOO+4Y9zAMw9hDWKnLRNKuomPPMEmlMYZhGJljpS5GFlgwNQxjprBSFyMLLJgahjFTWKmLkQW2AMkwZoxZzxdaqYuRBTYzNYwZwvKFuzs3GUY/WDA1jBnC8oW7OzcZRj+YzGsYM0QvnV72AoNK1d2cm4Z9LWM2sJmpYcwQu3V62QuMUqo2WdzoFQumhjFDTEO+cJRStcniRq9YMDWMGWIa8oWjLG2xMhqjVyxnahgzRpp84SQyytIWK6MxesVmpoZh7ClGKVVPgyxujAYLpoZh7ClGKVVPgyxujAaTeQ3D2HMMS6rupexlr8vixmiwmalhGDOJlb0Yw8SCqWEYM4mVvRjDxGRew5hCzLVnd6bFDcqYDGxmahhThsmXvTENblDG5GDB1DCmDJMve8PKXoxhYsHUMKYMc+3pDSt7MYaJ5UwNY8rYzbVnVPnUvZC3tbIXY1jYzNQwpox
u8uWo8qmWtzVmDQumhjFldJMvR5VPtbytMWuYzGsYU0gn+XJU5SDTXnayFyRsY7TYzNQwZohRlYNMc9mJSdhGOyyYGsYMMapykGkuOzEJ22iHybyGMUM4KfKGzCXKUV2nE1nKsNMuYRv9YcHUMGaMUZWDjKvsJJZhEa9JhoXh1JBaw3CjHSbzGoYxVWQtw06zhG30jwVTwzCmiqwdoMw5yWiHybyGYUwVo5Bh97pzkpX2DB+bmRqGMVWYDNsdK+3JBgumhmFMFSbDdsdKe7LBZF7DMCaGbvJjGmlyr8uwWWKlPdlgwdQwjImgW0kLkGm5yyxhpT3ZYDKvYRgTQTf50aTJ4WE55WywmalhGBPBrvKjSZNDYdzuVNOKBVPDMDIjTZ5zN/lxEGnSSkGasZzy8DGZ1zCMTEhbgtFNfhxEmrRSEGMUWDA1DCMT0uY5u5W0DFLuYvlWYxSYzGsYRib0U4LRTX7s9NxuEq6VghijwGamhmFkwigahPci4U5zo3JjcrBgahhGJoyiBKMXCddKQYxRYDKvYRiZMEgJRq+rb3uRcK0UxBgFFkwNw8iMfkow0jT37tXNx0pBjKwxmdcwjIkizepbk3CNScGCqWEYE0Wa5t7WIcaYFEzmNQxjokhrxB5LuHGetfzAXVTPWF7UGC02MzUMY6LoR7o1lyNj3FgwNQxjouhHujWXI2PcmMxrGMbE0br6trp6lq2Tt5vLkTGx2MzUMIyJxlyOjL2ABVPDMCYaczky9gIWTA3DmGh6KZWxEhlj3FjO1DCMicZcjoy9gM1MDcOYaEzCNfYCFkwNw5hoTMI19gJjk3lF5BXAfwAU+BzwEuBK4G3AIeCfgBeqamVcYzQMY/z02kFmHEzy2IzRMpaZqYhcBfw4cFxVbwR84HuAXwVep6rXAReBl45jfIZhTAaT7Gw0yWMzRs84Zd4cMC8iOWABOA08A7g1ev7NwPPHNDbDMCaASXY2muSxGaNnLDKvqj4oIr8GPACUgPcDnwZWVbUW7XYKuKrd8SLyMuBlAFdffXX2AzYMoy8GlUE7ORsFW+tdHZGyJL6nYP0iiEBhDonHaK5LM8u4ZN6DwPOAY8AjgH3At7TZVdsdr6pvVNXjqnr88GHLTxjGJDIMGbSds5FWKxAGY5FXm+7J80AVrZTQIJoDmOvSzDIumfebgHtV9ayqVoF3Al8HLEeyL8BR4CtjGp9hGAMyDBm0bVlMUIVcfizyavKeyBVAAAWtbFvJzowzrmD6APAUEVkQEQGeCdwF/D3wHdE+LwbeNabxGYYxIGmafHeiXVkMfg7JFQY6b78k78nL5SE/V5+hWsnObDOunOknReRWXPlLDfhn4I3Ae4G3icirom1vGsf4DGMWqa6epXzqJFreAsCb20fhqsf2HRzSNvnuRKuz0dbJ24dy3n5ovScvl0fFwysUWbj+SZlf35hcxraaV1V/QVW/SlVvVNUXqmpZVb+sqk9S1ceo6neqanlc4zOMWaK6epbte+9EtzdBFVQJS+ts3/e5vnORWTkXjdMRydyYjE6YA5JhGC4XGAYuB+iJ+xIPglrfucisnIvG6YhkbkxGJ8zo3jCMKN/YZvG8hgPlIpMSbVxSUn7groHLWcZpam+G+kY7bGZqGEaUb5SdT4g3lFykuQUZ044FU8MwXM7P893kNFT3pSH4uaHkA80tyJh2TOY1DMPJlsce37Kad3Gg1bxJOjkZtUrIozSO7+VaZmRv9IoFU8MwgGxzgb2UycRSMOI1ScEw/AU+vVxrlOMx9j4m8xqGkTm9lJSMUgru5VomTRtpsGBqGEbm9FJSMgzHpF7p5VqjHI+x9zGZ1zCMTGiXb+zmEpSUgjWoodWyq331c1RXzw4srSbHo0EVwhApFBs7tMjOw3JwGtaYLWc72djM1DCModNPKUwsBYfVslsEFYaAgPgDl9G0jgfPh1qZsFLuKDuP2+3Iyon2FhZMDcMYOv3kG2MpmDAAxEmshTknDQ+Yq2w
dj5cvQr4IGnSUncftdmQ5272FybyGYQydYGvdzeoqoVsN6/kQ1AjKW2ydvL2jXJlfPkzZz0NhHtdQKiJFrrKdNNquNEdyBQhqLD7+Gzqeq76yNzpfHMhGEVB7LScyJgObmRqGMVSqq2fd7DJu6h2GUKs4EwjP21WubNcQvNdcZSdpVPxcX+ccp9Q6yOtgjB4LpoZhDJXqmXshl4/cCaPu2TG5wq5y5SC5yk7SqKr2dc5xSq3jztka6bBgahjGUAnLJSRXQArzkJRqEddQG7rKlYPkKjuVsxAGfZ1znOUx487ZGumwnKlhGEMlLikRP4f4OcLtTSdXJoPSLnJlv25M3cpZ+jnnKMpjupW/WIeavYPNTA3DGCqt8qRbRKPg5TKXK4ctjWYttVr5y/RgwdQwjKHSKk/6c/vIX/kY/Pl9mcuVw5ZGs5ZarfxlejCZ1zCMoTNqeTJLp6As78XKX6YHm5kahrGn2ctSqZW/TA8WTA3D2NPsZanUyl+mB5N5DcPom93k1Szk19ZzBlvrSGGueac9IpW61+IGM7OfAiyYGobRF7s1z86iuXa7cxIGaLXStQPMJGPlL9OBybyGYfTFbvJqFvJr23Pm8hBUTSo1xooFU8Mw+mI3d6As3IPanVNyBfBz5hRkjBWTeQ3D6Ivd3IE6PS9+js3P/wPh9gao8+2VuUWKR6/vGADjPKlWy1Ato/liw5owDPHnF+uNx+N9yw/c1TUHaY23jWFiM1PDMPpit5Wo7Z7XoEpY3iIsrdcDKYBub7B9751ty1maSl/yRdd9prJNWK3suGavZTJ7uZzGmEwsmBqG0Re7uQO1e15yhaYg2kQYtM2nJvOkXi6PFBecgX6tsuOaveZp93I5jTGZmMxrGEbf7LYStfX5jTs/7GaWbdG2+dRWlyDxczDnrAljabfTvkDbPG1yPw1qTj4OA4JKierqWZN7jdTYzNQwjJHhFeddWUtbpG05SxqXoF73jffToIZWStFs2bWLM7nX6AcLpoZhjIz8kWM7Z44xnt+2nCWNS1Cv+9b3q2xHvcvVxdJ80eReoy8smBqGMTLyy4eZu+ar8eaXmhqHy9wic8ce31ZeTdO5pdd94/3cxXGz5fycWyG8R9yTjMnCcqaGYYyUfhx/0hzT67755cNUF5cJSpsQ1qC6TVirgJdz7eIMIwU2MzUMY2bxFg9BrezyrIr7Xiu77YaRApuZGoYxs4QbF1yeNKi5Vcax5+/GhXEPzdhjWDA1DKMto3AIyvIavZw7LJeQXAHJN0zyVduX6BhGN0zmNQxjB6NwCMryGr2e25pzG8PCgqlhGDsYhUNQltfo9dzWnNsYFibzGsaM004O7dVJaBCyvEav57bm3MawsGBqGDNMpwbe4ufQMOzYEWYY7NZ1ZlTntubcxjDoWeYVkTeLyHLi8UER+cNshmUYxijoJIeqaubyZ5YSq8m3xqhJkzN9vKquxg9U9SLwNcMfkmEYo6JTA2/CoGfXoX5J42w0Sec2jHakkXk9ETkYBVFE5FDK4w3DmDC6yaGjkD+zukacBw621gElKAUQLT6ygGpkQZpg+OvAP4rIrTivkO8CXp3JqAzDGAn5I8coP3AXGhDNSMM9L4fGeWANQ2fGAECNYHuT8IG7AJuhGsOnZ5lXVd8CfDtwBjgLvEBV/ySrgRmGkT3TKIfGeWDCmjOx98Q9DmrWEcbIjJ5mpiLiAXeq6o3AXdkOyciCyj0nKH/8A4Sr5/GWVyjedDOFa28c97CMCWDaVrPWy2Jam5BrOLaOMKNwkzLGS08zU1UNgc+KyNUZj8fIgMo9Jyi97+2EG2swt0C4sUbpfW+ncs+JcQ/NMIZO3dWotQm5eGNxNxqFm5QxftKs5r0S+LyI3CYi746/shqYMTzKH/8A+D6SL7ryh3wRfN9tN4wpIy6LwctFnWBcmU88Wx11PngUblLG+EmzAOmXMhuFkSnh6nmYW2jemCu47YYxZSRdjYKtEFAQD39u31jk1VG
4SRnjp+dgqqofFpFHAdep6v8TkQXA3+04Y/x4yytO4k10xqBWwVteGd+gDCND2uWB47xl+YG7Rpq3zNLpyZgc0jgg/TBwK/D70aargL/OYlDGcCnedDMEAVotOzeYahmCwG03jBlgnHlLc2OaDdLkTF8OPBW4BKCqXwIuz2JQxnApXHsj87d8N97iAdjewls8wPwt322reY2ZYZx5y2ksPzJ2kiZnWlbViogAICI5XHrf2AMUrr3Rgqcxs4w7bzlt5UfGTtLMTD8sIv8dmBeRm4F3AH+TzbAMwzCGhzUBN7ImTTD9WZzz0eeAHwH+FnhlFoMyDMMYJpa3NLImzWreUETeDHwSJ++eVFWTeScUczwyZpXObkOz0QTc3JbGQ8/BVESeBfwecA/O8fKYiPyIqv5dVoMz+iN2PML3mxyPuAULqMZU06nZeWxuP+1BZbf7N7Ijjcz768A3qurTVfUbgG8EXpfNsIxBMMcjY1aZdbehWb//cZImmD6sqncnHn8ZeHjI4zGGQLh6HnKF5o3meGTMAJ2anc+K29Cs3/84SRNMPy8ifysiPygiL8at5P2UiLxARF6Q9sIisiwit4rIv4jIF0TkJhE5JCIfEJEvRd8Ppj2v4RyPqFWaN5rjkTEDzPqq3Vm//3GSJpjO4XqZfgPwdNzK3kPAc4Bn93Ht3wTep6pfBTwB+AJuxfBtqnodcFv02EiJOR4Zs8qsr9qd9fsfJzKsBbki8nOq+is97rsf+Czw6OSKYBE5CTxdVU+LyJXAh1T1+m7nOn78uN5xxx2DDH0qsdW8xqwy66tZ9/D9y7gHMAhpHJB24zuBnoIp8GjczPaPROQJwKeBnwCOqOppgCigtrUrFJGXAS8DuPrq6WyxOo5gaAHYSMOkvmnPwqrdbsz6/Y+LNDLvbqT5VJEDvhb4XVX9GmCTFJKuqr5RVY+r6vHDh6fvj2bQZt79HG8NxI00WMNrw2hmmME0jV58Cjilqp+MHt+KC65nInmX6PtMrhYetLSln+OtnMZIg5VgGEYzY5mZqupDwL+KSJwPfSZwF/Bu4MXRthcD7xri+PYMg5a29HO8ldMYabASDMNoZpg503ek3P/HgD8VkQKuZvUluOD+FyLyUuABXB525hi0mXc/x1sDcSMVno9ub6Koc9sRD8IAgK2Tt480f7r94N1UH74PggB8n/zl1zB31WNGcm3DiNk1mIrIb9NFwlXVH4++/880F1bVzwDH2zz1zDTnmUaKN91M6X1vRym72WKtkqq0pZ/jB72mMTtUV8+itQpoCAhoALhASmF+pBZ22w/eTfX03dSFsSCIHmMB1Rgpvci8d+BW287h8ppfir6eSP0/yBgmgzbz7ud4ayBu9Er1zL2In0eKC81Sr3h4ufxI86fVh+8DBLzEFxJtN4zRsevMVFXfDCAiP4jz5q1Gj38PeH+mo5thBm3m3c/x1kA8Oya1jKQf4kbbbuFRjrC0HmlXCQGrJX+a2f0HHT7Pd9reB9P0uzOyI80CpEcAS4nHi9E2wzC6MG1lJDss68Rr/g5NFnaZ3r/vp9uekmn73RnZkSaYvgb4ZxH5YxH5Y+CfgFR5UsOYRaatjKTVsg4vByj4ubYWdlnef/7ya9y1w8QXGm0fnGn73RnZkaY5+B+JyN8BT442/WxU4mIYRhdiWbSJjMtIspQmWxtt+/P78BYfSbhxoX49b/EQ1TP3Un7gLucNnS8iJGaLnkdQ2mDr5O0DjTFeZJRczevtP0zt4kOsP/Rld6m5fRSuemxf9z+O352xN0nTHFyAb8L56f6yiFwtIk9S1duzG55h7H284nwkEyaCSYadPEbRILqbZV3r9amWoVJCoxwr4FYDB7Ud8mk/Y5y76jGNoLp6lu1774SgVn8+LK2zfd/n4JqvTn3uUf/ujL1LGpn3d4CbgO+NHq8Dbxj6iAxjyhh1J49xS5Ot13e1y4JWthv3X6uCnx/6GKtn7nX1rkJjda94ENT6Ord1YTF6JU0
wfbKqvhzYBlDVi0Ch+yGGYeSXD1O8+ga8QhGCGl6hSPHq7Gowx+1O1Hp9L5dvmIFE94/nI/mWt48hjNEd36YsXsO+zj3q353RGyLyiyLyX/s4bllE/nMWY0rjgFQVEZ/oL1VEDgNh90OMrKncc4LSbX9FeMHZGHsrR5h/xvO7lrhYd5jRM8pOHuOWJttdXzwfb3GZheufBDiXpKC0iYY1Z/4gHng5/Pl9A187qJbZEVDF6/v+rQvLVLEM/Gec0toTUYpTVLVrvEszM/0t4K+AIyLyauBj2GresVK55wSb734L4fkzgIIq4bmH2PqbP+nY7cW6w0w/45Yme7m+t3gIamVXYqO477Wy2z7otT0/Ome0uldD8HMmze5hRORFInKniHxWRP6k5bkPicjx6OfLROS+6OfHicjtIvKZ6NjrcFUp10bbXhvt99Mi8qlon1+Ktl0jIl8Qkd/BVa48crcxplnN+6ci8mmc3Z8Az1fVL/R6vDF8yh//AFS2QaK8kODexMolyh//QNvZZrI7DAD5Ikq54/7G3qN1te2ojQZ6uX64ccFJv0FiZurn3PZBr33s8ZRPnUTLWwB4c4t9r+Y1xo+IPA74eeCpqnpORA4BP97Dof8R+M0odhUAH9fq80ZVfWJ07m8GrgOehHsHfbeI/HucN/z1wEtUtSdZOK3R/WXAVlQmc1hEjqmqFVyNiXD1vPtEHxfLh24GgCq1U1+mcs+JHQEyXD0PcwvNJ7LuMFPHuKXJ3a4flktIrtD4UAeo6lDyuuO+d2PoPAO4VVXPAajqBae87srHgZ8XkaPAO1X1S22O++bo65+jx4u44PoAcL+qfqLXQfYs84rILwA/A/xctCkPvLXX443h4y2vRAs91AXSMACNckUibeVbb3nFmdgnse4wxojZ4aIEVnJidELo3i+7RiOWzcUbVfXPgOcCJeD/isgzOpz7V1T1idHXY1T1TdFzm2kGmSZn+m3RwDajgX6FZntBY8QUb7oZCnMugIYJL1LPw1tabtvcu3jTzRAEaLXs8lnVsnWHMUbOuPO6xp7iNuC7RGQFIJJ5k9wH/Nvo5++IN4rIo4Evq+pv4XplPx5X0pmMW/8X+CERWYyOuUpELu9nkGmCaUVVnVeXu+hgy+6MgSlceyP7nvsivJUjjY25PN6BFaQ431a+te4wxiRgJSdGr6jq54FXAx8Wkc8Cv9Gyy68B/0lE/hGXioz5buCEiHwG+CrgLap6HvgHETkhIq9V1fcDfwZ8XEQ+B9xKn5NEUe02e07s6Gp6rgNuBn4F+CHgz1T1t/u58LA4fvy43nHHHeMcQl8Muzxl/a2vI9xYa85BVZ27jDe/b6DrWCmNMQiDWhta15aZoadE6KTSczAFEJGbcclagPer6ge67T8K9mIwjctT8P2mRtyDzBDbnVO3S6gq3vxC39fJYqzG7NBkLeh5USlM2PMsdNDjjT3Fng6maWRegM8BHwU+Ev1s9EGyPEVE3GyyTX4zDe3kW1lYxJtfGOg6WYzVmB0GtTYctzWiYfRKGqP7/wD8f8AHcZ8gfltEfllV/zCrwU0rWZWntDb3Xnv9Kwe+jpXSGIMwaNeVrLq2TIN0PA33ME2kmZn+NPA1qvqDqvpi3Oqpn8lmWNPNqMpThnEdK6UxBmHQEpgsSmimoeH3NNzDtJEmmJ7CLSuOWQf+dbjDmQ1GVZ4yjOtYKY0xCIOWwGRRQjMN0vE03MO0kcYB6UHgkyLyLlx5zPOA20XkJwFUtXW5stGBwrU3wi1ktkI2ufqW4hzhdgkiWbapjGYCxmrsDfqVFNNYG3a+xnCtESel4fcgMu2k3IPRIE0wvSf6inlX9N2MG/qgNb85LJpW384tEG5egq11ZN9+ZGEJahX3/C30fP2sxmrsDQZtNt6Lvd9u1xhmLnDcXXVg8Nd0Eu5hmhGRW4DfxPn5/oGqvma3Y9IY3f9S4kIesKiql/oZqJEdO4zsK9uAoOUS3r79ZmxvpCYpKQLg+2j
gtg8ryI3iGjH5I8coP3AXGtBUbjNK96VB73cS7mFaiVqNvgHnqXAK+JSIvFtV7+p2XJrVvH+Gc+EPgE8DB0TkN1T1tf0P2xg2O1bf1mquq0xQa2yz1bhGCkYhKY5Sthx3Vx0Y/H4n4R4mhVf+ceUW3ALZY8C9wGtf9YOF9w1wyicBd6vqlwFE5G24tOZwgilwg6peEpHvB/4Wt5L304AF0xHQqwuRt7ziepXGM9NcDqpVQAnOPgh+DinO4x9qbz9pbkdGK6OQFL3iPMH2JtrSks2fy8a1dBDpeBglKcN4Ta07Tj2QvgEoAxeAK4E3vPKPKy8fIKBeRfPi2lPAk3c7KM1q3ryI5IHnA+9SVfcObWROmoberatvXaPkEFcaLFCroZuX8B/12IGuY8wOozCl9xYPQbWlWXh18Gbhw2ZYJSlm9D80fhoXSLeix1vR458e4JztnJh2jXVpgunv49z59wEfEZFHAZYzHQFpXIhanZBEFYoLkM+77jK5HCwsEdz/xYGuY8wOozClDzcuQK7o8n+C+54rDtwsfNgMqyTFjP6HxjEagTRmK9reL6eARyYeHwW+sttBaRYg/RbwW/FjEXkA+MbE4xer6pt7Pd8sMCzJNFw9j4oQrp9xuU8/hywsdcx7Jlffrr3+lcjcAsmmuKra9thB3Y5MIp58BilxyeqNvrp6lmBjFVAn7+bn8HL5gZuFZ+EQNMzcrsm0Q+FenLSbDKgL0fZ++RRwnYgcw5WEfg/wfbsdlNabt446Eqta+Il+zzWNDFUyLc6hly5CELg3myBwj4tzux6axsFoELcjk4gnn0l0zamXiIAT0jSE6jZhrTpQXjare7Wm5hPHa4EiLoASfS8ywFqeKK79KK7X6ReAv4jawHWl72Dahj3t+D9shiqZJjv7aIftHUjjYDSI25FJxJPPJLrm1MdUmIveQcT9jVfLA+UQs7pXy3VOFtEio5cDp4FD0fdBFh8BoKp/q6qPVdVrVfXVvRyTZjXvrtcf4rn2PEM1iK+UYekglNZdqUsuB/NLbnsvFIqE588A4B26nPmbv6Ot/DqI25EZ4k8+k+iaE49JRKAw7z7ARQvmBskhZnWvVpIyeUSBc6DgOQyGGUyZ+QBYAAAgAElEQVRtZppgR4kK9G0Q7y2vwMYacqhhBajVsltk1IWkG5J32ZVOwq1Wuh7Tr9vRMO/XyIZJdM1Jjkn8HOLn0CDAKxQHClBZ3qvlOo129CTziognIt+1y27/MITxTA3DNIjv91yjlF7NEH/ymUSJMqsxTeK9GtNNT8FUVUNcQrbbPl2fnzXaNeuev+W7+5r19XuucPU85ArNGzOSXod5v0Y2TGI5RlZjmsR7NaYb0R4WsQCIyP8ASsDbgc14u6qOtRDs+PHjescdd4xzCBPL+ltfR3DhYbRcciU14tXzUbmjj+4pH2rlLoaRHdbgu4k9nSpMs5r3h3Crpj6CsxH8NGBRbILxH/VYdPOSW7SkQK3qymuK8z2Vrli5i2FkxySWKhn903MwVdVjbb4eneXgjMEI7v8iLCy51b9hAIibndYqPeVPrdzFMLJjEkuVDIeI/KGIPCwiPc8c0nSNWQB+ErhaVV8mItcB16vqe/oYq9EnnWTXeHvt7FeQIIBcDt0uweIy/qEDBA8/6AKpUO8gE9YCwlNfZu31r2wr4Vq5i9GJpDyJ54KBBjWTKlMwiaVKRp0/Bl4PvKXXA9KUxvwRTtr9uujxKeAdgAXTEbGj8Xcku9Yefz/VOz/hXGPKJeflUK24wLl+kVBws9MgcHKvnyPc3oL1i84DNXGuZNNwK3cx2pFsbK2qsL2JolCYT93kepaZxFKlvciHP7+1owXbNzxuYVDTho+IyDVpjkmTM71WVf8XUI0uVmKPJ4z3Gp1k18onb3P/kFEjcDzf9TAV9+vVjTVn8qAKqsjCktsGeEvLHSVcK3cx2tHU2DqoRg2JovSBSZU9Y+U7gxMF0jfg/HnrLdii7SMlTTCtiMg
8kdORiFyLa3VjjIhOpS5aKbvttRr1zzcioCGy/yCo4qF4K0fwLrvCdZJRRfYfRJKfglskXCt3MdrhpN3orUMTPrXxzyZV9oSV7wyFLFqw9UUamfcXcZZNjxSRPwWeCvxgBmPaM/RTNpI8hkLRBb3ydk/Ht5Ndw0sXQEPCM3Ev26AxK83lIKghuTxaq8D2Fur75A4/wpnnlzYJLiQ60bRpGt6vI5IxvTTJk/VyK+pKSKtUOevlH93u39yUBuYYbkaaZNAWbH2RZjXv+4EX4ALonwPHVfVD2Qxr8umnbCR5TIgQnj9DeO4h116th+NbZddg7TyUNnfuqBqt3gXdvISKoKUttFqBcona+TOEaxec1BvPZrs0DTeMJEl5Ej/f6PaSK+yQKme9/GPW738E3EujY0zMoC3Y+qLnYCoi7wa+GfiQqr5HVc9lN6zJp5+ykeQxlNajGaSgW+s9Hd8qu7Idpa2lTepaBKpVVxoTRrNVz3f7V7YjEwdxs9ddmoYbRpKkPCkiyNw+vPkl19O7Raqc9fKPWb//ETD0FmwAIvLnwMeB60XklIi8dLdj0uRMfx14GnCXiLxDRL5DRHZvqDml9GPV13RMreb6IoYBVMoEF864UpU0ZScautyVKtEqkPpT3uVH3fd9+51ZQxg4o/ug5rrNRP0iZWEJ//Kr8A8dwdu338pejJ7ILx8mf+RY1N8zQHJ5ilffwML1T2qSLZvyqzEzlFOd9fvPmmjV7o4WbENYzfu9qnqlquZV9aiqvmm3Y3rOmarqh4EPi4gPPAP4YeAPgf19j3gP00/ZSNMxInUpFnBlK+sXYeVIx+NbS2O4dKFhxkAcUHHnrlWQQhHdWodQ6dQhT9cvEgLe3IKVvRg9kyyPScqXrSUxs17+Mev3PwqiwDn2FmypmoNHq3m/HfiPwL8D3pzFoPYC/ZSNJI9pqipKFm63k2wjWqVl5hejZ7T5e2EegoDCk5+Jbm10PqfngYJurlvZi5GKXuXLWS//mPX7nyXSOCC9HXgy7hPAG3C507D7UdNLP420C9feSO3x97u60KCKqwmNZpXRSsjw7GnW/s+rnHRbKTedt9WRyCvOEW5tUA+iIi4whzXCtQuU/+F9TtbN5SEgserS5Vm9A9FMuVbFWzzQs4m9md8bvbr37JVm2lmtON4r928MTloHpO9T1WDXPWeEtGUjlXtOUL3zE8jifjQMnLQrwNw+t6BIFXyf8NxD7oClg5BwJkrKxFouEV6KHIz8HN7+ZcLSVj0H62ajUZCt1fAOXka4ecldE9wMtziP53l4iwdY+oFX9HwP7VyYks5JxvSTRr6c9PKPXiXrfpn0+zeGQxqZ9zbg5SJya/T1YyKSz2pg00hSpvUWD0TGCsDWugukQvQ9WqFbal7lm5SJw421+jGyb8ntV9l2QTleuSt+tIJXCddXd7og9SHtmvm9AdMlX9qKW2MYpAmmvwv8W+B3oq+vjbYZPZJczSvFebz9B90ne1VnmrB0ECf5Ritza86QPl4l3FQaU6vVj/Fi6TdMyLgx4kWF9TtdkPpxNBplw3Fjcpkm9x5bcWsMgzQy779T1SckHn9QRD477AFNM97yCsGFh90/aew6NLfgum4s7kfyRYKtdSfFhiGgrtuL5+EddG9SceALHvpXtLKNrp0nWD0PhQJ1WTee3boH4PvkrjrWJOXGec/Nd/w+m1EXGXCz3IXnvKhjgDXzeyNmWuRLW3FrJBGRR+K6xVwBhMAbVfU3dzsuzcw0iPx44ws+GresxeiRpmbdSdeh6766Lt/KwlLU3SVssmfTrQ0q95yo5yw1ri/VqOylUnE/5wsNByQN3M+FuSYpNz5H7cH76u3YYnRznc2//D8dnZjM/N6YNqZJsjaGQg34KVX9N8BTcOnNG3Y7KM3M9KeBvxeRL+Oye48CXtLPSGeVerPuyrYLqLkcFOZgfZX5W767vkJW8/lGDaqfc8YLntfIS/q+C4Ke74JunDtdPIDM7wMgPH8
GAG/lcuaf+W1NM80470mt0n6g1Qrlj3+g7ey0n1XMhjHJ2Irbvc36He/b0YJt6fgtfdedquppnPkDqrouIl8ArgLu6nacqLYv5m+7s0gRuB4XTP9FVcuJ525W1ZGvQjl+/Ljecccdo75sX6y9/pUwt+BqRCNUFd1Yxb/synpwCs6dRhaXd+zHdtQYYW6B8OyDiZkpkTVgHqoV1wkml8O/7EpYWib40ufQShkpFCk8+ZmU7/iQsxqsdmn6Iy1dQfwcxafewsLTnjXEV8SYVUZhfj/rBvt7kNQtPaNA+gYanWNiO8GXDxJQ6wNyPU0/Atyoqpe67ZvKtEFVy6p6p6p+NhlII3411ShnEG95ZcdsULfW0fJ2k2G+lredc1GSKC/pLa80XI3qNoK4n6vu3FqtoKUtal+5n9qJ212LNs9DqxXKH30vbG06O8FuaNjcXiuoUf7Ie9j66HsHfBWMWWcU5u9mMD8zZNaCTUQWgb8E/stugRRSBtPdrj3Ec00l7fKNurXhZqvJUpO5BXRro21esnjTzS2uRi3Kguc3moPXZ56RKYTnN3KsffptVD55W7+3bxjAaEpRrNxlZjhGI5DGDNyCLSr7/EvgT1X1nb0cM8xg2rteHCEivoj8s4i8J3p8TEQ+KSJfEpG3i0hht3PsJdo126Y453KiCbx9+932Nk25C9feCMU5J+mKlwiqsS9v9CtNlse0k/I9r3mfHtGK9YM3BmMUpShW7jIzDL0Fm7j82puAL6jqb/R6XJoFSFnwE8AXaJjl/yrwOlV9m4j8HvBSpqyWtdU1af2tr2tfarKvc/+A3OFHUDt/xsXPWg0IGx9lgg7ybetio/oslZYVvQnnpDZIoTFOsxU0+mEUpShW7jIzvBaXM4XmnOkgLdieCrwQ+JyIfCba9t9V9W+7HTTMmel9aXYWkaPAs4A/iB4LrhvNrdEubwaeP8TxTSTtpN+wtIVubXRsPO4/6rHONalWaxg8pBUGglrjq4nu5/Gv+2qgv+bohgGjKUWxcpfZIFpktKMF24CreT+mqqKqj1fVJ0ZfXQMppJyZisjXAdckj1PVt0TfX5ByzP8b+G/AUvR4BVhV1fjd/RRuOfJU067UBD8HQc3lT8F58VKul6sE938R2bcfjc0f8nnnx9srsSF+chGSiJOIwy6lw3MLsL4KtDQ6bzNGw+jEKEpRrNxldogC59hbsKXpGvMnwLXAZ2iYNSjOKSIVIvJs4GFV/bSIPD3e3GbXttMkEXkZ8DKAq6++Ou3lJ45W6TcuoQm3t9CoqwtAeOEslXtOUDt9v1u5GzsdFQvpgqlqcyCNFywF7QJpYtYbBNQeuJv1t76O2tmv4C0dbN7VbAWNHhmFe9I0ODSdX69x6nyNUkWZLwhHV3KsLI07O2e0I81v5Thwg6YpTO3MU4Hnisi3AnO4nOn/BpZFJBfNTo8CX2l3sKq+EXgjuDrTIYxnovCWV1xOdGu94bcLoCGb73hjc15U1e03CN1mo8nPM4Ezmgg31qC8Tehdwl880HjebAUNY2icX69x9+mqKyH3oFxV7j7t/vctoE4eaXKmJ3BehQOjqj+nqkdV9Rrge4APqur3A38PfEe024uBdw3jenuN4k03O4OGsKV8xfPbLDAaYUWSCN7iAVfGs7AI21tmK2gYGXHqfM1lZDxBRKLvbrsxeaT5eHMZcJeI3I4rigVAVZ87xPH8DPA2EXkV8M+45clTSXIlLIWik1nL2/VVsVKcQ2MpNm6phlBX2D0/mlGOaGKeLyDFedcT9dIFd30/h7d4wFbzGjNJqwR7YF5YK+nQJNlSRcm1VveI225MHml+07+YxQBU9UPAh6Kfvww8KYvrTBLJBtshApGPruw/WF8VKwtLaHk7you2WPtBQprtYyVvGvwc3soRtLTpzCLAjSdwJvoWQI1ZpFWC3dwOWd2EQg7yvgxFkp0vuPP4CfEpVLfdmDx6/i2r6oezHMgs0bQSdv1M3TxBt9bxDh1BKYM
IUpxHS5uu+0scM/18i9SbYSD1PCfnqrpAGgf2uJH53IKt3jVmkqQECxBGS0mCEAo5wRcIQuXU+VrfwfToSo67T1cJQsWThoPo0RXLl2aJiMzh/HiLuBh5q6r+wm7H9ZwzFZGniMinRGRDRCoiEojIrn6Fxk6aGmzH7dhEGjWfuQKUt1l4zgvxLruiXkvqrRxh33e+rCELg/u+sNTuMjvJ5Xe6woArk/F3/oN6B1ZcS7hK2bku+Tk3O/Z915R8335bvWvMJKWKC3AxYeg+X4aJz7aDSrIrSzkec2WeYl6ohVDMC4+5Mm+Lj7KnDDwj6t/9ROAWEXnKbgel+a28HrdY6B24lb0vAq7rY6BTT5wPDc6ddj1LNUTEa3RyKRSdI1G+6NqwBYGbYIpHcOEM1KpIvkj5xKcIL56tB9nw7FcoffCv8ZYva6pDDbe3dhrjpyEI2loLhhfPuu37D5I7/Ai3ijcMCTcvoZcuoGEIns/6W19ncq8xU7RKsJ7nZqXDlmRXlqwUZjcuvvo/7WjBdvDnf3cQ0wYFopwW+ehr109FabvG3A34qhqo6h8BT085zqknzocGFx5GS1su71kpo5UyWtpy27c2nMtRtQzzS41m3kGt7mqkQUDtxO31TjAx4cMPEq5dQLdLaLVMUNpCL13sbXC16s4VwkBX43tVuHQRlpbR7RLh2gXXvi0+j4bUzp8x9yNjpji6kkPVSbmqjVmq77l2iW67SbJZEwXSNwBXAhei72+ItvdN5Bv/GeBh4AOq+sndjkkTTLci4/nPiMj/EpFXAPv6HOvUEudDtVyKXIXiZ5zBgpZLyNw83uJ+vMUDeCjeypHIuF4gl0OWDnb22AU3K11YdEb4G6t9m9b3hJ9zM+Yvfc7lTz0vCrzS6FBT2QbfbzQvN4wpp1WC3Tfn8ajLfBaKnkmyoyWTFmzRhPGJOL+DJ4nIrrJbmt/0C3HB90eBVwCPBL69n4FOIsMybQ9XzzvbvaDWWKwDje+VciSfesx/54/Ur9HaODxYPdf5IrUq4dmvEPqRRDyMRUj1UpsWIpclrZTRzUuJrjTxDuJm0+Z+ZAyZSXf/MQl2IjiGm5EmGbgFW4yqrorIh4BbcF4LHel5Zqqq9+PeQq9U1V9S1Z+MZN89zzBN2+sNwP1cw+4PgHiBUbTYSKTpGjsah/cy0wxqDCWQinR2QQoDF7BzOShvNxZJxdJ0GD1n7kfGEIlLT8pVbXL/Ob9uhgVGE1m0YDssIsvRz/PANwH/sttxaVbzPgfny/u+6PETReTd/Q13skiWqtQbdPcpW8ZdYKQ47wJOPdZFP0SraWXxQNM1WrvHUBhhq6hdA7fCwqKTeesGEvFTIRTmzP3IGCrm/mP0yGtxJSxxQB1GC7Yrgb8XkTuBT+Fypu/Z7aA0OdNfxBkqrAKo6mdwHWT2PE2lKjF9ypZxA3D/0OXI/AJSnHOrd4m6skQ5UW9uoekarY3Dc1ccJXfjkyDfY3/0QXKm4nUvrynO4yHIwhLe/oNuJhqX2IiQWzlSb15uGMOgtfQEzP3H2Em0andHC7YBV/PeqapfE7Vgu1FVf7mX49II/jVVXZOsFrqMEW95pX2D7j5ly9YuMJBoAh6VlgRr5+t51Iuv+TG8Q5cz/8xvY+kHXuFk5w/+NeGpL7vxHX4EAOG509ExbVyP+u4/EP0+k6U1nucCrO/jLS27AA+EG2tIcR4/arCs1TLe4gGWfuAVfV7bMNpj7j9Gr0SBc+wt2FIZ3YvI9wG+iFwnIr8N/GNG4xop7Rp0D1u2LN50c6O0pFJpDn6qhOfPsPnut7D10fey9Td/QnjuoWgf91x46SJ48WefIX46z+d3NggPQ7diuDhffx1G8RoZRkxr6YmVmhiTTppg+mPA43DLjv8MWAN+IotBjZpWidVbPDB02bJw7Y2N0pIds8rQybSVbSqfvK1RVuP5IHGv0RreyuWw/1D3C/l+9+dbqXUuwfEPXV5/HUbxGhlGjLn/GHuNNH+
ZN0RfuejrecBzgcdnMK6R006aHYQdLki+D+VtZGnZmSyEAXWJVSPTzaDW6BQDzd1igoDw/BnXpaXbhZMNvnN5vKVlV4rTibbysJORY8k5bgYu0ape/7IrzfHIyJw4cMblMfHiIwuoxiSS5q/yT4H/iqu16fp+PuvEpTYa1NBSVEscV5Rcuuhmp63t0zrVeNbrVUM3S51bgLUeF0bVqoS97tt8YQC2Pvpeqnd+grBWhXLJxd1qheDCw67rzS1YQDUyw5pjG3uJNDLvWVX9G1W9V1Xvj78yG9keZocLUr0XaTQTDYP2hvPQ2CdJ5DgkiwdIvQCsrX1gD8zvo/LJ25xsXNmm7ngUuTiZ45GRNVYeY+wl0ny8+wUR+QPgNpqbg79z6KPa4zS5IMXmBrGcurAE5ZLrwlKtOFm2aVbaaXGRousXuzsjDQsR5ycc1NDKdmPscWeZyMUpXD1P5Z4TNjs1MsGaYxt7iTTB9CXAV+Ec9OPpjgIWTFuol9rQxlloewtv5QgHXvY/ACcJb/7F73V2IEoS9LDPMIjl5fjn+vXjGUGzi5PJvUYWWHmMsZdIE0yfoKpfndlIpojiTTe7INNK7ByUkGqdVLrHPmm3cXGyYGoMG2uObewl0uRMPyEiN2Q2kikiLiMB6jM4vMj9aP9B53Eb4RyQInekdvlQL2WpSzfaNADv6ZjWcXVwcTKMYWLlMcZeIs1f5dcDLxaRe3E5U8H1UZ2K0phhU7j2RspHH+1cgxLOSrFrUIy3vEK4eSkyxY8+29SqRN3CXTDVcACHowSB65Vad1HK5V2rt27nFs995PJ9d3zg+q7q1joafUgwg3sjKyatM8ukd7IxxkeamektwHXANwPPAZ4dfTc60ItrUPGmm51RvCpotBgpORHsJZeahnrg1N3PHXeTUXVBPQgSAb9Wb1JuLkjGLGCdbIxupGrB1u4ry8HtdXpxDSpceyP7nvsi1yA8WtjjHX4ExX//bLzLH9EorfH8LuU0aYgidb4QzVK77yv5gjPsV0WWlpEDhxpysechC4uWLzVmAivVMbph+kTGxIEmbjwe12YWrr2xYWh//gxA3ey+9pX7E7aCnus6UylH+dNE3Whdsq1vYNfFTNEu3sISB370Vay9/pWE66stOVs3a/WWDnLgR18FtDQvn3PdjoLSFuGFh1l7/SsHaqg+rMbshpElVqpjdMOCacbEbkj4flPj8drj76fy6Y+gpU23o+DM7m99Y1SfGp1AQ1eXChC0GDD0k0eNjym4PG5TzjZpb+j5TbnQ1s464fYWrEduTon7Slsm0+n1sXIbY9KwUh2jG8PQDY0udGo83tHQvhYtCKr/f6b5R20JrtLh1xuvMKZDzlYVKc7vzO0m878bawB4S8sDNVQfZmN2w8gS62RjdMOCacZ0ajyulXK02CgZLBM/9zrrzBddDrO1iXgu377UJpdHDqzUy3Pa5mwvu4KF57xwR243mf9FFdl/0LVpS9xX2jKZYTZmN4wssVIdoxv2VzBEkrk/im62F26sRXKo73xyczkozCGFopuZ7uqd25oTxQVKDevNu+Na1PDCw27FrRetwm137loVXT2HAhd/++eRoIpuNhqDy4EV5p/x/KZA2prTnP+W73GPo9lp49yuoXqaHOiwG7MbRpZMWqmOMTnYzHRIxLm/cGMNFSE895BbWOTnXFCLW6vVarC1ju4/2MdVIvk37jhTmCMsbRFeuhgtYooCbxj2ZnB/6UJTIAXQtfNs/vUfUbnnxI77SuY0/Uc9tm3Zj/+ox7bdPz5fK9Z03DCMacCC6ZBI5v50a72Rl6zGq3Cj2WIuh+zbD+ceinKaKXKi+QKysIjkC1CcJ7dyBG9xf8OMQfx0KdZObG/Vc5adcprB/V9sW/YT3P/FVDlQazpuGMY0YHrFkKh3ioFED1IaC3sivH37oTCHrq9GVn0e1Cp0LGvJFdz2yH1IS1t1s4Xav94TXTx05xqWU1J8P0Bw7jS6XWqY3OfyyOI
BwtXzbRuql/7ubY3XIXEP3XKgw27MbkwvvTgQmUuRMQ5sZjokvOWVKCgSBTaN+pAmEcJLF6OZq9cIfNIaSCWxHRcs41ZuSdei+uM42A6xeLw4R+WeEy54J89bq6JrUU64DU2vQ/0Yy4Eag9OLA5G5FBnjwoLpkEjm/mRhqRH8kngugOrWBrnHHadu6ZcsYSkkgpR4OwNoR4ZZOO7MIDp2tFHtOAO2HKiRFb04EJlLkTEuTPsYEoVrb4RbGk5HctkVhGdPN3KnIg05Np9n6XkvYevQ5a7etFJ2pS25AlS2h2QbOAhKePY0YbcAXSk3P0yuZC4U3f1ub5mjkdE3rXLt5nZIIde8KKDVgchcioxxYcF0iLTm/tbf+rquXWMWnvYsFp72LCDhBLSwzwXVWgXdLqGqLsDGq4HTsv8QB3/s1QCs/tpPotVKoxNNUr4VL5KlY8m5y5tPS6eYVhcjahUIAua/5XssiBp9Ecu1ItTl2iCEaqBNAbXVgchcioxxMe4p0FSTRvJst2pWyyWobLsm3P3OVksb9R8LT34mdWm5Nb+a7CYDdF8WLE33YC5GxrBpJ9f6HtSC7g5E5lJkjAv7C8uQVum3m+TZtBq4vjEABG9ugRCchV+aGarnu7rWiPosOGlluLCEVygSXjzbfFyXPK3MLzTdQ9uxm4uRMQDt5NpCzs06i3npuFI3/tlW8xqjxv7Chkgn559epM62TkBRnjV4+EHnnFScTxdMo4B48Vd/3NW21ipoebsxK1V1JvqFovuq1ZzU286GMIGWtlh/6+vq9zcMFyPrHGMk6STXLs57POGa9ivJY2bJpcjKgCYHk3mHRCenoE7OP620SsLh5qWoJCbaoVaDzUv9DS4qZ9HN9YaRfuK5cPVcZCyhLmDvtno4DAkuPFy/v0FX8A762hnTh8m1u2NlQJOFBdMhMWjecIeRfKUM+/bjHbzMzUp7Ko/pE1VEleLXfyu5I0cbpTrtZqieD77vZOLo/gZ1MbKcq9GKmcrvjpUBTRb2l9mGfiTHQfKG7YzkS3/3NiRqxu0X5wnOPuhM7DNCK2Wqn/24k2bn5vGWDkJluzmXGiPipOLE/Q3iYmQ5V6Md0yDXZinDWhnQZGEz0xb6lRz7df7pdD2Kc83n8zN+U9GQsFZ14yhvE66dJ1xrE8zCwAV1Pzc0ZyNzTTKmkaxl2PmCELbETSsDGh8WTFvoV3LsN2/Y6XouYdQ4nxTnd10YNDBb624cC4uw3aU9nIZuPENyNjLXJGMayVqGtbzyZGGvegv9So5pymBar6cihOtnnHTq59yq2NJmw7bP85ClZVg6CJcuDHqLnVFFyyV0e4uupg0i+IcuH9qK235fO8OYZLrJsMOQf60MaLKwV72FQco8+sobFufQcw9FloOeW7UbW/X5PqiAhuilC+Dn0507LeKM+N1q3w5dbPwcuaOPZukHXjHUS1vnGGPa6FTe43u6w93p7tOu5K2fgGrBczIwmbeFkUuOyTIVpXnVrviR9V+0X2tecdiIM7hH6Cwph6HJr4bRA51kWEFsFe4UYsG0hZE3q66UnXzr+21atsWMaHWeAn4OWTrYmCkn8XLQ4n5kGEZ7OpX31EIn9yaxVbh7H9MH2pCl5NhaBkNxDi+oIYeOoOWSM1CIZ6sa7gxoWaIhVMvoaoWmAO7nXLCvVZEgoHLPCQuoM4C56wxOOxn21PmamfFPITYzHSHtymDCjUvodolw8xLh2gWaDOaDWvbSbltaPiEHtcjjV6BQNHeiGcDcdbLDVuFOJxZMR0i7MhhvfsGVolTKgEI+DwtL2deV7qDdp+LEtlwO78AhvH37zZ1oBjB3newwd6fpxH57bcjCdL1yzwlqp77sJNxcDllYwptbcL1Lt7dQ34cw3+jyUpyH8jYEffYxTU27fE1jm3/oSGNzolTIDOqnk0HddbKQiKdJdrZVuNOHzUxbyMJ0vd48O14hGwTo+kXC7S0n4xaKUeCsNcpjttZHGEh3R8u
lxoOoVMgM6qeXQdx1spCITXY2Jh0Lpi1kYboen9NbWm4qOdGNNWfNJ+KkXj17+LYAACAASURBVIlqO5Oreke5AKkL4frqjlIhM6ifXgbJ62UhEZvsbEw6k/FOPUGEq+ed9JpkQNP1+JxSnMfbf7BhF6jK/C3fDeVtJ/smn4vJMHcqB3r1vo3qT1tKhbJ4rYzJYJC8XqmiQy/9yOKchjFMTLRvoV8HpG65w+Q5pTiPX5xHq2W8xQMUrr2Rcvx8OzJsvaZb640+prm88+KtNx+XyATJmTh4K0c48MOvbDp+GE3BjcklTV4vmc8MAiUMwfOUWuD+rERgrtDb8e3yoZ3chCalnGSa8rlJpvW+ssBmpi3044C0W+5wt3MWb7rZlcesXXD5Us9vnDyrYCoC1Yo7fxi66wZR+Qvg5Gat/xhuXNqRCzWDegN25jM9D6oBlKuN/vahQrVG2xxnL/nQSS4nmdZ87rTeV1ZYMG2hHwek3XKHu52zcO2NLmfqRb+OXA727c/wLls+zYu4a8bfk9Kyn0OWV/DmF3bkQkfuFmVMJK35zELOa/yFCfgezBWEnC9tc5y95EMnuZxkWvO503pfWTH+v8QJZDcHpFZJt3b2K66ZdpKW3GG3c1buOUF44eHGzLBSbpjdI/hXPJLgoQcGvKsk2lwJE89Alw7hRU+EiOtQE9TQ1XMoQri+umPcVhYzmyTlv0pNKeTAT3xIi1sl7Cs2Pq+ratscZ69lOO1k535kyHbHwO7dVzpdq934Q1XWtpRPfLG0Z+VRaz6eDpuZpqSdpEt5m3DzUvOOPeYO62Uz9RW8rX+o6kposu5lGgSwftGV6RSKsHauxStYoVJm66PvbRq3lcXMHq3yn+Ak3VqilsaTnX+ynXKc/Zbh9CNDtjvm5KkKJx+sdD1Pt2u1jr8WKuWqe132sjxqzcfTYcE0Je0kXddMe6uv3GF8vqY8aQu6seZckbIiWX4jbd4FE1Q+eRuQTQmRsTdolf/y0YSrUm3kM30PfOktx9lvPrQfGbLdMYFCEHY/T7drtY6/UnURKJ9jT8ujk5ynnkTG8qqIyCOBtwBXACHwRlX9TRE5BLwduAa4D/guVb04jjF2ol3zcFlYgjDAWzyQWvKsny9s0zHGa5TQeLk8emAF3bzUWHHr+9EKjwFlFw3du4l46OYlpLXcJblrJD/320TdGA7jXGW5UQoJ1cm2ntfIwIcKm2VFgPkiHF7yWSvprmPst8l1qeL+Nyq1ePWwmwmWIjvrdq9RUrqsBqFbbRz9+9RCJRfV37TKmW2PC2G7qhxdgcdcma9fS4FiDvJ+40PqbvLoJK6atebj6RjXq1IDfkpV/0lEloBPi8gHgB8EblPV14jIzwI/C/zMmMbYlk7lIP5lV/bVMLt+vlwOKsleptEM0fPIXXWs6dzrb30d4cYaGgSuafigwTRGQ7S0hVy2v9HbNLm6F5BCsXncVhYzcmLJcRjNpfu5dqguAAnuM1grCtQCOLMW9rxIqB97Pd9TtsqJYB5COYSFonZ8jXxPCVUIw5BKi8FYuaqQh5wnO+TMuDQneZzipL27T1d5zJV5nnDNHACfvW/bnStBN3l0nL/P3TDbw94Zi8yrqqdV9Z+in9eBLwBXAc8D3hzt9mbg+eMYXzeGXQ4Sn4/CXIvc6rlgVpjbce76GDbW2s9oByIqiakHSaWexxWh8ORnNo/BymJGzjhXWZ46XyPnR5mANrEhzpUGUW1plmOSeADSPB5BOr5GgqBRmU58bIyqk6rbyZmx5Jk8TgQK+Z2vfVp51FbNTgdjz5mKyDXA1wCfBI6o6mlwARe4vMMxLxORO0TkjrNnz45qqMDwy0Hi8+VWjsD8AuQLTt4VwVs5wr7nvmjHueNj6jPSdo28+0E84tXE+17wUth/iPq7Tb5A8WnPYuFpz2oag5XFjJ5xugGVKkreF4p52TEGSXwPNfsxuRIZdx3FfS/m3fZOr1EtdJJs/BHRE1e2E58n1PZlN3F
pTvK4Yl7IebLjPtOW8Zi703Qw1vm7iCwCfwn8F1W9JD2uWFXVNwJvBDh+/PjI/+KG3Tz83uK/4WOXXc/FOeXgonDsCrj3Ibi4oRz8kvD1xYBj5S9Q+uBfE54/A4B36HK8lSOupEa1EUw1dEYMnheZ5qcwy4/djpZX3D3+2Ku77p5lE3WjM+NyA7rvTJlKVYmLtgrRu0cs9SrNGYdqoCwUu3/IS+YKcx4oShBK08/JXF2r05IIzBca1whCF5jKVbei1vfcYqhkbvTeM1UKOTfLdGYSSj7nAmoxL3W5tpWVpRwHFlxj71CVak0ph9rW3SmNPDrp7k5Gb4wtmIpIHhdI/1RV3xltPiMiV6rqaRG5Enh4XOMbFSdPBbznEwG+D/MFOHdJuf8MLM7DvjlYLyn//P8+x8r5P8OvbLmDBBdUc3n3VSmDBpH9H+Dloo4zad2TXP2p/6jHDvcmjaFydCXH3aer9cARRsp8lqss7ztT5v5zzX9PlRpNAaCVSg2uXO68QzJXCMpmFKVznguEAMWc+/nu01XWtwLOrIX13KKquwaEFHIuz1kL3N+wSCOn25rX3Sw3Ir6AW81bUfI+XHtFvuvrcHQlx8lTFaph498tln/Pr9f6yi+O4/dpDJ+xyLzipqBvAr6gqr+ReOrdwIujn18MvGvUYxs1HzsR4vtQyLl8Sbni3gi2K0RuMsLjL34QLW9HC5J8kChpFQZ4S8t4l11RTxp5K0fwVi5vuCmlQQRZPEBw/xeHfp/G8BiHG9CDF1wgbZUjQ4VHXebvKO73Pcj7sFbqLBwlc4W1oBEAa2Hzz3EO8cELwQ6npULOBZ74dcjnIJcTinmPQpu4mBy/QH01sieulGW313BlKUchlrhld3enXphkdyejd8b123oq8ELgcyLymWjbfwdeA/yFiLwUeAD4zjGNb2Rc3FDmExJREO5cJblUPY9o4IJoHXGLjyplHnr+L/GxE6GThReF5538xSY3mvgzdLu3tab3Rs9DFpZ6Lm85eSrg/Z8OOL/mTrSyBN983Of6o51rZo3hMIxVlr2WY5xfr1GL/h5DbZQha2SkFQfMfUUX5GJaHY9ar7e57WaU9fPSsCyJfw5DV4pSqUVBs6J4ohSifGXeF2oC10WlKVtloufb33PShEBpSMSqWr/H3ajUdv4nteY4W+XraqBUo4n9QkE4dmRnTtaC595mLL89Vf0YbdcCAvDMUY5l3BxcFNZLWs8/+V5cQN7YZz2/wlxtPcppJkpVPI/t+UNNMvF6STkTrHCFrOMREr89dZofNIpfxHny9ljecvJUwF/9Q8DWdqMN69k1eOdHA17wNCygTji9lmPE+yVprcRyOUQXZIr5xr91Mu/X7npB6IJMISd1ebN+jfgHoS75Js9brigUGi5D8bnj82z3uHhnuxIyV/BSNT4PwsYCq1DdveR96vnhTvJ1zGZZOflgheuvGn/pizE8xr6ad9b5+hs9gsC9EakqxYJ7s5oruE/LlZpy58FnIMU590QYuPxoVDZz++IzmmTiQk74/GXPoEyi/rNjKE0gIMX5nstbPnYiZLviZDJPGiURlZp7zphsei3HiPfLd/hslPfdOXJ+nJ9sXw7S7nrxwqAgVHJ+Y6Yb50NbFzQlUWLHJae5xOfO59ihyXSjVh9z743Pfa/RnTCuJKsF7e+11rJsIVk6ZKUv04UF0zFz/VGfZz/FZ2le2K7AZfuFpz9BWNnvHi/NC1/zTV/N/ue/aEdudN9zX8RJ/4Ydb3Rnl2/gI1d8P97hK+vlLiE+G7lDbOQPEeLXl/iHCOQLyMIi/qHLey5vubjhXGda37hChdUNW9I/6fRajhHvV8x7O/7O8r7b7n52wbFT3q/d9eIZqZvNCvuKwkIRfL/xc0yc10yiuDKXIGyU6eT95lyp5zVWHSdJnipt4/NCTijkG8sS4g+T7e611ds2vnaoVvoybZjGMEJOngqacptff6PH9Uf9+len/f/yowGhPhZ/6b9x+SM
bxwEc/FK1SSYG10ty84rHceCWJ9a3vel98X6Nt5H1klKtuVlwcjy93MPGFlFhekOSjuvvlheHs6S/0+tlDE6v5RjOnk9RtF6TWY5mhNXA5TPj2ZbArs29NSopiXOkyQCdzwlHV5qD2mfv2+bSVljXVpJysCp8/oEKClG5TnNwcouKhGqbHGcyN9spV3zfmTIPXgiohe51uOqQX78PZxUYUlHqjlCfunuLRx8pNL22yTxw8rqe9C4rm53f3sBmpiMiLoFZL2k9t/meTwScPNW+fCXe/9wlVxpQrUGpDOcvNR/XKhNXakoQuO1JWvdbLymbJfepvZfxtN7DUmTLG2qjhk/Vna/12qN4vYx09OLSc369RiXKhxJ9cNqu6A7pNYyeQzp3STm6kqMWKNsVlyslCkDlGmyVw67H+ZHsGwetJN3mdhrlTttZHsaI0LajS1wKVAsbq4rvPxeQ99z9l6sh5WqzDL1VhpOnKhyYl6hsJ9w53uj/xPd2l5WtOffewoLpiGgtgSnkBN/vnF+M9y9Hpt2eOFlpu0LTca0y8dK88Oyn7Jzptu5Xrbla1sX53sbTeg/zReHgUmNWKgKHD8ALnjac1bxpXy8jHb2UY5w6X4vKTGgytO+EWx/Xubl3PhflDInOF53QWQ92Pu76qwrRSuF09xgrJd1YKHptc8XJUqB4YRPAxU11TkiJIBk/H9esrpXcPnEapHXcnsD1VxV2nWGazeDewvSCEXFxw3XTOLepdWl031wjv3jyVMD77wg4v4771A4cWHBvNBq6zgDx/29pG1bXlV+7tVKXP196S/dic6BJTv61WytNJTngJLdu+c7WMh5nw6ZsV+CnvqNzp5l+aL1WL+ObVYYlBa5vBU3n2Si5DzSx8uB57Y3tY4IQSpWQfE7w25SKlCrEi9DJ+VCpNsrANrYbJy5VlI/etYUnsDjvcWBeqIU7Z8S9sNsx8XW3K8o//stWff9kmUxyIVS83fel3r81XiwV71yqKCtLOXy/SjHPjnKhWtjbKt7dmnObBDxZ2Mx0RBRzsLbZXEe6tulk0ZOnAt750YCza41/zDDk/2/v/GPsuK77/j0z837sLpdLaSnTBimbiqXQlf+QFBOuaiOuaqSK4gRx7SKIkyJ1AiNOAadNgaKoWxStm6JAgLRxkbYJ6sSuE8RV6iZx4jiG48I/6koCHTG2flC0KBEiJS1NUeSSS+6v9+a9mdM/zr1v7ps3M2/ee/t+7O75AOTuzpu5c9998+bMPfec78HapvxMElyEzaZd3xre/XnbAerkvVlaUfF65zDHDMskz7WbGdYVmD5usxHj5WtRl8s1iiUtxa5vZgXTpLGpIq2Ie9NikFzbbuWVvHbaMXBrS/qVTo8pS1n7y5Drq51STLJuWZfzl1u5VlpyV+V9j1pcu+h4dQHPHmpMJ4X5/tjKap2HVRLXZdhO3EXWpdv1xJtC3GTDuz/LrrWOesywTPJcu5lhXYHp4+xN23W5drAGpYxlKkgVsRVW+vqLHcoY8J3Evm0nm7uLwLin0w96FhvQBIxeXLvoeHUBzx7qExiCYaJMmy1gaQHYbCTlqYiA19dEjDuK5IsboVvmjNEbEQhI1TaL6/4s27cTx3zgQTHCaxsSDexXgD8/FeGxA3HmceljDo0xwnaS59oN5Ln0hnEFrm9FWNu0D19yvF3fizkpfs2dPRICD4VKQXbfmIGzr4ZgDjvbKj6MGtBIQzFW+hlv308iiit+9ns5d0neNwBUK1L2zdY3JQJeuhLiwpUW2jFKFU1/6UqILRM7MW9mtZN0Aas7uRzEO1VYekqcPHmST58+PbHzucL09ssURcgM+nFxU1MaIePmpmy3ikfWteQaTt8TvdBKAGw1ZJvrgrr9AFCvEcI2Y3FOjM0wfRv2PSmTwVXUcYXQ7zYSes0Wd80ko1iUiKyAunucVSvKw500pndLP9RlPeTtVdxxsQIVVlrRCjikH3zJ7GtfCHxCHDOaZvIoZd+o81n
mSTlmffaex2Cm0p970TmKKLr2xmBQd/UajvrMBmTYKFPXbbm+lWw/UO+O9kvfnN51L2HbyJFZ9y8gx2w00OX+HLZvGjk72xS59AZ1BVpDmhsZS6mfGa9R0T57FeNJYogh7eTHcvdyDKM7+jdmU5XGCPbbVBsicYX3c8/2K3I+bhewupPLo8Z0QG5scKYSTL8oUzc1pW2ieZfmZWbJ3BvCH/hAvQK89/5KV+HiwAMW54y7LepOhRm2b8Mep0yGIrWiohSXrOMsWSkbbzksF0FeSoktvs2cFOUeN0TSr1r/YPWRz5O53fx0C5D7nhxQC7KLM7lNxXGSI2v/tvvYbUWFwPsVOS/7uQ9bbFwLl5dHHd8DkhamB8pHmdrUlLQake9J3qd9ovU9oFYVaUEAeMMh6lEvqlXFteumxAzSNzcVpx0BawAIjEogs2XP18jZWaFIrahoPSvrOBdrUO1D2vEjNVxb38Zms1vowK7hVwJCaNb+mq3JTEyZgVdXo05EsecBi3MewnaMrWbfwwc6T+Z287OzDEPAQl0s6GYjJ6KXe93h1iB5Zn3afWApivAt+uzzKs3sZLFxLVxeHp2ZDshORJmm2/C97nWWKAY2t4G73jjYOcvu56bixG4aAMSor21KLqtGzs4Gea7cpTkqTI/IOs69gbszpqO3+5IL2uw1EGz+a4Td662Tmpt05AOR5KQOmyqzE32JoxhLc5K6lc67zVpHtq7fKJbPiSFjL2uvxRG+w0QEjxpFPK629jo6IgOyE1Gm6TYYwHzNVtCwBYeBC68Nds6y+7mpOHHqZsCQ8x+Y1zJqs4KdfaRnoO56FgD45oa9strumrW4x731jRWsb0U9mrPHj9Tw9MVGblRRZJYi2DFs06IdJcpE04ifXG8Ani8eIBuIZGdu6e+T78m/wMQkbIfAfI1BSNyzRdGxeZ99UfDPMMdMoq29jo7IEOQJ0w/bhlUjSiulvL7G+PRXWrlpLl9/qoUnzop2b60iwUrvvV/cvjY95rEzMS5di3HhNXTauXIjERp3Z8TWkN6xJLKFyuyQ5dJ70YgHhG2p4ON5Vpi++LjlxQDHj/SeY9vo7hIAZBiqStBbWzTNpCJ87fU7DRgyVhWfupZemBmbTUYtQCdi10bqh23gB44kn4XrnrfBPEUGdVTjlVa3GsQgauHycugIzQBZa52bDblxpYXe8aAY4q8/1cI3nzZ1HEncs998mnHtZohXr6JTLPzaLcbLV0SHd6Eu7YTmi+6G8rvpOKo0tDvwPcZWMzEqcQw0Y5n5DMNclToVXbIsYhnX6qQmiuli4pOEkL+W6CExpGnOvhri3jvl9zKF2YclXYh9sxFjzaitVXza8fMpgi6KzQBZa51bTWCulp+u8sRZTgS2nZ9nLqIrzaUZyvZGmCgmzdcSmTQ3itEjCXxSpaHdAXVyVTplbru3D8ix5SA3WGnW8Kd4eR5epNy1xEpB1HHM4i4dd7pJGXUrTW/ZefSOOQNkVX6pmqhaFzddJSua0obbu2kurhawZaEuhvqOJXENSt1HmckePphddUaZPWTNrTtto1YpVigqYnkxwIlj3dUFbJv9mLRxO3F0ZwsrlOWORcK9b57LTUnq53zeDnns6Sbp9l11q3GcTxF0jj8jpNdhP/2VFlZvJfUfbVDS8kHCuZWoU2PRpgsAicG8dhNYnGPUa9RRWHJvdq1I0m2yKs3YlJlHvyF6cstLwMPvUOM6i4irsTtNwarglCUrtebQgtelqtTKKBWT5Wad1HopAJx5ZbKL+h4Bdy5LoJbFXUu8eKWJ51fCvg8y1q0rLnMZLd880NqUm1FJu6BttR9NbxkvOjOdUe56I7Cx3T2z3NgWI/mlU1FntmDTBez9rlaR39c2ge0mo1YVd269ir6pPFnVa66uAV94XItyzyKjpi3kVR6xxa2jmNFqZ6eh9FQziZ119z14j45ZioNfvNKb3OoWEi+CgKTYukMUywPu0tzODFz6urCzVCmyruk
t40KN6Yxy4TVgYQ6dHFTfk79fuCRrorctUo8U4YG6zFwPHZCn0fVtcds+dB9h+WBx8XAgv3pNI1RpwVmkTIHvIvLW7mxx61qFEJo0lHqVUK9Sl/uQIEEtogiUuIQX572prmkWMWiB8TS2aHjWNvudyaIWyNJKEGQXOa8GMu47Qfq6WKh7eMthH/M1b6jrRCmHjuYYKVPBJW+fGxuMA3UxfhZmxuXrUgsyTuUteCai9/ur3dtXrjEuvCY3voPzMuN97EyMPz8VoRYAoGQG7KbDuNVrohh49Wp3MXJ1+84Go6QtFFUese2eemHblB2T6zCoUScFZL5Gne3tmNEMxX3ZbMWdtqYVcZvHqHmpWbNPuy3rvb7lsI/FeR8XrrSw2ewOk7aGlyFRtuli6lmpLGUruGg6y+TR0R4TbiWWrNSWfvtkpctsmMoxWV/amLND8ltOGszNLeDrTzEW52XtxrpzgWyNUetCtuS9D2V3UkYqLm+fwEvECqwhTV+Ws2ZId4rV9XZXvmgRL1+LII+mvdjhsQ8dPcXUU6kz61sRrtyMx5ZSo4zGjDpjdj9lKrEU7ZOVLrM9pBapkzUBAGiGksdKzpNxP9Hyg/OjFSNXZo8ya655+xy93e9sD1u9hnSv4nvdKSVl00ts/WIXm54ma5m9xdTT7vdL1yOt4DLD6OPMmLixIWILlkaTcXMLuHaT8YnfD7G8BKxvScFwF5v+cuKYj0t3x/i/z3SLjg9D+kYXZs1g+8QXXV/vbmltnXFuJdLZ6ZQYtGBz0f52e+ABTIwXL7ewstrG0hzh5jajHcnnTiQPfu2YzYxr/xHFwK2tGE9fbGA7lIfcMuS5lys+MF/zuj6PPPd7OwZqY0ypUUZDjemYcN20jSbjxkZ3zcOra0lN0sW55DirPnRuJcITZ0c3pOMiZqi7d0rkuQGBbHdfv/3tWpzsQz2qObWKJP63I8mR3Kvu27LELAZ1rkrIqAtQGo8kXzb9mZVxrbvbNcVlNlA375hw3bQbjW65PrfI93Yzu8rLY2fimdbHjRnq7p0SgyrolNm/jGqOFXVXjDYwUd/lkSICP/szK+Na1xSX2UON6ZjoKgZuPGJu6DwhKbLsKh+5hb5nHS0gPh0GVdAps38Z1Rw1pL2MEh2cjuC15KU8HT9SGykVShkv+imMATfdpRbIDC5u21qI3ftuNoALr0lR7gfulm2//vkQN7cm3+9BuXxdfv6XPw3x8Mnu3NW8lJ8y6UJKMYMWbM7avxVJpZlTL2xjrkrwPSmAEDP3CDA0wnhoicK9zEYjGRRComDkbi9is8kgAE88v4WFutcVgLQdMnyPEXjUU1lm2sZz0PX6/QLxNAoC7iAnT57k06dPT7sbHdx0lygCbm4O9kTvAdiN962FGvDBH/Y7BtOOQcWXdeAoAh64m/Dd89yzXbWAB8NdA7VpFczInaWk929FYjgrJpI8ZllHtdfpJGUB9xK1QOq+toeIzaoGZpbLIuwQx9xJdRP9ZSr8jCfFoNfegOzqxV918+4wbrqLm35Slt1oSAGJELbrp3kpP0+c5b7pQkp/BlU+Su8fx2JIaxWvsx4KyJ3MqhnZotZKeULzcDgIttqTlQSNWNatre42kRjnWUmDGXfFm92Mzs0dyrog7X6vr8kTvv0C+b6IJBCJWtF+ohWJu/pff1aipgJfxsD3ROawVoURZQeubSbi/Qt1XXcdhnRayyAFpq2qkYvVYg7MNVwminyhRkbVRwH6r5/a8XKVoWLunuEBKTcxi4HdDmMEfnfh92lQpJq139FnT4N1TaaLcacF3u1+q7cYWw258cQs/zpqQ3pdoR3BRB6K8tJGQ0L7b252i/ffNOkXymDkidT3U+QBZA21p+KL8aA0W+WXJZolcyz3C/0ie21Oak+RAO7v34xj+WzShmzSZF07mp4jqDE1lFEscveb5bSVWcE+dQOSArRg8mmtGkzHBa7fw4EZxd2WlXrh02Afg+8Ntza4l6lWKPfBMPD
l4THwzbWfer3fA3iSoz7dB5hRKxXtZdSYGm5scFdRbSA79cPuF8UapFEGa1BtybilhcSt5ZH8HWaU+FKKGaXAdNaa64ljVQSp6z/rpk/mPItzeutIE3iEauB13UfsOvSBulTSqQWEWoV6tbBJApiKqAVAFE/3yXPUSkV7GR0BQ5awvFUjspxbidAIgVubakjLEsUi823lCn0PWJoH6kYXLWxzV2UcpRz90mP6pS+kUyxW19u566QeAQfnPdx3vN7VdhGHFsRa3NqK901+ajolZr4GLFQJNzYZa5vyWiviruAuu+TBLN8Rj5L1a4t9qGlFMjN9+mIjNx2lTNrKqKkts5CeM4vo46UhS1jeLaJt10orweg1EfczUQzcMIXLiwqVK8UUudsGXU9dXW/j3KWwx+jZm7pPiRvPbTvtybF4pn+2j/uVrSZwdZ17cnSjWKLf7cOLHSIbe5EeMubktaqf/3mW+dxHWWtXitG7mMFVLMoqom3XShfnCEvzyL2RKP1hlsLlRYXKlWKK3G3DyA2KbGDvax6AE8eqXdHDtu1axetZI6wGwL1vrnZmL7vdlo4iFzjIOWwajD1fYGRHAy9xrdcqQCXwcj/PYWQjNbVl59C5usOJY/k3drcKjLgoGdc3Jte3vYSsuQEfeaQy7a7savLcbZuN2Kzpy7pqJSD4znpq2s23sR131rA7ZfmMFfRMybEXL7cwVyXc2orNjFV2SNbDCfe8qYKV1TaeXwkBhLvekALiNm/H4rGq+pJLutOz7U40LwNzNTnfgz+YVL9IF2gHstfHy6StaGrL+NCZaUluO0CddT9bTk0ZHILcoN21aGXnsGuf9gZt1Y1akRjOLDefde/2uBed4wMP2GrGPW5IeV32Pn+51THk7bhcruqsI3m0IvvXbI8v7Y0h34usNJOy6Shl9tPUlvGhxrQk6SowyvDUq7pOOi5WVtvwvSSoBUYMoB3JOmaWmy/wk/2teABDtgV+sm+RcWy1kgp8igAAFJRJREFUZFa7F4ON2pGkvYwbG3yUTjMpm44ySrF3TW0ZHR3Bkpw45gMPAl/96yizuLZSEhId3zxUCH80tkNGNSB4HqMdSbK/Z8R2V1bbWNuMjeuXUTEhpRWfwCzHbTW5J5LUUmRMY5iUpz2YMia1RAm1CptZeDa+lzyQEInaV0DAeqPcmFix+7TrPq12lReBW2a/sm0pg6MjOCBhS4KPYt4bbqxJwwxshdmFxV2BfFeFSguQl8emzFT8JN+x2RLXa7OVSNlJbm+Miu8h5uRG7oqYb4diPIi4UB7Tlhbci7NSIAkKcofAo8RAVnxgvpakDlmsS71eTcYzb4wWatRzvEvZdJQy+2lqy3hQX9sAJBG90+7J7qYZZhcWL6tCpeST5cazyju+R6hWJHqTYTV4Ezdf2gVcMffbsMUdqcwsfEoKV+9FeyrqRTLTt5G3WS70NOnxLHIVT1vZSBkdNaYDYNWP6jVJj1GGI4qL1aVctAD5YGSlzPieuHIB666kzkzSTalJqypVfA+1IAlE8khSX9x9CJI6YwtXzzJE2TrQ1UBmhukqOYEn24FkzOaqhGoFHQUjOyvPmumlxzPIyLMhmg1lI2V0dK4/AK5KUr1G8LdYXb1DEMVOYfE/C/HwOyQlqYwK1awzrcLJRed9+mKjSy0p8AhUEUPquhZdVaV2zGiGyXzJI5mhVQMPnsdotbmTm/rSlRAvXm5NRYTdAwDHxWzzNbPcqczoiXdYqBHuOpLI4dlxvLUlBdHbtioOM2oViYR1XehRLNvTrK63EUXiJvc9RiUgBB51jOt8LRmsrDa0APfuQ2emA+BG9G4389c/lHIQgKtrwBcel+o8/VSoZp1pqcv0O++g0aBhO0Yj5J4UmLANNFrymn2IjFmUfqKIp1KOLUa34WQMtna72WScuxRidb3dGcf17V4JxGZb9m1H3Hcc3QLaBHl4bISMsB2LlCAVfxaqUrQ72R13qRnBVUla39YCyjuB50k
FnsfOxH1VqGadaanL9DtvWXFyu1/seFtsMJKN6o2iRLzdzgIJkleaJYw/KVz1oEGPi2IZKzuORd6mSoC+42jbqVU8VCvJWMUxcOJoFSeOVQvbUJWi3Yn6DQx5KRl2+5UbMhP1PeANhwi1CnBwHnh9bdo9352YbI3Ok7tdFy1SoZp1pqUuk3XeOGbc3GKcemG74yYsiha1LC8G8P0WIjM169gnR9jBqio1W9z5HO2+9jNdqHtgFl1aux9ofKIHwxpxGzS1sR3D96mvq3qrCRAY9xRUSnE/D+sStmNhj1leDDqu3Bcvt7Cy2u64crdDiWwK2yypTZ6s3242ROReXb+ziX4SyE/JuHR3jO+etzcE2TdkYPWWrIVsqnjD0NibrlV+2U3ronn0q+QyqfO2ohjNdqLtat2EQHagTFZ7rTZ3DJ8JXO1gVZEotb2zvuol+81VCVFkxN7H+Ewx6pJLzEDVY8Tc/7OyruETR7PHs8x14LqC05+R77Ex2ubYGGia2XLa9QuU+0yV8aNOSuSnZDxxluH7ksoBmMg945acq5m1Ig1AGh6T5L9XFJGmpS6TPq9NY5E0mMHdhMeWA/gmfYa5ezZphdftdjuzC7zu3933fvT22fY0WKUnApWaObuu4SzKXAdFrlyyZtS615M/1fU7w+z+O9gOkJeSYQUabM1BIHFLHqj3L+ar9OI+9/sE3HEI+MC7d69r12VahZPT52WI+LybijGIu3l5McCJY1XM15DcyEnarFdlHdCKFizUCPM1wPep8zuIut778SO1zBvNTszX7Xqk22Z9QE9ArUqo+DJ2d7+pUlhi0eaZxpw/nmWug6Li7nJMMsZ5a8EqUD9bqDlAfmHwakV++p5TxBdyY7l6U/bby8ov46ASALUqcPgg7cmqMdNSl3HPa1NhXAZ1N2e155u7esX34BH3pNYUcXDB67Sx2Yi7XMf2e+UWzLa43y/7ug2QspG7RMB8lTrruL5H8L3yaWuBR4hi7oyPT0DERng+Qx7RGrii8cy7Duw6adhmtCD3GPvQ435GzVbyeztmRCa6ejuMEfjoKFfZ4gWaRjN9dGaK/MLg77qXEEVy8weMhGAk/+JYlJBqe88ejJUoBja3gbveOO2e7F122t28E+3ZNpqtONM4AWIsg5SDIi3hF3ipNVy28oiMpTnq9LNszqvrkl6aI5y/3IJnzhFlGFJbCMD3Bh9PN+WlaiRJGyGjFcVdY+qOdyuSVCRLFIvHrNmKu/qsaTTTRx9fkIjYP3YmxtoG45ATzXv0sETzvh4xIpNr55MY0nqNMFcDmtc157QfBHnS9z1ZI73w2rR7tHfZaTHznWjP7iu1Th29W8i1UQmAuaqH7ZCxUAPaESM0JQ/nq8Adiz5ubrO4Rx3xPc8YYI8IN7cZd5uaqtshMF+T2akVavDNLNeqOVV8dFzSrpxi1e8WprCCFa1Of7qFHsrirpP6HgEUo9WW/i3N947pymobN7dsTVqpZ2r7xIzOe7Vt2vcYxYyV1bbOTieMjrYhLyUjvf0//lGIuWp3oV4imeK7uW72i7efOXII2GhICpE7XsysEoFjZlB3s3UVbmyLb9QaOUAMVjtKaqS2WowVJDmsRe25xhdIXLhpoYWmmW0BQBMi8ff2O6td7ds2rWC/1Q6W6jiMRotxbBm473i96/yuoXK3g8VgNluMje2wy80aeAT2uWN8W5FE+v/AEXFTvXQlxJlXwk7fPALuXPZx/Eh+SaR0ClPF9xB4skaadpfb9722GXbGh8ior5m12OXFIFN5StdSp4Ma0wHJWl/1CGBK1nfGlUu327Dryr4HHHCKA+w2icC9jnU/WlH8nsvXWXu0LtZbWzHOh9mpGVlpH+cuhYii8tkxYRs4+0qIe9+c5GS6FW2SouROihUgCkZbEa7cjHvSTtztnXQdg/290RLj34ri7nJrRunp7KuhuJbTQ8TAy9ciAM1cgzpI6tTqehvnVsKubWzec+wzFurewG0q40XXTAc
ka321Gojrsl6VtVRNlxHsZHSruXslAvcD1lU4yFKFDfzJSs3ISvuI4l4D1PccSNp327SVbzqYFBK7/dL1KDPtxN3eTnXGepSi2KxVOm+LKFF3ijMMqcul6/kuqUHWnldW24hyPo8oTo7RYt+zw8zd0YjoESI6R0Tniejj0+5PmizJuw/+sI8PvNvH8kFZQ1VkPcpNOl/bEB1e38OukgjcD9g0jXhAa5fnTnTTPtoxY7PRq3U7SN/SbdrKNxab7xrHsk877k0nydvuYl+ygvZ22yBzvLSRdhkkdWo7FKNI6C7QDsh3yFVSmkY6ltLLTI04EfkA/huAvwtgBcCTRPRFZj473Z51U7S+em4lwqPfiNDWNdPOelPgA3csiXs3bPU/Tpks1lVoU0HKkudOtO0xc1ck6rB9c9u07sw4wzqHbSBmWZeMGT2uz6ztLlZw4r7jdTx9sYFbW92Rx2XeSb8o4rJr2a4KlTWodh3bungHbVMZL7M2M30ngPPM/BIzhwD+EMD7p9yngXjsTIz52vQEv2cFa0iBJABJi33PJtZVOIhQvEfIdSd2qs+0RjOkti23TevOTJdSs7SjpFB52vXpbu/RMjZdtWpNx5YD+EbVaZA4iJ1Se0qrUI2SkqNMhlkzpkcBvOr8vWK2dUFEHyWi00R0+urVqxPrXBlubDAW6sBti70VZfaCgV1aQNfDgpVYtNGV8zUjOWcinG870K1Io8W+Zw/rKlyoe5LraVKYAvPPLQhOkM/14LyX60607bmfsl13dPGouOpSxc93ZxZhC5WnXZ/udt8n1Jz3FXjAWw4n0bjLiwFOHK1ioUZdkn6iAkU932WPuo8flbQKFZGoTZ04WtVZ6Iwya59Klr3pufMy86cAfAoATp48OVN3ZhvtW68S6kbsIWwzNrYlovXWZneBYgLgm4fZO++Qty/RwslQhG3G4pwoBn36Ky2sbzNubTqpCjkuZSLgTbdT53jb9uqt/CdtG6VsCz97BBxeoq4+lMX21UUjeWeTnXYVLi8GWJqXItvWVQlADAPEGNt0kLTCEpBdMNvt4+Pf2+pZA7Wu3KL3M8j7TO/r9jOoU1c/yypBDYK6b3cXszYzXQFwp/P3MQDfn1JfhqKMmlLXLYK6xd77Fci2r9vI4fTSkdv2fBVdx3eOLbCH9SqwULcBHfL7sBG4u73YtzIaZV2Vw0SkWneqbTPtph3X+9HIWSWPWbsKngRwDxHdBeASgA8B+NnpdmkwyqgpxZGs+UQsTzPLS8DD73CCmnKOT7dv8wLrRqFlKxSpw8AXd6tHEm3sHm+P/f6q9MHOGg4uAO+4h3DhNXHD3rEEgCRgqKeNHRgLZe8jrkoRONgO5TrLUg8aRmFJ3KlNXLoeoR3LjPTo7TvnZs17P4P2U9k/EM+YwgARvQ/AfwbgA/gMM/+Hov1PnjzJp0+fnkjfFEVRlLGxq9d/Zu6Ripm/DODL0+6HoiiKopRFF68URVEUZUTUmCqKoijKiKgxVRRFUZQRUWOqKIqiKCOixlRRFEVRRkSNqaIoiqKMiBpTRVEURRkRNaaKoiiKMiJqTBVFURRlRNSYKoqiKMqIqDFVFEVRlBFRY6ooiqIoIzJzVWMGhYiuAnh52v2YQQ4DuDbtTswwOj7F6PgUo+NTzDDjc42ZHxlHZybBrjemSjZEdJqZT067H7OKjk8xOj7F6PgUsx/HR928iqIoijIiakwVRVEUZUTUmO5dPjXtDsw4Oj7F6PgUo+NTzL4bH10zVRRFUZQR0ZmpoiiKooyIGlNFURRFGRE1pnsAIrpIRM8S0VNEdNpsu52I/g8RvWh+3jbtfk4KIvoMEb1ORGecbZnjQcJvEtF5InqGiH5oej2fHDlj9AkiumSuo6eI6H3Oa//SjNE5IvrR6fR6MhDRnUT0DSL6HhE9R0S/YrbrNWQoGKN9ew2pMd07/B1mvt/J7fo4gK8x8z0Avmb+3i98FkA6+TtvPH4MwD3m30cB/PaE+jhtPov
eMQKAT5rr6H5m/jIAENG9AD4E4O3mmN8iIn9iPZ08bQD/jJn/BoAHAXzMjIFeQwl5YwTs02tIjene5f0Afs/8/nsA/t4U+zJRmPlbAK6nNueNx/sB/D4LpwAcIqI3Taan0yNnjPJ4P4A/ZOYmM18AcB7AO8fWuSnDzJeZ+Tvm93UA3wNwFHoNdSgYozz2/DWkxnRvwAC+SkR/TUQfNduOMPNlQC58AG+YWu9mg7zxOArgVWe/FRTfFPY6v2xclZ9xlgb27RgR0XEADwD4NvQayiQ1RsA+vYbUmO4N3s3MPwRxN32MiN4z7Q7tIihj237NF/ttAG8FcD+AywD+k9m+L8eIiA4A+GMA/5SZbxXtmrFtz48PkDlG+/YaUmO6B2Dm75ufrwP4AsR9csW6mszP16fXw5kgbzxWANzp7HcMwPcn3LeZgJmvMHPEzDGA30Hihtt3Y0REFYiR+Bwz/4nZrNeQQ9YY7edrSI3pLoeIFoho0f4O4GEAZwB8EcCHzW4fBvBn0+nhzJA3Hl8E8A9NROaDAG5aV95+I7XO9wHIdQTIGH2IiGpEdBck0OavJt2/SUFEBODTAL7HzL/hvKTXkCFvjPbzNRRMuwPKyBwB8AW5thEA+J/M/BUiehLA54noIwBeAfBTU+zjRCGiRwE8BOAwEa0A+LcAfg3Z4/FlAO+DBERsAfiFiXd4CuSM0UNEdD/E/XYRwC8BADM/R0SfB3AWEsX5MWaOptHvCfFuAD8H4Fkiesps+1fQa8glb4x+Zr9eQyonqCiKoigjom5eRVEURRkRNaaKoiiKMiJqTBVFURRlRNSYKoqiKMqIqDFVFEVRlBFRY6ooiqIoI6LGVFFmHCJ6iIi+ZH7/SSLKrQBERPe7Za9Ktn+RiA6P2k9F2c+oMVWUXQQzf5GZf61gl/shAgKKokwQNaaKkoKIjhPR80T0u0R0hog+R0Q/QkSPm8LQ7zQyjp8hoieJ6LtE9H7n2P9HRN8x/95ltj9ERN8koj8ybX/OSLLl9eERs99jAD7obP95Ivqv5vefMv17moi+RURVAL8K4KdNYeafzml7mYi+avr93+GIkBPRn5rqQ8/ZCkRE9BEi+qSzzy8S0W9kNK0o+xZVQFKUFKak1HlIWannADwJ4GkAHwHwkxC5uLMAzjLzHxDRIYjO6AMQGbWYmRtEdA+AR5n5JBE9BNFyfTtE4PtxAP+cmR/LOH8dwIsA3mv68b8AzDPzTxDRzwM4ycy/TETPAniEmS8R0SFmXnNfL3h/vwngGjP/KhH9OIAvAbiDma8R0e3MfJ2I5sz7/tsAGgCeAfA2Zm4R0RMAfomZnx1mfBVlL6IzU0XJ5gIzP2uqXzwH4GssT57PAjgOKSjwcaNL+k0AdQBvBlAB8DvG0P1vAPc6bf4VM6+YNp8y7WTxNnP+F805/yBnv8cBfJaIfhGAP8B7e49tk5n/AsAN57V/QkRPAzgFqfJxDzNvAvg6gJ8gorcBqKghVZRuVOheUbJpOr/Hzt8x5HsTAfj7zHzOPYiIPgHgCoD7IA+rjZw2IxR///q6jJj5HxHR3wTw4wCeMgLjZelp38yefwTA32LmLSL6JuQhAQB+FyJk/jyA/zHAeRRlX6AzU0UZjr8E8I/tuicRPWC2LwG4bGafP4fBZoyW5wHcRURvNX//TNZORPRWZv42M/8bANcgM8l1AIt92v8WgH9g2vgxALc5fb9hDOnbADxoD2Dmb5v2fxbAo0O8J0XZ06gxVZTh+PcQl+4zRHTG/A0AvwXgw0R0CsAPAtgctGFmbgD4KIC/MAFIL+fs+utE9Kw5/7cg67rfAHBvUQASgH8H4D1E9B2Iu/oVs/0rAAIiesa8n1Op4z4P4HFmvgFFUbrQACRFUUphcl0/ycxfm3ZfFGXW0JmpoiiFENEhInoBwLYaUkXJRmemijJFiOgLAO5Kbf4XzPyXO9D2LwD4ldTmx5n5Y6O2rShKN2pMFUVRFGVE1M2rKIqiKCOixlR
RFEVRRkSNqaIoiqKMiBpTRVEURRmR/w/jU/dQRxYS8wAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.lmplot('mean_dist_day', 'mean_over_speed_perc', data=data, hue='cluster', palette='coolwarm', size=6, aspect=1, fit_reg=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "719601.5383469037" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Inertia is the sum of squared error for each cluster. \n", + "# Therefore the smaller the inertia the denser the cluster(closer together all the points are)\n", + "\n", + "kmeans.inertia_" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kmeans.score" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean_dist_daymean_over_speed_perccluster
id
342331193571.24283
342331321252.53253
342331372464.54273
342331137355.69223
342331099954.58253
342331385741.91100
342331243258.64203
342331143452.0280
342331132831.25343
342331248844.31193
342331125449.35403
342331294358.07453
342331253644.22223
342331154255.73193
342331217646.63433
342331417652.97323
342331420246.25353
342331134651.55273
342331066657.05263
342331352758.45303
342331218243.42233
342331359055.68373
342331226855.15180
342331425543.84223
342331197659.26323
342331266937.14413
342331069764.30293
342331211345.75160
342331334345.97233
342331143156.04393
............
3423313079180.29201
3423312932222.08111
3423311728165.10131
3423314434212.7591
3423311292170.64131
3423314315196.20141
3423310889189.99121
3423310851208.96101
3423313381219.3911
3423311448188.25101
3423311551187.13121
3423313834187.2881
3423312123192.74131
3423310590211.2081
3423312146189.8891
3423312648165.5861
3423310473191.8871
3423312226194.22121
3423310647167.2221
3423311832185.37141
3423311103203.80221
3423311192167.05101
3423314043177.48191
3423312391170.22161
3423312567209.76181
3423310685160.04101
3423312600176.1751
3423312921170.91121
3423313630176.1451
3423311533168.0391
\n", + "

4000 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " mean_dist_day mean_over_speed_perc cluster\n", + "id \n", + "3423311935 71.24 28 3\n", + "3423313212 52.53 25 3\n", + "3423313724 64.54 27 3\n", + "3423311373 55.69 22 3\n", + "3423310999 54.58 25 3\n", + "3423313857 41.91 10 0\n", + "3423312432 58.64 20 3\n", + "3423311434 52.02 8 0\n", + "3423311328 31.25 34 3\n", + "3423312488 44.31 19 3\n", + "3423311254 49.35 40 3\n", + "3423312943 58.07 45 3\n", + "3423312536 44.22 22 3\n", + "3423311542 55.73 19 3\n", + "3423312176 46.63 43 3\n", + "3423314176 52.97 32 3\n", + "3423314202 46.25 35 3\n", + "3423311346 51.55 27 3\n", + "3423310666 57.05 26 3\n", + "3423313527 58.45 30 3\n", + "3423312182 43.42 23 3\n", + "3423313590 55.68 37 3\n", + "3423312268 55.15 18 0\n", + "3423314255 43.84 22 3\n", + "3423311976 59.26 32 3\n", + "3423312669 37.14 41 3\n", + "3423310697 64.30 29 3\n", + "3423312113 45.75 16 0\n", + "3423313343 45.97 23 3\n", + "3423311431 56.04 39 3\n", + "... ... ... ...\n", + "3423313079 180.29 20 1\n", + "3423312932 222.08 11 1\n", + "3423311728 165.10 13 1\n", + "3423314434 212.75 9 1\n", + "3423311292 170.64 13 1\n", + "3423314315 196.20 14 1\n", + "3423310889 189.99 12 1\n", + "3423310851 208.96 10 1\n", + "3423313381 219.39 1 1\n", + "3423311448 188.25 10 1\n", + "3423311551 187.13 12 1\n", + "3423313834 187.28 8 1\n", + "3423312123 192.74 13 1\n", + "3423310590 211.20 8 1\n", + "3423312146 189.88 9 1\n", + "3423312648 165.58 6 1\n", + "3423310473 191.88 7 1\n", + "3423312226 194.22 12 1\n", + "3423310647 167.22 2 1\n", + "3423311832 185.37 14 1\n", + "3423311103 203.80 22 1\n", + "3423311192 167.05 10 1\n", + "3423314043 177.48 19 1\n", + "3423312391 170.22 16 1\n", + "3423312567 209.76 18 1\n", + "3423310685 160.04 10 1\n", + "3423312600 176.17 5 1\n", + "3423312921 170.91 12 1\n", + "3423313630 176.14 5 1\n", + "3423311533 168.03 9 1\n", + "\n", + "[4000 rows x 3 columns]" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], 
+ "source": [ + "data" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.7.4 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "vscode": { + "interpreter": { + "hash": "b1e6b76b6e736d29445d5c5f779c1dafb0f59893c5766b7198bc0a87a8e7acf4" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}