Skip to content
This repository was archived by the owner on Apr 3, 2025. It is now read-only.
139 changes: 139 additions & 0 deletions Clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from array import array
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture

def mean_shift(centers, predict_data=None):
    """Fit mean shift clustering and optionally predict labels for new data.

    Parameters
    ----------
    centers : 2D array like
        data to perform clustering on
    predict_data : 2D array like, optional
        data to be assigned to the fitted clusters, by default None

    Returns
    -------
    cluster_centers, labels, num_features, predicted
        cluster_centers: centers after clustering
        labels: labels of each point in ``centers``
        num_features: number of features seen during fit
        predicted: labels predicted for ``predict_data``, or None when no
            ``predict_data`` was passed

    Raises
    ------
    ValueError
        raised when ``predict_data`` is given but cannot be used for
        prediction (for example a flat, non 2D array)
    """
    clustering = MeanShift().fit(centers)

    predicted = None
    # Only predict when data was actually supplied: calling predict() on the
    # default None would fail, so fitting alone must leave `predicted` as None.
    if predict_data is not None:
        try:
            predicted = clustering.predict(predict_data)
        except (ValueError, TypeError) as err:
            # Keep the original message, but preserve the underlying cause.
            raise ValueError('Use 2D array for predict_data') from err

    return (
        clustering.cluster_centers_,
        clustering.labels_,
        clustering.n_features_in_,
        predicted,
    )

def perform_DBSCAN(data, eps, min_samples):
    """Run the DBSCAN clustering algorithm on a data set.

    Parameters
    ----------
    data : 2D array-like
        samples to cluster
    eps : float
        Maximum distance between two samples for one to count as being in
        the neighborhood of the other. It is not an upper bound on the
        distances between points of the same cluster.
    min_samples : int
        Number of samples (or total weight) in a neighborhood, the point
        itself included, required for a point to be a core point.

    Returns
    -------
    labels, num_features, core_sample_indices, components
        labels: cluster label of every sample passed to fit(); noisy samples
            are labelled -1
        num_features: number of features seen during fit
        core_sample_indices: indices of the core samples
        components: copy of each core sample found by training
    """
    estimator = DBSCAN(eps=eps, min_samples=min_samples)
    fitted = estimator.fit(data)
    return (
        fitted.labels_,
        fitted.n_features_in_,
        fitted.core_sample_indices_,
        fitted.components_,
    )

def gaussian_mixture_model(data, num_components, num_random_state=0, predict_data=None):
    """Fit a Gaussian mixture model to data and optionally predict components for new data.

    Parameters
    ----------
    data : 2D array
        Array of data to be fitted with Gaussian Mixture Model
    num_components : int
        number of underlying Gaussian distributions
    num_random_state : int
        random seed for initialization, by default 0
    predict_data : 2D array, optional
        array of data to be predicted from the model, by default None

    Returns
    -------
    predicted
        component labels predicted by the fitted model for ``predict_data``,
        or None when no ``predict_data`` was passed
    """
    model = GaussianMixture(
        n_components=num_components,
        random_state=num_random_state,
    ).fit(data)

    # Nothing to predict: return None instead of calling predict() on None,
    # which would raise.
    if predict_data is None:
        return None
    return model.predict(predict_data)

def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None):
    """Fit agglomerative (hierarchical) clustering to an array of data.

    Parameters
    ----------
    data : 2D array
        data to be fitted
    n_clusters : int, default=2
        number of clusters to find
    linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
        Linkage criterion, i.e. which distance is used between sets of
        observations. The algorithm merges the pair of clusters that
        minimizes this criterion.

        'ward' minimizes the variance of the clusters being merged.
        'average' uses the average of the distances of each observation of the two sets.
        'complete' or 'maximum' linkage uses the maximum distances between all observations of the two sets.
        'single' uses the minimum of the distances between all observations of the two sets.
    distance_threshold : float, default=None
        Linkage distance threshold above which clusters will not be merged.
        If not None, n_clusters must be None and compute_full_tree must be True.

    Returns
    -------
    num_clusters : int
        The number of clusters found by the algorithm
    labels : ndarray of shape (n_samples)
        Cluster labels for each point.
    num_leaves : int
        Number of leaves in the hierarchical tree
    num_connected_components : int
        The estimated number of connected components in the graph
    num_features : int
        number of features seen during fit
    """
    agglomerative = AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage=linkage,
        distance_threshold=distance_threshold,
    )
    agglomerative.fit(data)
    return (
        agglomerative.n_clusters_,
        agglomerative.labels_,
        agglomerative.n_leaves_,
        agglomerative.n_connected_components_,
        agglomerative.n_features_in_,
    )
51 changes: 51 additions & 0 deletions LinearRegression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

class LinearRegression:
    """
    Single-variable linear regression over a two-column dataframe.

    The first column is copied as 'x' and the second as 'y' into an internal
    dataframe (``self.df``); intermediate and predicted values are stored as
    extra columns of that dataframe.

    Parameters
    ----------
    data : (pandas.DataFrame) a pandas dataframe containing two columns, first being x-values, second
        being y-values
    """

    def __init__(self, data) -> None:
        x_column = data.iloc[:, 0]
        y_column = data.iloc[:, 1]
        self.df = pd.DataFrame({'x': x_column, 'y': y_column})
        # Coefficients stay None until get_alpha_beta() has been called.
        self.alpha = None
        self.beta = None

    def get_alpha_beta(self):
        """
        Compute the slope (beta) and intercept (alpha) of the fitted line.

        Adds the helper columns 'xy_cov' and 'x_var' to ``self.df`` and
        stores the coefficients on ``self.beta`` and ``self.alpha``.

        Returns
        -------
        a tuple (paired values) of beta and alpha, with beta first, alpha second
        """
        frame = self.df
        mean_x = np.mean(frame['x'])
        mean_y = np.mean(frame['y'])

        # Per-row deviation products and squared deviations of x.
        x_deviation = frame['x'] - mean_x
        frame['xy_cov'] = x_deviation * (frame['y'] - mean_y)
        frame['x_var'] = x_deviation ** 2

        slope = frame['xy_cov'].sum() / frame['x_var'].sum()
        intercept = mean_y - (slope * mean_x)

        self.beta = slope
        self.alpha = intercept
        return slope, intercept

    def predict_y(self):
        """
        Compute the fitted y-values and store them in the 'y_pred' column.

        The coefficients are recomputed on every call.

        Returns
        -------
        A column of DataFrame of predicted y-values
        """
        self.get_alpha_beta()
        fitted = self.alpha + self.beta * self.df['x']
        self.df['y_pred'] = fitted
        return self.df['y_pred']



22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
# Stats Models
## T-Test Tutorial
1. Get an array (or NumPy array) of data from the pre-processing module and pass it to the `t_test` class. `t_test` accepts either one or two data sets: use `t_test(data1)` to test one sample, or `t_test(data1, data2)` to test two samples.
2. Call methods on the `t_test` instance to get the desired values.

```python
# For one sample t-test, call below function to get t-test statistic based on a population mean that user wants to test
t_test(data1).one_sample_t_test(mean, 'two-sided') # For two-sided test
t_test(data1).one_sample_t_test(mean, 'less') # For one-sided, less than
t_test(data1).one_sample_t_test(mean, 'greater') # For one-sided, greater than

# For two sample t-test, call below function to get t-test statistic based on side of the test
t_test(data1, data2).two_sample_t_test('two-sided') # For two-sided test
t_test(data1, data2).two_sample_t_test('less') # For one-sided, less than
t_test(data1, data2).two_sample_t_test('greater') # For one-sided, greater than

# For paired sample t-test, simply call below function to get t-test statistic
t_test(data1, data2).paired_sample_t_test()
```

# ML Models
ml_regression.py: contains the 5 most popular machine learning regression functions, implemented using the scikit-learn library

unsupervised_clustering.py: contains the most popular unsupervised learning clustering functions, implemented using the scikit-learn library
# DL Models
58 changes: 58 additions & 0 deletions T-tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import pandas as pd
import scipy.stats as stats
import numpy as np

class t_test():
    """
    A class containing methods that perform various t-tests

    Parameters
    ----------
    data1 : (array) array of data of interest
    data2 : (array) [optional] array of data of interest, only need to pass it for
        two sample and paired sample tests
    """
    # Values accepted for the `side` argument (passed to SciPy as `alternative`).
    _SIDES = ('two-sided', 'less', 'greater')

    def __init__(self, data1, data2=None) -> None:
        self.data1 = data1
        self.data2 = data2

    @classmethod
    def _check_side(cls, side, message):
        """Raise Exception(message) unless `side` is one of the accepted values."""
        if side not in cls._SIDES:
            raise Exception(message)

    def _require_data2(self, test_name):
        """Raise ValueError when the second sample needed by `test_name` is missing."""
        if self.data2 is None:
            raise ValueError(f"data2 is required for a {test_name}")

    def one_sample_t_test(self, population_mean, side):
        """
        Perform one sample t-test with a side and population mean

        Parameters
        ----------
        population_mean : (float) population mean to be tested
        side : (str) only allows 'two-sided', 'less', 'greater', side of the test to perform

        Returns
        -------
        The result object of scipy.stats.ttest_1samp, which carries both the
        t-statistic (``statistic``) and the p-value (``pvalue``)

        Raises
        ------
        Exception
            if `side` is not one of the accepted values
        """
        self._check_side(
            side,
            "Only accept 'two-sided', 'less', or 'greater' for parameter 'side'",
        )
        return stats.ttest_1samp(self.data1, population_mean, alternative=side)

    def two_sample_t_test(self, side):
        """
        Perform two sample t-test between data1 and data2

        Parameters
        ----------
        side : (str) only allows 'two-sided', 'less', 'greater', side of the test to perform

        Returns
        -------
        The result object of scipy.stats.ttest_ind, which carries both the
        t-statistic (``statistic``) and the p-value (``pvalue``)

        Raises
        ------
        Exception
            if `side` is not one of the accepted values
        ValueError
            if the instance was created without data2
        """
        self._check_side(
            side,
            "Only accept 'two-sided', 'less', or 'greater' as a parameter",
        )
        # Fail early with a clear message instead of handing None to SciPy.
        self._require_data2("two sample t-test")
        return stats.ttest_ind(self.data1, self.data2, alternative=side)

    def paired_sample_t_test(self):
        """Perform paired sample t-test between data1 and data2

        Returns
        -------
        The result object of scipy.stats.ttest_rel, which carries both the
        t-statistic (``statistic``) and the p-value (``pvalue``)

        Raises
        ------
        ValueError
            if the instance was created without data2
        """
        # Fail early with a clear message instead of handing None to SciPy.
        self._require_data2("paired sample t-test")
        return stats.ttest_rel(self.data1, self.data2)
Loading