Skip to content
This repository was archived by the owner on Apr 3, 2025. It is now read-only.
Empty file added ANOVA.py
Empty file.
51 changes: 51 additions & 0 deletions KMeans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

def run_kmeans(data, centroids, n_init=10, max_iter=300, random_state=None):
    """Fit K-Means to ``data`` and return the cluster-center coordinates.

    Parameters
    ----------
    data : array-like
        Data points to cluster; passed unchanged to ``KMeans.fit``.
    centroids : int
        Number of clusters (centroids) to form.
    n_init : int, default 10
        Number of runs with different centroid seeds.
    max_iter : int, default 300
        Maximum number of iterations for a single run.
    random_state : int or None, default None
        Seed forwarded to ``KMeans``. Leave as ``None`` to keep the
        original, unseeded behaviour; pass an int for reproducible centers.

    Returns
    -------
    numpy.ndarray
        The ``cluster_centers_`` attribute of the fitted model.
    """
    model = KMeans(
        n_clusters=centroids,
        n_init=n_init,
        max_iter=max_iter,
        random_state=random_state,
    )
    # Only the centers are returned, so fit() is sufficient; the labels
    # produced by fit_predict() were never used.
    model.fit(data)
    return model.cluster_centers_

def elbow_method(data, num_k, n_init=10, max_iter=300, save=False,
                 save_path='elbow_plot.png'):
    """Plot K-Means inertia against the number of clusters (elbow method).

    A K-Means model is fitted for every cluster count in
    ``range(1, num_k)`` and its inertia recorded. The "elbow" of the
    resulting curve, i.e. the cluster count that gives the largest
    reduction in inertia, helps decide how many clusters to use.

    Parameters
    ----------
    data : array-like
        Data points to cluster; passed unchanged to ``KMeans.fit``.
    num_k : int
        Exclusive upper bound on the cluster counts tried: models are
        fitted for k = 1, 2, ..., ``num_k - 1``.
    n_init : int, default 10
        Number of runs with different centroid seeds for each k.
    max_iter : int, default 300
        Maximum number of iterations for a single run.
    save : bool, default False
        When true, write the plot to ``save_path`` before showing it.
    save_path : str, default 'elbow_plot.png'
        Destination handed to ``plt.savefig``; only used when ``save`` is
        true.

    Returns
    -------
    list of float
        Inertia of the fitted model for each k, in increasing order of k.

    Raises
    ------
    ValueError
        If ``num_k`` is less than 2, because no cluster count would be
        tried and the plot would be empty.
    """
    if num_k < 2:
        raise ValueError(
            "num_k must be at least 2 so that at least one cluster count "
            "(k = 1) is evaluated"
        )

    cluster_counts = range(1, num_k)
    inertia = [
        KMeans(n_clusters=k, n_init=n_init, max_iter=max_iter)
        .fit(data)
        .inertia_
        for k in cluster_counts
    ]

    plt.plot(cluster_counts, inertia, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    if save:
        # Save before show(): once the window has been shown and closed the
        # current figure may no longer hold the plot.
        plt.savefig(save_path)
    plt.show()

    return inertia


if __name__ == "__main__":
    # Example only: generate a random sample to demonstrate the helpers.
    # The guard keeps this demo (and its plot window) from running when
    # the module is imported.
    # TODO: replace with a loader for the pre-processed data.
    data, _ = make_blobs(
        n_samples=400, centers=6, cluster_std=0.60, random_state=0
    )

    # Optional: inspect the raw sample first.
    # plt.scatter(data[:, 0], data[:, 1])
    # plt.show()
    elbow_method(data, 10)
    # Optional: print the fitted centers for the chosen cluster count.
    # print(run_kmeans(data, 6))







35 changes: 35 additions & 0 deletions LinearRegression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

class LinearRegression:
    """Simple (one-variable) least-squares linear regression.

    Fits ``y = alpha + beta * x`` to a two-column data frame and exposes
    methods returning the coefficients and the fitted values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first column holds the x values and whose second
        column holds the y values (columns are selected by position).

    Attributes
    ----------
    df : pandas.DataFrame
        Frame with columns ``'x'`` and ``'y'`` built from ``data``. The
        methods below add the helper columns ``'xy_cov'``, ``'x_var'`` and
        ``'y_pred'`` to it.
    beta : float or None
        Slope; ``None`` until ``get_alpha_beta`` has run.
    alpha : float or None
        Intercept; ``None`` until ``get_alpha_beta`` has run.
    """

    def __init__(self, data) -> None:
        self.df = pd.DataFrame({'x': data.iloc[:, 0], 'y': data.iloc[:, 1]})
        self.beta = None
        self.alpha = None

    def get_alpha_beta(self):
        """Compute, store and return the slope and intercept.

        Returns
        -------
        tuple
            ``(beta, alpha)``: slope first, intercept second.

        Raises
        ------
        ValueError
            If the summed squared deviation of x is zero (all x values
            equal, or no rows), because the slope is then undefined.
        """
        x_mean = np.mean(self.df['x'])
        y_mean = np.mean(self.df['y'])
        x_dev = self.df['x'] - x_mean

        # The per-row terms stay on the frame, as before, so callers that
        # inspect these columns keep working.
        self.df['xy_cov'] = x_dev * (self.df['y'] - y_mean)
        self.df['x_var'] = x_dev ** 2

        x_var_total = self.df['x_var'].sum()
        if x_var_total == 0:
            # Dividing by zero here would silently yield a NaN/inf slope.
            raise ValueError(
                "Cannot fit a regression line: x has zero variance "
                "(all x values are equal or the data is empty)"
            )

        beta = self.df['xy_cov'].sum() / x_var_total
        alpha = y_mean - (beta * x_mean)
        self.beta, self.alpha = beta, alpha

        return beta, alpha

    def predict_y(self, x=None):
        """Return predictions from the fitted line.

        The coefficients are recomputed from ``self.df`` on every call.

        Parameters
        ----------
        x : array-like, optional
            New x values to predict for. When omitted, predictions are
            made for the stored x values, saved in ``self.df['y_pred']``
            and returned.

        Returns
        -------
        pandas.Series or numpy.ndarray
            ``self.df['y_pred']`` when ``x`` is omitted; otherwise an array
            of predictions for the supplied ``x`` (``self.df`` gets no
            ``'y_pred'`` column in that case).
        """
        self.get_alpha_beta()
        if x is None:
            self.df['y_pred'] = self.alpha + self.beta * self.df['x']
            return self.df['y_pred']
        return self.alpha + self.beta * np.asarray(x, dtype=float)



19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,22 @@
# Stats Models
## T-Test Tutorial
1. Get an array (or NumPy array) of data from the pre-processing module, then pass it to the `t_test()` class. `t_test()` accepts either one or two data sets: `t_test(data1)` and `t_test(data1, data2)` both work, depending on whether you want to test one sample or two samples.
2. Call methods on the `t_test()` instance to get the desired values.

```python
# For one sample t-test, call below function to get t-test statistic based on a population mean that user wants to test
t_test(data1).one_sample_t_test(mean, 'two-sided') # For two-sided test
t_test(data1).one_sample_t_test(mean, 'less') # For one-sided, less than
t_test(data1).one_sample_t_test(mean, 'greater') # For one-sided, greater than

# For two sample t-test, call below function to get t-test statistic based on side of the test
t_test(data1, data2).two_sample_t_test('two-sided') # For two-sided test
t_test(data1, data2).two_sample_t_test('less') # For one-sided, less than
t_test(data1, data2).two_sample_t_test('greater') # For one-sided, greater than

# For paired sample t-test, simply call below function to get t-test statistic
t_test(data1, data2).paired_sample_t_test()
```

# ML Models
# DL Models
32 changes: 32 additions & 0 deletions T-tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pandas as pd
import scipy.stats as stats
import numpy as np

'''
GUIDELINE: pass data as an array(s) into T-test class
Then use functions in this class to get desired results
'''

class t_test:
    """Thin wrapper around the ``scipy.stats`` t-tests.

    Parameters
    ----------
    data1 : array-like
        First (or only) sample.
    data2 : array-like, optional
        Second sample; required by the two-sample and paired tests.
    """

    # Values accepted for the ``side`` argument; each is forwarded to
    # SciPy as ``alternative``.
    _VALID_SIDES = ('two-sided', 'less', 'greater')

    def __init__(self, data1, data2=None) -> None:
        self.data1 = data1
        self.data2 = data2

    @classmethod
    def _check_side(cls, side):
        """Raise ``ValueError`` unless ``side`` is a supported alternative."""
        if side not in cls._VALID_SIDES:
            raise ValueError(
                "Only accept 'two-sided', 'less', or 'greater' "
                "for parameter 'side'"
            )

    def _require_data2(self, test_name):
        """Raise ``ValueError`` when no second sample was supplied."""
        if self.data2 is None:
            raise ValueError(
                "{} requires a second sample; pass data2 when "
                "constructing t_test".format(test_name)
            )

    def one_sample_t_test(self, population_mean, side):
        """Test the mean of ``data1`` against ``population_mean``.

        Parameters
        ----------
        population_mean : float
            Hypothesised population mean.
        side : {'two-sided', 'less', 'greater'}
            Alternative hypothesis.

        Returns
        -------
        The result object of ``scipy.stats.ttest_1samp``.

        Raises
        ------
        ValueError
            If ``side`` is not one of the accepted values.
        """
        self._check_side(side)
        return stats.ttest_1samp(
            self.data1, population_mean, alternative=side
        )

    def two_sample_t_test(self, side):
        """Run an independent two-sample t-test on ``data1`` and ``data2``.

        Parameters
        ----------
        side : {'two-sided', 'less', 'greater'}
            Alternative hypothesis.

        Returns
        -------
        The result object of ``scipy.stats.ttest_ind``.

        Raises
        ------
        ValueError
            If ``side`` is not accepted or ``data2`` was not supplied.
        """
        self._check_side(side)
        self._require_data2('two_sample_t_test')
        return stats.ttest_ind(self.data1, self.data2, alternative=side)

    def paired_sample_t_test(self, side='two-sided'):
        """Run a paired (related-samples) t-test on ``data1`` and ``data2``.

        Parameters
        ----------
        side : {'two-sided', 'less', 'greater'}, default 'two-sided'
            Alternative hypothesis. The default reproduces the original
            argument-free call.

        Returns
        -------
        The result object of ``scipy.stats.ttest_rel``.

        Raises
        ------
        ValueError
            If ``side`` is not accepted or ``data2`` was not supplied.
        """
        self._check_side(side)
        self._require_data2('paired_sample_t_test')
        return stats.ttest_rel(self.data1, self.data2, alternative=side)