DigitalBiomarkerDiscoveryPipeline · MuhangTian · Jun 3, 2022 · Jun 22, 2022 · Jun 22, 2022 · Jun 22, 2022
diff --git a/ANOVA.py b/ANOVA.py
diff --git a/KMeans.py b/KMeans.py
@@ -0,0 +1,51 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+from sklearn.datasets import make_blobs
+
+def run_kmeans(data, centroids, n_init=10, max_iter=300, random_state=None):
+    '''
+    Fit K-means on the data points and return the coordinates of the cluster centers.
+
+    data:         the points to cluster, passed straight to KMeans.fit
+    centroids:    number of clusters (forwarded as n_clusters)
+    n_init:       number of runs with different centroid seeds (default 10)
+    max_iter:     maximum number of iterations per run (default 300)
+    random_state: optional seed forwarded to KMeans; leave as None to keep the
+                  original, non-reproducible behaviour
+
+    Returns KM.cluster_centers_ of the fitted model.
+    '''
+    KM = KMeans(
+        n_clusters=centroids,
+        n_init=n_init,
+        max_iter=max_iter,
+        random_state=random_state,
+    )
+    # Only the centers are returned, so fit() is enough; the per-point labels
+    # that fit_predict() would hand back were never used.
+    KM.fit(data)
+    return KM.cluster_centers_
+
+def elbow_method(data, num_k, n_init=10, max_iter=300):
+    '''
+    Help determine how many clusters to use by plotting inertia against K.
+    The idea is to find the cluster number that gives the maximum reduction in inertia.
+
+    One KMeans model is fitted for every k in range(1, num_k), so the upper bound
+    is exclusive: elbow_method(data, 10) tries k = 1 .. 9.
+
+    data:     the points to cluster
+    num_k:    exclusive upper bound on the number of clusters to try
+    n_init:   number of runs with different centroid seeds (default 10)
+    max_iter: maximum number of iterations per run (default 300)
+
+    Shows the plot and returns the list of inertia values (one per k, in order)
+    so the caller can also inspect them programmatically.
+    '''
+    k_values = range(1, num_k)
+    # fit() returns the fitted estimator, so inertia_ can be read directly;
+    # the cluster labels are not needed here.
+    inertia = [
+        KMeans(n_clusters=k, n_init=n_init, max_iter=max_iter).fit(data).inertia_
+        for k in k_values
+    ]
+
+    plt.plot(k_values, inertia, marker='o')
+    plt.xlabel('Number of clusters')
+    plt.ylabel('Inertia')
+    plt.show()
+
+    return inertia
+
+
+# Example only: generate a random sample to demonstrate the helpers above
+# (write another method to get data later?).
+# Assume we get this from the pre-processed data?
+# NOTE(review): these statements run whenever the module is imported.
+data, y = make_blobs(
+    n_samples=400,
+    centers=6,
+    cluster_std=0.60,
+    random_state=0,
+)
+
+# plt.scatter(data[:, 0], data[:, 1])
+# plt.show()
+elbow_method(data, 10)
+# print(run_kmeans(data, 6))
+
+
+
+
+
+
+
diff --git a/LinearRegression.py b/LinearRegression.py
@@ -0,0 +1,35 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+
+class LinearRegression():
+    '''
+    Simple (one predictor) least-squares regression.
+
+    Takes a dataframe whose first two columns are respectively x and y.
+    Call get_alpha_beta() for the fitted coefficients and predict_y() for the
+    fitted / predicted values.
+    '''
+
+    def __init__(self, data) -> None:
+        # Columns are picked by position, so the caller's column names do not matter
+        self.df = pd.DataFrame({'x': data.iloc[:,0], 'y': data.iloc[:,1]})
+        self.beta = None
+        self.alpha = None
+
+    def get_alpha_beta(self):
+        '''
+        Return a tuple (paired values) of beta and alpha, with beta (slope) first
+        and alpha (intercept) second. The values are also stored on self.
+
+        Raises ValueError when x is empty or has zero variance, because the slope
+        is undefined in that case (the division would otherwise yield NaN/inf).
+        '''
+        x_mean = np.mean(self.df['x'])
+        y_mean = np.mean(self.df['y'])
+        x_dev = self.df['x'] - x_mean
+        # Helper columns are kept on the frame, as before, so they stay inspectable
+        self.df['xy_cov'] = x_dev * (self.df['y'] - y_mean)
+        self.df['x_var'] = x_dev**2
+
+        x_var_sum = self.df['x_var'].sum()
+        # "not > 0" also rejects NaN, not only an exact zero
+        if not x_var_sum > 0:
+            raise ValueError("Cannot fit a regression line: x is empty or has zero variance")
+
+        beta = self.df['xy_cov'].sum() / x_var_sum
+        alpha = y_mean - (beta * x_mean)
+        self.beta, self.alpha = beta, alpha
+
+        return beta, alpha
+
+    def predict_y(self, x=None):
+        '''
+        Obtain regression results and return them as an output.
+
+        x: optional array-like of new x values. When omitted, predictions are made
+           for the x values the model was built from, stored in the 'y_pred'
+           column of the data frame, and that column is returned. When given,
+           a numpy array of predictions for those values is returned and the
+           data frame is left without a 'y_pred' update.
+        '''
+        self.get_alpha_beta()
+        if x is None:
+            self.df['y_pred'] = self.alpha + self.beta*self.df['x']
+            return self.df['y_pred']
+        return self.alpha + self.beta*np.asarray(x, dtype=float)
+
+
+
diff --git a/README.md b/README.md
@@ -1,3 +1,22 @@
 # Stats Models
+## T-Test Tutorial
+1. Get an array (or NumPy array) of data from the pre-processing module and pass it to the `t_test` class. `t_test` accepts either one or two samples: `t_test(data1)` and `t_test(data1, data2)` both work, depending on whether you want to test one sample or two.
+2. Call the methods of the `t_test` instance to get the desired values.
+
+```python
+# For one sample t-test, call below function to get t-test statistic based on a population mean that user wants to test
+t_test(data1).one_sample_t_test(mean, 'two-sided')      # For two-sided test
+t_test(data1).one_sample_t_test(mean, 'less')           # For one-sided, less than
+t_test(data1).one_sample_t_test(mean, 'greater')        # For one-sided, greater than 
+
+# For two sample t-test, call below function to get t-test statistic based on side of the test
+t_test(data1, data2).two_sample_t_test('two-sided')     # For two-sided test
+t_test(data1, data2).two_sample_t_test('less')          # For one-sided, less than
+t_test(data1, data2).two_sample_t_test('greater')       # For one-sided, greater than
+
+# For paired sample t-test, simply call below function to get t-test statistic
+t_test(data1, data2).paired_sample_t_test()
+```
+
 # ML Models
 # DL Models
diff --git a/T-tests.py b/T-tests.py
@@ -0,0 +1,32 @@
+import pandas as pd
+import scipy.stats as stats
+import numpy as np
+
+class t_test():
+    '''
+    Thin wrapper around the scipy.stats t-tests.
+
+    GUIDELINE: pass the data as array(s) into this class, then use its methods
+    to get the desired results. data2 is only needed for the two-sample and
+    paired tests. Every method returns the scipy result unchanged.
+    '''
+
+    # The values scipy accepts for its `alternative` argument
+    _VALID_SIDES = ('two-sided', 'less', 'greater')
+
+    def __init__(self, data1, data2=None) -> None:
+        self.data1 = data1
+        self.data2 = data2
+
+    def _require_data2(self, test_name):
+        '''Fail early, with a readable message, when the second sample is missing.'''
+        if self.data2 is None:
+            raise ValueError(f"{test_name} needs two samples; pass data2 when creating t_test")
+
+    def one_sample_t_test(self, population_mean, side):
+        '''
+        Test the mean of data1 against population_mean.
+        side must be 'two-sided', 'less', or 'greater'; anything else raises ValueError.
+        '''
+        if side not in self._VALID_SIDES:
+            raise ValueError("Only accept 'two-sided', 'less', or 'greater' for parameter 'side'")
+        return stats.ttest_1samp(self.data1, population_mean, alternative=side)
+
+    def two_sample_t_test(self, side):
+        '''
+        Independent two-sample test of data1 against data2.
+        side must be 'two-sided', 'less', or 'greater'; anything else raises ValueError.
+        Raises ValueError when data2 was not supplied.
+        '''
+        if side not in self._VALID_SIDES:
+            raise ValueError("Only accept 'two-sided', 'less', or 'greater' as a parameter")
+        self._require_data2('two_sample_t_test')
+        return stats.ttest_ind(self.data1, self.data2, alternative=side)
+
+    def paired_sample_t_test(self, side='two-sided'):
+        '''
+        Paired (related samples) test of data1 against data2.
+        side defaults to 'two-sided', which matches the previous behaviour;
+        'less' and 'greater' are also accepted. Anything else raises ValueError.
+        Raises ValueError when data2 was not supplied.
+        '''
+        if side not in self._VALID_SIDES:
+            raise ValueError("Only accept 'two-sided', 'less', or 'greater' for parameter 'side'")
+        self._require_data2('paired_sample_t_test')
+        return stats.ttest_rel(self.data1, self.data2, alternative=side)
+
+
+
+
+