This repository was archived by the owner on Apr 3, 2025. It is now read-only.
forked from TeddovanMierle/ML-Methods
-
Notifications
You must be signed in to change notification settings - Fork 0
Module Development, changed doc string as required, two new Jupyter notebooks #5
Open
MuhangTian
wants to merge
13
commits into
DigitalBiomarkerDiscoveryPipeline:master
Choose a base branch
from
MuhangTian:master
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 6 commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
2ee591b
add Kmeans module
5c84ba9
Finish with T-test
cd7967f
Update README.md
MuhangTian 30b3582
Update README.md
MuhangTian 0744f80
Update README.md
MuhangTian ddd95f0
Update README.md
MuhangTian 9896cd7
Finish with clustering module, edited docstring
32ec47f
Merge branch 'master' of https://github.com/MuhangTian/Model-Development
ff4d832
Include K-means
d59e0c2
Edited Changes
2f8abfc
Finish jupyter notebooks
85cdb38
Change README, change ipynb to py
2df95a6
Update README.md
MuhangTian File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Empty file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,51 @@ | ||
| import pandas as pd | ||
| import numpy as np | ||
| import matplotlib.pyplot as plt | ||
| from sklearn.cluster import KMeans | ||
| from sklearn.datasets import make_blobs | ||
|
|
||
def run_kmeans(data, centroids, n_init=10, max_iter=300):
    """
    Fit K-means to the data points and return the cluster centers.

    Parameters
    ----------
    data : array-like
        Data points to cluster, in any form accepted by
        ``sklearn.cluster.KMeans.fit``.
    centroids : int
        Number of clusters (centroids) to form.
    n_init : int, default=10
        Number of runs on different centroid seeds.
    max_iter : int, default=300
        Maximum number of iterations for a single run.

    Returns
    -------
    numpy.ndarray
        Coordinates of the cluster centers (``cluster_centers_`` of the
        fitted model).
    """
    model = KMeans(n_clusters=centroids, n_init=n_init, max_iter=max_iter)
    # Only the fitted centers are needed, so the per-sample labels that
    # ``fit_predict`` would return are not computed into a local.
    model.fit(data)
    return model.cluster_centers_
|
|
||
def elbow_method(data, num_k, n_init=10, max_iter=300):
    """
    Plot K-means inertia against the number of clusters (elbow method).

    One K-means model is fitted for each cluster count tried and its inertia
    is recorded. The resulting curve helps choose how many clusters to use:
    look for the cluster count that gives the largest reduction in inertia.

    Parameters
    ----------
    data : array-like
        Data points to cluster, in any form accepted by
        ``sklearn.cluster.KMeans.fit``.
    num_k : int
        Exclusive upper bound on the cluster counts tried; the counts
        ``1, 2, ..., num_k - 1`` are evaluated.
    n_init : int, default=10
        Number of runs on different centroid seeds for each model.
    max_iter : int, default=300
        Maximum number of iterations for a single run.

    Returns
    -------
    list
        Inertia of each fitted model, ordered by increasing cluster count.
    """
    cluster_counts = range(1, num_k)
    inertia = []
    for k in cluster_counts:
        model = KMeans(n_clusters=k, n_init=n_init, max_iter=max_iter)
        # Labels are not needed here; fitting alone populates ``inertia_``.
        model.fit(data)
        inertia.append(model.inertia_)

    plt.plot(cluster_counts, inertia, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.show()

    # Returned so callers can inspect the curve programmatically as well.
    return inertia
|
|
||
|
|
||
# Example run on a randomly generated sample (a dedicated data-loading method
# may replace this later). The blobs stand in for what would normally come
# from the pre-processed data.
data, y = make_blobs(
    n_samples=400,
    centers=6,
    cluster_std=0.60,
    random_state=0,
)

# Optional visual check of the raw points:
# plt.scatter(data[:, 0], data[:, 1])
# plt.show()
elbow_method(data, 10)
# print(run_kmeans(data, 6))
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,35 @@ | ||
| import pandas as pd | ||
| import numpy as np | ||
| import matplotlib.pyplot as plt | ||
|
|
||
class LinearRegression:
    """
    Simple least-squares linear regression of y on a single x.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least two columns. The first column is taken as x and
        the second as y (selected by position, not by name).

    Attributes
    ----------
    df : pandas.DataFrame
        New frame built from the two input columns, named ``'x'`` and
        ``'y'``. Intermediate and predicted columns are added to it by the
        methods below.
    beta : float or None
        Fitted slope; ``None`` until `get_alpha_beta` has been called.
    alpha : float or None
        Fitted intercept; ``None`` until `get_alpha_beta` has been called.
    """

    def __init__(self, data) -> None:
        self.df = pd.DataFrame({'x': data.iloc[:, 0], 'y': data.iloc[:, 1]})
        self.beta = None
        self.alpha = None

    def get_alpha_beta(self):
        """
        Compute the slope and intercept of the least-squares line.

        The per-row products and squared deviations are stored on ``self.df``
        as the columns ``'xy_cov'`` and ``'x_var'``, and the results are
        cached on ``self.beta`` and ``self.alpha``.

        Returns
        -------
        tuple
            Paired values ``(beta, alpha)``: slope first, intercept second.

        Notes
        -----
        If every x value is identical the denominator is zero and the
        result is not a finite number.
        """
        x_mean = np.mean(self.df['x'])
        y_mean = np.mean(self.df['y'])

        # slope = sum((x - x_mean) * (y - y_mean)) / sum((x - x_mean) ** 2)
        self.df['xy_cov'] = (self.df['x'] - x_mean) * (self.df['y'] - y_mean)
        self.df['x_var'] = (self.df['x'] - x_mean) ** 2
        beta = self.df['xy_cov'].sum() / self.df['x_var'].sum()

        # The fitted line passes through the point of means.
        alpha = y_mean - (beta * x_mean)
        self.beta, self.alpha = beta, alpha

        return beta, alpha

    def predict_y(self):
        """
        Compute the fitted y value for every x in the data.

        The coefficients are (re)computed first, then the predictions are
        stored on ``self.df`` as the column ``'y_pred'``.

        Returns
        -------
        pandas.Series
            The ``'y_pred'`` column, ``alpha + beta * x`` for each row.
        """
        self.get_alpha_beta()
        self.df['y_pred'] = self.alpha + self.beta * self.df['x']
        return self.df['y_pred']
|
|
||
|
|
||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,22 @@ | ||
| # Stats Models | ||
| ## T-Test Tutorial | ||
| 1. Get an array (or NumPy array) of data from the pre-processing module and pass it to the `t_test` class. `t_test` accepts either one or two samples: use `t_test(data1)` to test one sample, or `t_test(data1, data2)` to test two samples. | ||
| 2. Call the methods of the `t_test` object to get the desired values. | ||
|
|
||
| ```python | ||
| # For one sample t-test, call below function to get t-test statistic based on a population mean that user wants to test | ||
| t_test(data1).one_sample_t_test(mean, 'two-sided') # For two-sided test | ||
| t_test(data1).one_sample_t_test(mean, 'less') # For one-sided, less than | ||
| t_test(data1).one_sample_t_test(mean, 'greater') # For one-sided, greater than | ||
|
|
||
| # For two sample t-test, call below function to get t-test statistic based on side of the test | ||
| t_test(data1, data2).two_sample_t_test('two-sided') # For two-sided test | ||
| t_test(data1, data2).two_sample_t_test('less') # For one-sided, less than | ||
| t_test(data1, data2).two_sample_t_test('greater') # For one-sided, greater than | ||
|
|
||
| # For paired sample t-test, simply call below function to get t-test statistic | ||
| t_test(data1, data2).paired_sample_t_test() | ||
| ``` | ||
|
|
||
| # ML Models | ||
| # DL Models |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| import pandas as pd | ||
| import scipy.stats as stats | ||
| import numpy as np | ||
|
|
||
class t_test:
    """
    Thin wrapper around the t-tests in ``scipy.stats``.

    Pass the data as array(s) when constructing the object, then call the
    method for the desired test.

    Parameters
    ----------
    data1 : array-like
        First (or only) sample.
    data2 : array-like, optional
        Second sample. Required by `two_sample_t_test` and
        `paired_sample_t_test`; not used by `one_sample_t_test`.
    """

    # Values accepted for the ``side`` argument; each is passed straight
    # through to SciPy as ``alternative``.
    _VALID_SIDES = ('two-sided', 'less', 'greater')

    def __init__(self, data1, data2=None) -> None:
        self.data1 = data1
        self.data2 = data2

    def _require_second_sample(self):
        """Raise ``ValueError`` when no second sample was supplied."""
        if self.data2 is None:
            raise ValueError(
                "A second sample (data2) is required for this test")

    def one_sample_t_test(self, population_mean, side):
        """
        Test the mean of ``data1`` against a hypothesised population mean.

        Parameters
        ----------
        population_mean : float
            Population mean under the null hypothesis.
        side : {'two-sided', 'less', 'greater'}
            Alternative hypothesis.

        Returns
        -------
        object
            Result of ``scipy.stats.ttest_1samp`` (t statistic and p-value).

        Raises
        ------
        ValueError
            If ``side`` is not one of the accepted values.
        """
        if side not in self._VALID_SIDES:
            raise ValueError("Only accept 'two-sided', 'less', or 'greater' for parameter 'side'")
        return stats.ttest_1samp(self.data1, population_mean, alternative=side)

    def two_sample_t_test(self, side):
        """
        Compare the means of the independent samples ``data1`` and ``data2``.

        Parameters
        ----------
        side : {'two-sided', 'less', 'greater'}
            Alternative hypothesis.

        Returns
        -------
        object
            Result of ``scipy.stats.ttest_ind`` (t statistic and p-value).

        Raises
        ------
        ValueError
            If ``side`` is not one of the accepted values, or if no second
            sample was given at construction.
        """
        if side not in self._VALID_SIDES:
            raise ValueError("Only accept 'two-sided', 'less', or 'greater' as a parameter")
        self._require_second_sample()
        return stats.ttest_ind(self.data1, self.data2, alternative=side)

    def paired_sample_t_test(self):
        """
        Run a paired-sample t-test on ``data1`` and ``data2``.

        Returns
        -------
        object
            Result of ``scipy.stats.ttest_rel`` (t statistic and p-value).

        Raises
        ------
        ValueError
            If no second sample was given at construction.
        """
        self._require_second_sample()
        return stats.ttest_rel(self.data1, self.data2)
|
|
||
|
|
||
|
|
||
|
|
||
|
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please refer to the style guide for the conventions we are hoping to follow (https://numpydoc.readthedocs.io/en/latest/format.html#short-summary). Block comments (docstrings) should go below the `def` statement, not above it.