From 15075821ba4eab3321ed02a4e7797ee7d5359c28 Mon Sep 17 00:00:00 2001
From: saanikat
Date: Tue, 17 Sep 2024 09:03:30 -0400
Subject: [PATCH 01/18] readme updated

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0f95dca..300606a 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ assert results
 
 To see the available schemas, you can run:
 ```
-from bedms.constants import AVAILABLE_SCHEMAS
+from bedms.const import AVAILABLE_SCHEMAS
 print(AVAILABLE_SCHEMAS)
 
 # >> ['ENCODE', 'FAIRTRACKS', 'BEDBASE']

From 50969451a450643de1445ed086b5d24ab8c100e9 Mon Sep 17 00:00:00 2001
From: saanikat
Date: Tue, 17 Sep 2024 09:07:15 -0400
Subject: [PATCH 02/18] updated readme

---
 README.md                  | 8 ++++++++
 bedms/attr_standardizer.py | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 300606a..5b07838 100644
--- a/README.md
+++ b/README.md
@@ -33,5 +33,13 @@ print(AVAILABLE_SCHEMAS)
 
 # >> ['ENCODE', 'FAIRTRACKS', 'BEDBASE']
 
+```
+Or you can run:
+
+```
+schemas = model.show_available_schemas()
+
+print(schemas)
+
 ```
 AVAILABLE_SCHEMAS is a list of available schemas that you can use to standardize your metadata.
diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py
index 6fa3b2e..a95fd83 100644
--- a/bedms/attr_standardizer.py
+++ b/bedms/attr_standardizer.py
@@ -248,7 +248,7 @@ def standardize(
         )
 
     @staticmethod
-    def get_available_schemas() -> list[str]:
+    def show_available_schemas() -> list[str]:
         """
         Stores a list of available schemas.

From 6a585aebf95d963f49c7e747ee447feb826a9752 Mon Sep 17 00:00:00 2001
From: saanikat
Date: Tue, 17 Sep 2024 13:38:39 -0400
Subject: [PATCH 03/18] changes

---
 README.md                  | 7 -------
 bedms/attr_standardizer.py | 2 +-
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 5b07838..3075eb6 100644
--- a/README.md
+++ b/README.md
@@ -34,12 +34,5 @@ print(AVAILABLE_SCHEMAS)
 # >> ['ENCODE', 'FAIRTRACKS', 'BEDBASE']
 
 ```
-Or you can run:
-
-```
-schemas = model.show_available_schemas()
-
-print(schemas)
-
 ```
 AVAILABLE_SCHEMAS is a list of available schemas that you can use to standardize your metadata.
diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py
index a95fd83..6fa3b2e 100644
--- a/bedms/attr_standardizer.py
+++ b/bedms/attr_standardizer.py
@@ -248,7 +248,7 @@ def standardize(
         )
 
     @staticmethod
-    def show_available_schemas() -> list[str]:
+    def get_available_schemas() -> list[str]:
         """
         Stores a list of available schemas.
From 273fa20dd627298ac1d3de0353741935da8076de Mon Sep 17 00:00:00 2001
From: saanikat
Date: Tue, 1 Oct 2024 16:49:18 -0400
Subject: [PATCH 04/18] example configs for custom training and std

---
 custom_config.yaml   | 10 ++++++++++
 training_config.yaml | 30 ++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 custom_config.yaml
 create mode 100644 training_config.yaml

diff --git a/custom_config.yaml b/custom_config.yaml
new file mode 100644
index 0000000..a7e06b9
--- /dev/null
+++ b/custom_config.yaml
@@ -0,0 +1,10 @@
+paths:
+  model_pth: "path/to/custom/trained/model.pth" #Path to where you saved the custom model
+  label_encoder_pth: "/path/to/label_encoder.pkl" #Path to where you saved the Label Encoder
+  vectorizer_pth: "path/to/vectorizer.pkl" #Path to where you saved the Bag of Words vectorizer
+model:
+  input_size_bow: 1857 #Size of the vocabulary for Bag of Words encoding
+  input_size_embeddings: 384 #Size of the input embeddings for values and attributes
+  hidden_size: 32 #Hidden size the model was trained on
+  output_size: 18 #Number of classes the model predicts into
+  dropout_prob: 0.113 #Dropout probability you had set for the model
\ No newline at end of file
diff --git a/training_config.yaml b/training_config.yaml
new file mode 100644
index 0000000..1236709
--- /dev/null
+++ b/training_config.yaml
@@ -0,0 +1,30 @@
+dataset:
+  values_dir_pth: "/path/to/training/values/directory" #Path to the values directory
+  headers_dir_pth: "path/to/training/headers/directory" #Path to the attributes directory
+
+data_split:
+  train_set: 8000 #Number of csv value-attribute file pairs for training set
+  test_set: 100 #Number of csv value-attribute file pairs for testing set
+  val_set: 100 #Number of csv value-attribute file pairs for validation set
+
+model:
+  hidden_size: 32 #Hidden size for training the model
+  dropout_prob: 0.113 #Dropout probability for training the model
+
+training:
+  batch_size: 32 #Batch size for training
+  num_epochs: 20 #Number of training epochs
+  learning_rate: 0.001 #Learning rate of the model
+  l2_regularization: 0.001 #L2 regularization strength applied to the optimizer (Avoids overfitting, can be set to 0)
+  model_pth: "path/to/custom/trained/model.pth" #Path to where you want to save the custom model
+  num_cluster: 3 #Number of clusters for KMeans
+  vectorizer_pth: "path/to/vectorizer.pkl" #Path to where you want to save the Bag of Words vectorizer
+  label_encoder_pth: "/path/to/label_encoder.pkl" #Path to where you want to save the Label Encoder
+  sentence_transformer_model: "all-MiniLM-L6-v2" #Name of the sentence transformer model you wish to use from HuggingFace
+  bow_drops: 2 #Number of Bag of Words columns you wish to drop out during the training process (Avoids overfitting, can be set to 0)
+
+visualization:
+  accuracy_fig_pth: "/path/to/accuracy_fig.svg" #Path to where you wish to save the Accuracy Curve image
+  loss_fig_pth: "/path/to/loss_fig.svg" #Path to where you wish to save the Loss Curve image
+  confusion_matrix_fig_pth: "/path/to/confusion_matrix.svg" #Path to where you wish to save the confusion matrix image
+  roc_fig_pth: "/path/to/roc_fig.svg" #Path to where you wish to save the ROC curve image
\ No newline at end of file

From 089755a900d826b8cee3695dbbe52a0e84eed219 Mon Sep 17 00:00:00 2001
From: saanikat
Date: Wed, 2 Oct 2024 11:29:01 -0400
Subject: [PATCH 05/18] training module

---
 bedms/attr_standardizer.py |  78 +++--
 bedms/const.py             |   4 +-
 bedms/train.py             | 242 +++++++++++++
 bedms/utils.py             |   7 +-
bedms/utils_train.py | 694 +++++++++++++++++++++++++++++++++++++ 5 files changed, 999 insertions(+), 26 deletions(-) create mode 100644 bedms/train.py create mode 100644 bedms/utils_train.py diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py index 6fa3b2e..2372f0a 100644 --- a/bedms/attr_standardizer.py +++ b/bedms/attr_standardizer.py @@ -3,13 +3,14 @@ """ import logging -from typing import Dict, Tuple, Union +from typing import Dict, Tuple, Union, Optional import pickle import peppy import torch from torch import nn import torch.nn.functional as torch_functional - +import yaml +from huggingface_hub import hf_hub_download from .const import ( AVAILABLE_SCHEMAS, CONFIDENCE_THRESHOLD, @@ -40,7 +41,7 @@ get_any_pep, load_from_huggingface, ) -from huggingface_hub import hf_hub_download + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(PROJECT_NAME) @@ -51,16 +52,37 @@ class AttrStandardizer: This is the AttrStandardizer class which holds the models for Attribute Standardization. """ - def __init__(self, schema: str, confidence: int = CONFIDENCE_THRESHOLD) -> None: + def __init__( + self, + schema: str, + custom_param: Optional[str] = None, + confidence: int = CONFIDENCE_THRESHOLD, + ) -> None: """ Initializes the attribute standardizer with user provided schema, loads the model. :param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS" + :param str custom_param: User provided config file for + custom parameters, if they choose "CUSTOM" schema. :param int confidence: Confidence threshold for the predictions. """ self.schema = schema - self.model, self.vectorizer, self.label_encoder = self._load_model() self.conf_threshold = confidence + self.custom_param = custom_param + + if self.schema == "CUSTOM" and self.custom_param: + self.custom_param = self._load_custom_param(self.custom_param) + self.model, self.vectorizer, self.label_encoder = self._load_model() + + def _load_custom_param(self, config_pth: str) -> Dict[str, Tuple]: + """ + Loads the custom parameters from the config file provided by the user. + + :param str config_pth: Path to the config file which has the custom parameters. + :return Dict[str, Tuple]: Custom Parameters dictionary. + """ + with open(config_pth, "r", encoding="utf-8") as file: + return yaml.safe_load(file) def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: """ @@ -95,9 +117,19 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: OUTPUT_SIZE_BEDBASE, DROPOUT_PROB, ) + if self.schema == "CUSTOM": + return ( + self.custom_param["model"]["input_size_bow"], + self.custom_param["model"]["input_size_embeddings"], + self.custom_param["model"]["input_size_embeddings"], + self.custom_param["model"]["hidden_size"], + self.custom_param["model"]["output_size"], + self.custom_param["model"]["dropout_prob"], + ) + raise ValueError( f"Schema not available: {self.schema}." 
- "Presently, three schemas are available: ENCODE , FAIRTRACKS, BEDBASE" + "Presently, four schemas are available: ENCODE , FAIRTRACKS, BEDBASE, CUSTOM" ) def _load_model(self) -> Tuple[nn.Module, object, object]: @@ -118,29 +150,31 @@ def _load_model(self) -> Tuple[nn.Module, object, object]: elif self.schema == "BEDBASE": filename_vc = BEDBASE_VECTORIZER_FILENAME filename_lb = BEDBASE_LABEL_ENCODER_FILENAME - - vectorizer = None - label_encoder = None - - vc_path = hf_hub_download( - repo_id=REPO_ID, - filename=filename_vc, - ) + elif self.schema == "CUSTOM": + vc_path = self.custom_param["paths"]["vectorizer_pth"] + lb_path = self.custom_param["paths"]["label_encoder_pth"] + state_dict = torch.load(self.custom_param["paths"]["model_pth"]) + else: + raise ValueError(f"Schema not available: {self.schema}") + + if self.schema != "CUSTOM": + vc_path = hf_hub_download( + repo_id=REPO_ID, + filename=filename_vc, + ) + lb_path = hf_hub_download( + repo_id=REPO_ID, + filename=filename_lb, + ) + model = load_from_huggingface(self.schema) + state_dict = torch.load(model) with open(vc_path, "rb") as f: vectorizer = pickle.load(f) - lb_path = hf_hub_download( - repo_id=REPO_ID, - filename=filename_lb, - ) - with open(lb_path, "rb") as f: label_encoder = pickle.load(f) - model = load_from_huggingface(self.schema) - state_dict = torch.load(model) - ( input_size_values, input_size_values_embeddings, diff --git a/bedms/const.py b/bedms/const.py index 86916c6..e200fda 100644 --- a/bedms/const.py +++ b/bedms/const.py @@ -2,9 +2,9 @@ This module contains constant values used in the 'bedms' package. """ -PROJECT_NAME = "bedmess" +PROJECT_NAME = "bedms" -AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE"] +AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE", "CUSTOM"] PEP_FILE_TYPES = ["yaml", "csv"] REPO_ID = "databio/attribute-standardizer-model6" MODEL_ENCODE = "model_encode.pth" diff --git a/bedms/train.py b/bedms/train.py new file mode 100644 index 0000000..d2816fd --- /dev/null +++ b/bedms/train.py @@ -0,0 +1,242 @@ +""" This is the training script with which the user can train their own models.""" + +import logging +import torch +from torch import nn +from torch import optim +from sklearn.metrics import ( + precision_score, + recall_score, + f1_score, +) +import yaml +from .utils_train import ( + load_from_dir, + accumulate_data, + training_encoding, + data_loader, + train_model, + plot_learning_curve, + model_testing, + plot_confusion_matrix, + auc_roc_curve, +) +from .const import PROJECT_NAME, EMBEDDING_SIZE +from .model import BoWSTModel + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(PROJECT_NAME) + + +class TrainStandardizer: + """ + This is the training class responsible for + managing the training process for the standardizer model. + """ + + def __init__(self, config: str) -> None: + """ + Initializes the TrainStandardizer object with the given configuration. + + :param str config: Path to the config file which has the training parameters provided by the user. 
+ """ + self.label_encoder = None + self.vectorizer = None + self.train_loader = None + self.val_loader = None + self.test_loader = None + self.output_size = None + self.criterion = None + self.train_accuracies = None + self.val_accuracies = None + self.train_losses = None + self.val_losses = None + self.model = None + self.fpr = None + self.tpr = None + self.roc_auc = None + self.all_labels = None + self.all_preds = None + + with open(config, "r") as file: + self.config = yaml.safe_load(file) + + def load_encode_data(self) -> None: + """ + Loads and prepares the encoded training, testing and validation datasets. + """ + values_files_list = load_from_dir(self.config["dataset"]["values_dir_pth"]) + headers_files_list = load_from_dir(self.config["dataset"]["headers_dir_pth"]) + + if len(values_files_list) != len(headers_files_list): + logger.error( + f"Mismatch in number of value files ({len(values_files_list)}) \ + and header files ({len(headers_files_list)})" + ) + return + + total_files = len(values_files_list) + + paired_files = list(zip(values_files_list, headers_files_list)) + + train_size = self.config["data_split"]["train_set"] + test_size = self.config["data_split"]["test_set"] + val_size = self.config["data_split"]["val_set"] + + if train_size + val_size + test_size > total_files: + logger.error( + f"Data split sizes exceed total number of files: " + f"train({train_size}) + val({val_size}) + \ + test({test_size}) > total_files({total_files})" + ) + return + + train_files = paired_files[:train_size] + val_files = paired_files[train_size : train_size + val_size] + test_files = paired_files[ + train_size + val_size : train_size + val_size + test_size + ] + + logger.info(f"Training on {len(train_files)} file sets") + logger.info(f"Validating on {len(val_files)} file sets") + logger.info(f"Testing on {len(test_files)} file sets") + + x_values_train_list, x_headers_train_list, y_train_list = accumulate_data( + train_files + ) + x_values_test_list, x_headers_test_list, y_test_list = accumulate_data( + test_files + ) + x_values_val_list, x_headers_val_list, y_val_list = accumulate_data(val_files) + + logger.info("Accumulation Done.") + + num_cluster = self.config["training"]["num_cluster"] + vectorizer_pth = self.config["training"]["vectorizer_pth"] + label_encoder_pth = self.config["training"]["label_encoder_pth"] + sentence_transformer_model = self.config["training"][ + "sentence_transformer_model" + ] + + ( + train_encoded_data, + test_encoded_data, + val_encoded_data, + self.label_encoder, + self.vectorizer, + ) = training_encoding( + x_values_train_list, + x_headers_train_list, + y_train_list, + x_values_test_list, + x_headers_test_list, + y_test_list, + x_values_val_list, + x_headers_val_list, + y_val_list, + num_cluster, + vectorizer_pth, + label_encoder_pth, + sentence_transformer_model, + ) + logger.info("Encoding Done.") + + batch_size = self.config["training"]["batch_size"] + self.train_loader = data_loader(train_encoded_data, batch_size) + self.test_loader = data_loader(test_encoded_data, batch_size) + self.val_loader = data_loader(val_encoded_data, batch_size) + + logger.info("Loading Done.") + + def training(self): + """ + Trains the model. 
+ """ + input_size_values = len(self.vectorizer.vocabulary_) + input_size_values_embeddings = EMBEDDING_SIZE + input_size_headers = EMBEDDING_SIZE + hidden_size = self.config["model"]["hidden_size"] + self.output_size = len(self.label_encoder.classes_) # Number of classes + dropout_prob = self.config["model"]["dropout_prob"] + + self.model = BoWSTModel( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + self.output_size, + dropout_prob, + ) + + learning_rate = self.config["training"]["learning_rate"] + self.criterion = nn.CrossEntropyLoss() + l2_reg_lambda = self.config["training"]["l2_regularization"] + optimizer = optim.Adam( + self.model.parameters(), lr=learning_rate, weight_decay=l2_reg_lambda + ) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # Training the model + num_epochs = self.config["training"]["num_epochs"] + + model_pth = self.config["training"]["model_pth"] + bow_drops = self.config["training"]["bow_drops"] + + ( + self.train_accuracies, + self.val_accuracies, + self.train_losses, + self.val_losses, + self.fpr, + self.tpr, + self.roc_auc, + ) = train_model( + self.model, + self.train_loader, + self.val_loader, + self.criterion, + optimizer, + device, + num_epochs, + self.output_size, + model_pth, + bow_drops, + ) + + logger.info("Training Done.") + + def testing(self): + """ + Model testing. + """ + self.all_preds, self.all_labels = model_testing( + self.model, self.test_loader, self.criterion + ) + precision = precision_score(self.all_labels, self.all_preds, average="macro") + recall = recall_score(self.all_labels, self.all_preds, average="macro") + f1 = f1_score(self.all_labels, self.all_preds, average="macro") + logger.info(f"Precision:{precision}, Recall: {recall}, F1 Score: {f1}") + + def plot_visualizations(self): + """ + Generates visualizations for training ( accuracy and loss curves) + and testing( confusion matrix, roc curve) + """ + num_epochs = self.config["training"]["num_epochs"] + accuracy_fig_pth = self.config["visualization"]["accuracy_fig_pth"] + loss_fig_pth = self.config["visualization"]["loss_fig_pth"] + cm_pth = self.config["visualization"]["confusion_matrix_fig_pth"] + roc_pth = self.config["visualization"]["roc_fig_pth"] + plot_learning_curve( + num_epochs, + self.train_accuracies, + self.val_accuracies, + self.train_losses, + self.val_losses, + accuracy_fig_pth, + loss_fig_pth, + ) + plot_confusion_matrix( + self.all_labels, self.all_preds, self.label_encoder.classes_, cm_pth + ) + auc_roc_curve(self.fpr, self.tpr, self.roc_auc, self.output_size, roc_pth) diff --git a/bedms/utils.py b/bedms/utils.py index 0dcb613..bd8c0a9 100644 --- a/bedms/utils.py +++ b/bedms/utils.py @@ -1,7 +1,7 @@ """ This module has all util functions for 'bedms' """ - +import logging import warnings from collections import Counter from typing import Any, List, Optional, Tuple, Union @@ -15,7 +15,6 @@ from sentence_transformers import SentenceTransformer from sklearn.cluster import KMeans from sklearn.preprocessing import LabelEncoder - from .const import ( MODEL_BEDBASE, MODEL_ENCODE, @@ -23,8 +22,12 @@ NUM_CLUSTERS, REPO_ID, PEP_FILE_TYPES, + PROJECT_NAME, ) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(PROJECT_NAME) + # TODO : convert to single np array before converting to tensor warnings.filterwarnings( "ignore", diff --git a/bedms/utils_train.py b/bedms/utils_train.py new file mode 100644 index 0000000..f3adda4 --- /dev/null +++ b/bedms/utils_train.py @@ -0,0 +1,694 @@ +""" +This 
module has all training util functions for 'bedms' +""" + +import os +import logging +from glob import glob +import warnings +from collections import Counter +from typing import List, Tuple, Iterator, Dict +import pickle +import random + + +import numpy as np +import pandas as pd +import torch +from torch.utils.data import TensorDataset, DataLoader +from sentence_transformers import SentenceTransformer +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.cluster import KMeans +from sklearn.preprocessing import LabelEncoder, label_binarize +from sklearn.metrics import ( + confusion_matrix, + auc, + roc_curve, +) +import matplotlib.pyplot as plt +import seaborn as sns +from .const import PROJECT_NAME + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(PROJECT_NAME) + +warnings.filterwarnings( + "ignore", + category=UserWarning, + message="Creating a tensor from a list of numpy.ndarrays is extremely slow.", +) + +def load_from_dir(dir: str) -> List[str]: + """ + Loads each file from the directory path. + + :param str dir: Path to the directory. + :return: List:paths to each file in the directory. + """ + return glob(os.path.join(dir, "*.csv")) + + +def load_and_preprocess(file_path: str) -> pd.DataFrame: + """ + Loads and Preprocesses each csv file as a Pandas DataFrame. + + :param str file_path: Path to each csv file. + :return pandas.DataFrame: df of each csv file. + """ + df = pd.read_csv(file_path, sep=",") + df.replace("NA", np.nan, inplace=True) + for column in df.columns: + most_common_val = df[column].mode().iloc[0] + df[column] = df[column].fillna(most_common_val) + return df + + +def accumulate_data( + files: List[Tuple[str, str]] +) -> Tuple[List[List[List[str]]], List[List[List[str]]], List[pd.Index]]: + """ + Accumulates data from multiple files into lists. + + :param List[Tuple[str, str]] files: List containing + sublists of values or header files. + :return Tuple[List[List[List[str]]], List[List[List[str]]],[List[str]]: + Lists of values, headers, labels. + A tuple containing three lists: + - A nested list of values (list of tables where + each table is a list of lists for columns), + - A nested list of headers (similar structure to values), + - A list of Pandas Index objects containing column labels. + """ + x_values_list = [] + x_headers_list = [] + y_list = [] + for values_file, headers_file in files: + df_values = load_and_preprocess(values_file) + df_headers = load_and_preprocess(headers_file) + df_values = df_values.fillna("") + df_headers = df_headers.fillna("") + y = df_values.columns + table_list = [] + # values list + for col in df_values.columns: + sublist_list = df_values[col].tolist() + table_list.append(sublist_list) + x_values_list.append(table_list) + # headers list + table_list = [] + for col in df_headers.columns: + sublist_list = df_headers[col].tolist() + table_list.append(sublist_list) + x_headers_list.append(table_list) + # y list + y_list.append(y) + + return x_values_list, x_headers_list, y_list + + +def lazy_loading(data_list: List, batch_size: int) -> Iterator[List]: + """ + Lazy loading for data in batches. + + :param List data_list: List of data to be loaded lazily. + :param int batch_size: Size of batch. + """ + for i in range(0, len(data_list), batch_size): + yield data_list[i : i + batch_size] + + +def get_top_training_cluster_averaged( + embeddings: List[torch.tensor], num: int +) -> torch.Tensor: + """ + Computes the clutser-averaged top training embeddings using k-means clustering. 
+ + :param List[torch.tensor] embeddings: List of embedding tensors to cluster. + :param int num: Number of clusters to be created using k-means. + :return torch.Tensor: A tensor representing the + average of embeddings in the most common cluster. + """ + flattened_embeddings = [embedding.tolist() for embedding in embeddings] + kmeans = KMeans(n_clusters=num, random_state=0).fit(flattened_embeddings) + labels_kmeans = kmeans.labels_ + cluster_counts = Counter(labels_kmeans) + most_common_cluster = max(cluster_counts, key=cluster_counts.get) + most_common_indices = [ + idx for idx, label in enumerate(labels_kmeans) if label == most_common_cluster + ] + most_common_embeddings = [ + torch.tensor(embeddings[idx]) for idx in most_common_indices + ] + + if most_common_embeddings: + top_k_average = torch.mean( + torch.stack(most_common_embeddings), dim=0 + ).unsqueeze(0) + else: + top_k_average = torch.zeros_like(most_common_embeddings[0]).unsqueeze(0) + return top_k_average + + +def training_encoding( + x_values_train_list: List[List[List[str]]], + x_headers_train_list: List[List[List[str]]], + y_train_list: List[pd.Index], + x_values_test_list: List[List[List[str]]], + x_headers_test_list: List[List[List[str]]], + y_test_list: List[pd.Index], + x_values_val_list: List[List[List[str]]], + x_headers_val_list: List[List[List[str]]], + y_val_list: List[pd.Index], + num_cluster: int, + vectorizer_pth: str, + label_encoder_pth: str, + sentence_transformer_model: str, +) -> Tuple[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + LabelEncoder, + List[str], + CountVectorizer, +]: + """ + Generates encoded headers and values. + + :param List[List[List[str]]] x_values_train_list: + Nested list containing the training set for values. + :param List[List[List[str]]] x_headers_train_list: + Nested list containing the training set for headers. + :param List[pd.Index] y_train_list: + List of the column labels ( attributes) for training. + :param List[List[List[str]]] x_values_test_list: + Nested list containing the testing set for values. + :param List[List[List[str]]] x_headers_test_list: + Nested list containing the testing set for headers. + :param List[pd.Index] y_test_list: + List of the column labels ( attributes) for testing. + :param List[List[List[str]]] x_values_val_list: + Nested list containing the validation set for values. + :param List[List[List[str]]] x_headers_val_list: + Nested list containing the validation set for headers. + :param List[pd.Index] y_val_list: + List of the column labels ( attributes) for validation. 
+ :return Tuple[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + LabelEncoder, + List[str], + CountVectorizer]: Returns a tuple of + - training dataset tensor + - testing dataset tensor + - validation dataset tensor + - trained label encoder + - list of unique values encountered during training + - Trained vectorizer for Bag of Words representation + + """ + # Bag of Words + flattened_list = [ + item for sublist in x_values_train_list for col in sublist for item in col + ] + vectorizer = CountVectorizer() + vectorizer.fit(flattened_list) + with open(vectorizer_pth, "wb") as f: + pickle.dump(vectorizer, f) + vocabulary_size = len(vectorizer.vocabulary_) + logger.info(f"Vocabulary size: {vocabulary_size}") + + # Sentence Transformers + model_name = sentence_transformer_model + sentence_encoder = SentenceTransformer(model_name) + + # Label Encoders + label_encoder = LabelEncoder() + flat_y_train = [",".join(y) for y in y_train_list] + individual_values = [value.strip() for y in flat_y_train for value in y.split(",")] + unique_values = set(individual_values) + unique_values_list = list(unique_values) + label_encoder.fit(unique_values_list) + + with open(label_encoder_pth, "wb") as f: + pickle.dump(label_encoder, f) + + def encode_data( + x_values_list: List[List[List[str]]], + x_headers_list: List[List[List[str]]], + y_list: List[pd.Index], + num_cluster: int, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + This nested function encodes the values, headers and labels data. + It is called for thrice - training, testing, validation. + + :param List[List[List[str]]] x_values_list: Nested list containing values. + :param List[List[List[str]]] x_headers_list: Nested list containing headers. + :param List[pd.Index] y_list: Labels (attributes) list. + :param int num_cluster: Number of clusters to be generated. 
+ """ + x_values_bow_tensors = [] + x_values_embeddings_tensors = [] + x_headers_embeddings_tensors = [] + y_tensors = [] + + for x_values, x_headers, y in zip(x_values_list, x_headers_list, y_list): + + for i in range(len(x_values)): # Iterate over columns + # BoW Representation + x_values_bow = vectorizer.transform(x_values[i]).toarray() + x_values_bow_tensor = ( + torch.tensor(x_values_bow, dtype=torch.float32) + .mean(dim=0) + .unsqueeze(0) + .clone() + .detach() + ) + + # Embeddings for Values + embeddings_values = [ + sentence_encoder.encode(str(value), show_progress_bar=False) + for value in x_values[i] + ] + + top_k_average_values = get_top_training_cluster_averaged( + embeddings_values, num_cluster + ) # Average of all embeddings + x_values_embeddings_tensor = top_k_average_values.clone().detach() + + # Embeddings for Headers + embeddings_headers = [ + sentence_encoder.encode(str(header), show_progress_bar=False) + for header in x_headers[i] + ] + + top_k_average_headers = get_top_training_cluster_averaged( + embeddings_headers, num_cluster + ) # Average of all embeddings + x_headers_embeddings_tensor = top_k_average_headers.clone().detach() + + # Labels + y_col = label_encoder.transform([y[i]]) + y_col_tensor = torch.tensor(y_col, dtype=torch.long).clone().detach() + + x_values_bow_tensors.append(x_values_bow_tensor) + x_values_embeddings_tensors.append(x_values_embeddings_tensor) + x_headers_embeddings_tensors.append(x_headers_embeddings_tensor) + y_tensors.append(y_col_tensor) + + x_values_bow_tensor = torch.cat( + x_values_bow_tensors, dim=0 + ) # this has [num_cols, vocab_size] + x_values_embeddings_tensor = torch.cat( + x_values_embeddings_tensors, dim=0 + ) # [num_cols, embedding_dim] + x_headers_embeddings_tensor = torch.cat(x_headers_embeddings_tensors, dim=0) + y_tensor = torch.cat(y_tensors, dim=0) # [num_cols] + + return ( + x_values_bow_tensor, + x_values_embeddings_tensor, + x_headers_embeddings_tensor, + y_tensor, + ) + + train_data = encode_data( + x_values_train_list, x_headers_train_list, y_train_list, num_cluster + ) + test_data = encode_data( + x_values_test_list, x_headers_test_list, y_test_list, num_cluster + ) + val_data = encode_data( + x_values_val_list, x_headers_val_list, y_val_list, num_cluster + ) + + return ( + train_data, + test_data, + val_data, + label_encoder, + vectorizer, + ) + + +def data_loader( + encoded_data: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + batch_size: int, +) -> DataLoader: + """ + Creates a DataLoader from encoded tensor data. + + :param [torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] encoded_data: + Tuple containing tensors for + values bag of words, values embeddings, headers embeddings, and labels. + :param int batch_size: The number of samples per batch for the DataLoader. + :return DataLoader: A PyTorch DataLoader which yields + batches of data from the given tensors. + """ + ( + x_values_bow_tensor, + x_values_embeddings_tensor, + x_headers_embeddings_tensor, + y_tensor, + ) = encoded_data + # Convert data to TensorDataset + dataset = TensorDataset( + x_values_bow_tensor, + x_values_embeddings_tensor, + x_headers_embeddings_tensor, + y_tensor, + ) + # Create DataLoader + return DataLoader(dataset, batch_size=batch_size, shuffle=True) + + +def drop_bow(bow_tensor: torch.Tensor, num_drops: int) -> torch.Tensor: + """ + Randomly drops a specified number of columns in the + Bag of Words tensor for regularization. + + :param torch.Tensor bow_tensor: Bag of Words tensor. 
+ :param int num_drops: Number of columns to be randomly + dropped from the Bag of Words tensor. + :return torch.Tensor: Bag of Words tensor with dropped columns. + """ + num_columns = bow_tensor.size(0) + columns = list(range(num_columns)) + columns_to_drop = random.sample(columns, num_drops) + + mask = torch.ones(num_columns, dtype=torch.bool) + mask[columns_to_drop] = False + mask = mask.unsqueeze(1).expand_as(bow_tensor) + + # Apply the mask to the BoW tensor + dropped_bow_tensor = bow_tensor.clone() + dropped_bow_tensor[~mask] = 0.0 + + return dropped_bow_tensor + + +def train_model( + model: torch.nn.Module, + train_loader: DataLoader, + val_loader: DataLoader, + criterion: torch.nn.Module, + optimizer: torch.optim.Optimizer, + device: torch.device, + num_epochs: int, + output_size: int, + model_pth: str, + bow_drops: int, +) -> Tuple[ + List[float], + List[float], + List[float], + List[float], + Dict[int, np.ndarray], + Dict[int, np.ndarray], + Dict[int, float], +]: + """ + Trains and validates the neural network model. + + :param torch.nn.Module model: The neural network model to be trained. + :param DataLoader train_loader: DataLoader for the training set. + :param DataLoader val_loader: DataLoader for the validation set. + :param torch.nn.Module criterion: The loss function used to compute loss during training. + :param torch.optim.Optimizer optimizer: The optimizer to update the model parameters. + :param torch.device device: The device (CPU or GPU) on which the model will be trained. + :param int num_epochs: The number of epochs to train the model. + :param int output_size: The size of the model's output layer. + :param str model_pth: The file path to where the model would be saved. + :param int bow_drops: The number of Bag of Words columns to be dropped. + :return Tuple: + - List[float]: Train accuracy per epoch. + - List[float]: Validation accuracy per epoch. + - List[float]: Train loss per epoch. + - List[float]: Validation loss per epoch. + - Dict[int, np.ndarray]: Dictionary of False Positive Rates (FPR). + - Dict[int, np.ndarray]: Dictionary of True Positive Rates (TPR). + - Dict[int, float]: Dictionary of Area Under the ROC Curve for different classes. 
+ """ + patience = 3 + train_accuracies = [] + val_accuracies = [] + train_losses = [] + val_losses = [] + + best_val_loss = float("inf") + best_epoch = 0 + early_stop = False + + model.train() + + for epoch in range(num_epochs): + total_samples = 0 + correct_predictions = 0 + train_loss = 0.0 + for x_values_bow, x_values_embeddings, x_headers_embeddings, y in train_loader: + x_values_bow = x_values_bow.to(device) + x_values_embeddings = x_values_embeddings.to(device) + x_headers_embeddings = x_headers_embeddings.to(device) + y = y.to(device) + + x_values_bow = drop_bow(x_values_bow, bow_drops) + + optimizer.zero_grad() + outputs = model(x_values_bow, x_values_embeddings, x_headers_embeddings) + + loss = criterion(outputs, y) + loss.backward() + optimizer.step() + train_loss += loss.item() * x_values_bow.size(0) + + _, predicted = torch.max(outputs, 1) + total_samples += y.size(0) + correct_predictions += (predicted == y).sum().item() + + train_accuracy = correct_predictions / total_samples * 100 + train_accuracies.append(train_accuracy) + train_loss = train_loss / len(train_loader.dataset) + train_losses.append(train_loss) + + model.eval() + val_loss = 0.0 + correct_predictions_val = 0 + total_samples_val = 0 + y_true = [] + y_scores = [] + with torch.no_grad(): + for ( + x_values_bow, + x_values_embeddings, + x_headers_embeddings, + y, + ) in val_loader: + x_values_bow = x_values_bow.to(device) + x_values_embeddings = x_values_embeddings.to(device) + x_headers_embeddings = x_headers_embeddings.to(device) + y = y.to(device) + outputs = model(x_values_bow, x_values_embeddings, x_headers_embeddings) + loss = criterion(outputs, y) + val_loss += loss.item() * x_values_bow.size(0) + + _, predicted = torch.max(outputs, 1) + total_samples_val += y.size(0) + correct_predictions_val += (predicted == y).sum().item() + y_true.extend(y.cpu().numpy()) + y_scores.extend(outputs.cpu().numpy()) + + val_loss = val_loss / len(val_loader.dataset) + val_accuracy = correct_predictions_val / total_samples_val * 100 + val_accuracies.append(val_accuracy) + val_losses.append(val_loss) + + print( + f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss:.4f}, \ + Training Accuracy: {train_accuracy:.2f}%, Validation Loss: {val_loss:.4f}, \ + Validation Accuracy: {val_accuracy:.2f}%" + ) + + # Early stop + + if val_loss < best_val_loss: + best_val_loss = val_loss + best_epoch = epoch + torch.save(model.state_dict(), model_pth) + elif epoch - best_epoch >= patience: + early_stop = True + if early_stop: + print(f"Early stop at {best_epoch + 1} epoch.") + y_true = label_binarize(y_true, classes=list(range(output_size))) + + # Convert to numpy arrays + y_true = np.array(y_true) + y_scores = np.array(y_scores) + + # Calculate ROC curves and AUC + fpr = {} + tpr = {} + roc_auc = {} + + for i in range( + output_size + ): # Replace output_size with your actual number of classes + fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_scores[:, i]) + roc_auc[i] = auc(fpr[i], tpr[i]) + + return train_accuracies, val_accuracies, train_losses, val_losses, fpr, tpr, roc_auc + + +def model_testing( + model: torch.nn.Module, test_loader: DataLoader, loss_fn: torch.nn.Module +) -> Tuple[List[int], List[int], torch.Tensor]: + """ + This functions tests the model. + + :param torch.nn.Module model: The trained model. + :param DataLoader test_loader: DataLoader for the testing set. + :param torch.nn.Module loss_fn: The loss function used to compute loss. + :return Tuple: + - List[int]: List of all the predictions made by the model. 
+ - List[int]: List of all the true labels ( Ground truth) + - torch.Tensor: Logist from the model for the test dataset. + """ + all_preds = [] + all_labels = [] + model.eval() + total_loss_test = 0.0 + total_correct_test = 0 + total_samples_test = 0 + with torch.no_grad(): + for values_batch, bow_batch, headers_batch, labels in test_loader: + outputs = model(values_batch, bow_batch, headers_batch) + loss = loss_fn(outputs, labels) + total_loss_test += loss.item() + _, predicted_test = torch.max(outputs, 1) + correct_test = (predicted_test == labels).sum().item() + total_correct_test += correct_test + total_samples_test += labels.size(0) + all_preds.extend(predicted_test.cpu().numpy()) + all_labels.extend(labels.cpu().numpy()) + test_accuracy = total_correct_test / total_samples_test + test_loss = total_loss_test / len(test_loader) + logger.info(f"Test Accuracy: {test_accuracy}, Test Loss: {test_loss}") + + return all_preds, all_labels + + +def plot_learning_curve( + num_epochs: int, + train_accuracies: List[float], + val_accuracies: List[float], + train_losses: List[float], + val_losses: List[float], + accuracy_fig_pth: str, + loss_fig_pth: str, +) -> None: + """ + Plots the learning curves - accuracy and loss for Training and Validation of the model. + + :param int num_epochs: Number of epochs for which the model was trained. + :param List[float] train_accuracies: List of training accuracies for each epoch. + :param List[float] val_accuracies: List of validation accuracies for each epoch. + :param List[float] train_losses: List of training losses for each epoch. + :param List[float] val_losses: List of validation losses for each epoch. + :param str accuracy_fig_pth: Path where the accuracy curve figure will be saved. + :param str loss_fig_pth: Path where the loss curve figure will be saved. + """ + + # accuracy + plt.plot(range(1, num_epochs + 1), train_accuracies, label="Training Accuracy") + plt.plot(range(1, num_epochs + 1), val_accuracies, label="Validation Accuracy") + plt.xlabel("Epoch") + plt.ylabel("Accuracy") + plt.title("Learning Curve") + plt.legend() + plt.grid(True) + plt.savefig(accuracy_fig_pth, format="svg") + plt.show() + plt.close() + # loss + plt.plot(range(1, num_epochs + 1), train_losses, label="Training Loss") + plt.plot(range(1, num_epochs + 1), val_losses, label="Validation Loss") + plt.xlabel("Epoch") + plt.ylabel("Loss") + plt.title("Learning Curve") + plt.legend() + plt.grid(True) + plt.savefig(loss_fig_pth, format="svg") + plt.show() + plt.close() + + +def plot_confusion_matrix( + y_true: List[int], + y_pred: List[int], + unique_values_list: List[str], + confusion_matrix_fig_pth: str, +) -> None: + """ + Plots confusion matrix for the test data. + + :param List[int] y_true: List of true labels ( Ground Truth) + :param List[int] y_pred: List of predictions made by the model. + :param List[str] unique_values_list: List of all the classes that the model predicted. + :param str confusion_matrix_fig_pth: Path where the confusion matrix figure will be saved. 
+ """ + conf_matrix = confusion_matrix(y_true, y_pred) + plt.figure(figsize=(12, 12)) + sns.heatmap( + conf_matrix, + annot=True, + fmt="d", + cmap="Blues", + xticklabels=np.unique(unique_values_list), + yticklabels=np.unique(unique_values_list), + ) + plt.title("Confusion Matrix") + plt.xlabel("Predicted Label") + plt.ylabel("True Label") + plt.savefig(confusion_matrix_fig_pth, format="svg") + plt.show() + plt.close() + class_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1) + for i, acc in enumerate(class_accuracy): + print(f"Accuracy for class {i}: {acc:.4f}") + + +def auc_roc_curve( + fpr: Dict[int, np.ndarray], + tpr: Dict[int, np.ndarray], + roc_auc: Dict[int, float], + output_size: int, + roc_fig_pth: str, +) -> None: + """ + Plots the ROC Curve. + + :param Dict[int, np.ndarray] fpr: Dictionary of False Positive Rates + :param Dicr[int, np.ndarray] tpr: Dictionary of True Positive Rates + :param Dict[int, float] roc_auc: Dictionary of Area Under Curve for ROC for different classes. + :param int output_size: The number of classes the model predicted into. + :param str roc_fig_pth: Path to where the ROC figure will be saved. + """ + plt.figure(figsize=(12, 12)) + for i in range(output_size): + plt.plot( + fpr[i], + tpr[i], + lw=2, + label="ROC curve (class %d) (AUC = %0.2f)" % (i, roc_auc[i]), + ) + + plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--") + plt.xlim([0.0, 1.0]) + plt.ylim([0.0, 1.05]) + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + plt.title("Receiver Operating Characteristic (ROC) Curve") + plt.legend(loc="lower right") + plt.savefig(roc_fig_pth, format="svg") + plt.show() + plt.close() From a4863b080de024086789dd8b96d0954f27f022d3 Mon Sep 17 00:00:00 2001 From: saanikat Date: Wed, 2 Oct 2024 11:31:59 -0400 Subject: [PATCH 06/18] black --- bedms/attr_standardizer.py | 2 +- bedms/utils.py | 1 + bedms/utils_train.py | 37 +++++++++++++++++++------------------ 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py index 2372f0a..7b93f1d 100644 --- a/bedms/attr_standardizer.py +++ b/bedms/attr_standardizer.py @@ -62,7 +62,7 @@ def __init__( Initializes the attribute standardizer with user provided schema, loads the model. :param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS" - :param str custom_param: User provided config file for + :param str custom_param: User provided config file for custom parameters, if they choose "CUSTOM" schema. :param int confidence: Confidence threshold for the predictions. """ diff --git a/bedms/utils.py b/bedms/utils.py index bd8c0a9..fb63805 100644 --- a/bedms/utils.py +++ b/bedms/utils.py @@ -1,6 +1,7 @@ """ This module has all util functions for 'bedms' """ + import logging import warnings from collections import Counter diff --git a/bedms/utils_train.py b/bedms/utils_train.py index f3adda4..09f861b 100644 --- a/bedms/utils_train.py +++ b/bedms/utils_train.py @@ -38,6 +38,7 @@ message="Creating a tensor from a list of numpy.ndarrays is extremely slow.", ) + def load_from_dir(dir: str) -> List[str]: """ Loads each file from the directory path. @@ -69,12 +70,12 @@ def accumulate_data( """ Accumulates data from multiple files into lists. - :param List[Tuple[str, str]] files: List containing + :param List[Tuple[str, str]] files: List containing sublists of values or header files. 
- :return Tuple[List[List[List[str]]], List[List[List[str]]],[List[str]]: + :return Tuple[List[List[List[str]]], List[List[List[str]]],[List[str]]: Lists of values, headers, labels. A tuple containing three lists: - - A nested list of values (list of tables where + - A nested list of values (list of tables where each table is a list of lists for columns), - A nested list of headers (similar structure to values), - A list of Pandas Index objects containing column labels. @@ -125,7 +126,7 @@ def get_top_training_cluster_averaged( :param List[torch.tensor] embeddings: List of embedding tensors to cluster. :param int num: Number of clusters to be created using k-means. - :return torch.Tensor: A tensor representing the + :return torch.Tensor: A tensor representing the average of embeddings in the most common cluster. """ flattened_embeddings = [embedding.tolist() for embedding in embeddings] @@ -174,23 +175,23 @@ def training_encoding( """ Generates encoded headers and values. - :param List[List[List[str]]] x_values_train_list: + :param List[List[List[str]]] x_values_train_list: Nested list containing the training set for values. - :param List[List[List[str]]] x_headers_train_list: + :param List[List[List[str]]] x_headers_train_list: Nested list containing the training set for headers. - :param List[pd.Index] y_train_list: + :param List[pd.Index] y_train_list: List of the column labels ( attributes) for training. - :param List[List[List[str]]] x_values_test_list: + :param List[List[List[str]]] x_values_test_list: Nested list containing the testing set for values. - :param List[List[List[str]]] x_headers_test_list: + :param List[List[List[str]]] x_headers_test_list: Nested list containing the testing set for headers. - :param List[pd.Index] y_test_list: + :param List[pd.Index] y_test_list: List of the column labels ( attributes) for testing. - :param List[List[List[str]]] x_values_val_list: + :param List[List[List[str]]] x_values_val_list: Nested list containing the validation set for values. - :param List[List[List[str]]] x_headers_val_list: + :param List[List[List[str]]] x_headers_val_list: Nested list containing the validation set for headers. - :param List[pd.Index] y_val_list: + :param List[pd.Index] y_val_list: List of the column labels ( attributes) for validation. :return Tuple[ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], @@ -240,7 +241,7 @@ def encode_data( num_cluster: int, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ - This nested function encodes the values, headers and labels data. + This nested function encodes the values, headers and labels data. It is called for thrice - training, testing, validation. :param List[List[List[str]]] x_values_list: Nested list containing values. @@ -339,11 +340,11 @@ def data_loader( """ Creates a DataLoader from encoded tensor data. - :param [torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] encoded_data: + :param [torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] encoded_data: Tuple containing tensors for values bag of words, values embeddings, headers embeddings, and labels. :param int batch_size: The number of samples per batch for the DataLoader. - :return DataLoader: A PyTorch DataLoader which yields + :return DataLoader: A PyTorch DataLoader which yields batches of data from the given tensors. 
""" ( @@ -365,11 +366,11 @@ def data_loader( def drop_bow(bow_tensor: torch.Tensor, num_drops: int) -> torch.Tensor: """ - Randomly drops a specified number of columns in the + Randomly drops a specified number of columns in the Bag of Words tensor for regularization. :param torch.Tensor bow_tensor: Bag of Words tensor. - :param int num_drops: Number of columns to be randomly + :param int num_drops: Number of columns to be randomly dropped from the Bag of Words tensor. :return torch.Tensor: Bag of Words tensor with dropped columns. """ From e67e094153075866b779784cf1d90aa155d1a394 Mon Sep 17 00:00:00 2001 From: saanikat Date: Wed, 2 Oct 2024 11:33:53 -0400 Subject: [PATCH 07/18] README updated with custom training --- README.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3075eb6..ac46292 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # BEDMS -BEDMS (BED Metadata Standardizer) is a tool used to standardize genomics/epigenomics metadata based on a schema chosen by the user ( eg. ENCODE, FAIRTRACKS, BEDBASE). - +BEDMS (BED Metadata Standardizer) is a tool desgined to standardize genomics and epigenomics metadata attributes according to user-selected schemas such as `ENCODE`, `FAIRTRACKS` and `BEDBASE`. BEDMS ensures consistency and FAIRness of metadata across different platforms. Additionally, users have the option to train their own standardizer model using a custom schema (`CUSTOM`), allowing for the standardization of attributes based on users' specific research requirements. ## Installation @@ -16,6 +15,7 @@ pip install git+https://github.com/databio/bedms.git ## Usage +### Standardizing based on available schemas ```python from bedms import AttrStandardizer @@ -25,7 +25,57 @@ results = model.standardize(pep="geo/gse228634:default") assert results ``` +### Training custom schemas +Training your custom schema is very easy with `BEDMS`. You would need two things to get started: +1. Training Sets +2. `training_config.yaml` + +To instantiate `TrainStandardizer` class: + +```python +from bedms.train import TrainStandardizer + +trainer = TrainStandardizer("training_config.yaml") + +``` +To load the datasets and encode them: + +```python +trainer.load_encode_data() +``` + +To train the custom model: + +```python +trainer.training() +``` + +To test the custom model: + +```python +trainer.testing() +``` + +To generate visualizations such as Learning Curves, Confusion Matrices, and ROC Curve: + +```python +trainer.plot_visualizations() +``` + +### Standardizing based on custom schema +For standardizing based on custom schema, you would require a `custom_config.yaml`. 
+ +```python +from bedms import AttrStandardizer + +model = AttrStandardizer("CUSTOM", "custom_config.yaml") + +results = model.standardize(pep="geo/gse228634:default") + +assert results +``` +### Available schemas To see the available schemas, you can run: ``` from bedms.const import AVAILABLE_SCHEMAS From 24a5d77f3db667b26461a4db01048ccbb5a628d4 Mon Sep 17 00:00:00 2001 From: saanikat Date: Wed, 2 Oct 2024 15:10:43 -0400 Subject: [PATCH 08/18] minor changes in comments --- bedms/utils_train.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bedms/utils_train.py b/bedms/utils_train.py index 09f861b..ab64566 100644 --- a/bedms/utils_train.py +++ b/bedms/utils_train.py @@ -522,7 +522,6 @@ def train_model( print(f"Early stop at {best_epoch + 1} epoch.") y_true = label_binarize(y_true, classes=list(range(output_size))) - # Convert to numpy arrays y_true = np.array(y_true) y_scores = np.array(y_scores) @@ -531,9 +530,7 @@ def train_model( tpr = {} roc_auc = {} - for i in range( - output_size - ): # Replace output_size with your actual number of classes + for i in range(output_size): fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_scores[:, i]) roc_auc[i] = auc(fpr[i], tpr[i]) From 03ad055be5f86e9f1cffd808005b38b78457862e Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 11:25:41 -0400 Subject: [PATCH 09/18] separating schemas from bedms --- bedms/__init__.py | 3 + bedms/attr_standardizer.py | 209 +++++++++++++------------------------ bedms/const.py | 19 ---- bedms/train.py | 2 +- bedms/utils.py | 20 ---- 5 files changed, 79 insertions(+), 174 deletions(-) diff --git a/bedms/__init__.py b/bedms/__init__.py index d0d13a5..99bc695 100644 --- a/bedms/__init__.py +++ b/bedms/__init__.py @@ -3,3 +3,6 @@ """ from .attr_standardizer import AttrStandardizer +from .train import AttrStandardizerTrainer + +__all__ = ["AttrStandardizer", "AttrStandardizerTrainer"] diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py index 7b93f1d..c823890 100644 --- a/bedms/attr_standardizer.py +++ b/bedms/attr_standardizer.py @@ -3,6 +3,9 @@ """ import logging +import glob +import os +import yaml from typing import Dict, Tuple, Union, Optional import pickle import peppy @@ -14,33 +17,11 @@ from .const import ( AVAILABLE_SCHEMAS, CONFIDENCE_THRESHOLD, - DROPOUT_PROB, - EMBEDDING_SIZE, - HIDDEN_SIZE, - INPUT_SIZE_BOW_BEDBASE, - INPUT_SIZE_BOW_ENCODE, - INPUT_SIZE_BOW_FAIRTRACKS, - OUTPUT_SIZE_BEDBASE, - OUTPUT_SIZE_ENCODE, - OUTPUT_SIZE_FAIRTRACKS, PROJECT_NAME, SENTENCE_TRANSFORMER_MODEL, - REPO_ID, - ENCODE_VECTORIZER_FILENAME, - ENCODE_LABEL_ENCODER_FILENAME, - FAIRTRACKS_VECTORIZER_FILENAME, - FAIRTRACKS_LABEL_ENCODER_FILENAME, - BEDBASE_VECTORIZER_FILENAME, - BEDBASE_LABEL_ENCODER_FILENAME, ) from .model import BoWSTModel -from .utils import ( - data_encoding, - data_preprocessing, - fetch_from_pephub, - get_any_pep, - load_from_huggingface, -) +from .utils import data_encoding, data_preprocessing, fetch_from_pephub, get_any_pep logging.basicConfig(level=logging.INFO) @@ -54,82 +35,53 @@ class AttrStandardizer: def __init__( self, - schema: str, + repo_id: str, + model_name: str, custom_param: Optional[str] = None, confidence: int = CONFIDENCE_THRESHOLD, ) -> None: """ Initializes the attribute standardizer with user provided schema, loads the model. 
- :param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS" + :param str repo_id: HuggingFace repository ID + :param str model_name: Name of the schema model :param str custom_param: User provided config file for custom parameters, if they choose "CUSTOM" schema. :param int confidence: Confidence threshold for the predictions. """ - self.schema = schema + self.repo_id = repo_id + self.model_name = model_name self.conf_threshold = confidence self.custom_param = custom_param - - if self.schema == "CUSTOM" and self.custom_param: - self.custom_param = self._load_custom_param(self.custom_param) self.model, self.vectorizer, self.label_encoder = self._load_model() - def _load_custom_param(self, config_pth: str) -> Dict[str, Tuple]: - """ - Loads the custom parameters from the config file provided by the user. - - :param str config_pth: Path to the config file which has the custom parameters. - :return Dict[str, Tuple]: Custom Parameters dictionary. - """ - with open(config_pth, "r", encoding="utf-8") as file: - return yaml.safe_load(file) - def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: """ Get the model parameters as per the chosen schema. :return Tuple[int, int, int, int, int, int, float]: Tuple containing the model parameters. """ - if self.schema == "ENCODE": - return ( - INPUT_SIZE_BOW_ENCODE, - EMBEDDING_SIZE, - EMBEDDING_SIZE, - HIDDEN_SIZE, - OUTPUT_SIZE_ENCODE, - DROPOUT_PROB, - ) - if self.schema == "FAIRTRACKS": - return ( - INPUT_SIZE_BOW_FAIRTRACKS, - EMBEDDING_SIZE, - EMBEDDING_SIZE, - HIDDEN_SIZE, - OUTPUT_SIZE_FAIRTRACKS, - DROPOUT_PROB, - ) - if self.schema == "BEDBASE": - return ( - INPUT_SIZE_BOW_BEDBASE, - EMBEDDING_SIZE, - EMBEDDING_SIZE, - HIDDEN_SIZE, - OUTPUT_SIZE_BEDBASE, - DROPOUT_PROB, - ) - if self.schema == "CUSTOM": - return ( - self.custom_param["model"]["input_size_bow"], - self.custom_param["model"]["input_size_embeddings"], - self.custom_param["model"]["input_size_embeddings"], - self.custom_param["model"]["hidden_size"], - self.custom_param["model"]["output_size"], - self.custom_param["model"]["dropout_prob"], - ) - - raise ValueError( - f"Schema not available: {self.schema}." - "Presently, four schemas are available: ENCODE , FAIRTRACKS, BEDBASE, CUSTOM" + config_filename = f"config_{self.model_name}.yaml" + config_pth = hf_hub_download( + repo_id=self.repo_id, + filename=os.path.join(self.model_name, config_filename), + ) + with open(config_pth, "r") as file: + config = yaml.safe_load(file) + + input_size_bow = config["params"]["input_size_bow"] + embedding_size = config["params"]["embedding_size"] + hidden_size = config["params"]["hidden_size"] + output_size = config["params"]["output_size"] + dropout_prob = config["params"]["dropout_prob"] + + return ( + input_size_bow, + embedding_size, + embedding_size, + hidden_size, + output_size, + dropout_prob, ) def _load_model(self) -> Tuple[nn.Module, object, object]: @@ -140,65 +92,54 @@ def _load_model(self) -> Tuple[nn.Module, object, object]: :return object: The scikit learn vectorizer for bag of words encoding. :return object: Label encoder object for the labels (y). 
""" - try: - if self.schema == "ENCODE": - filename_vc = ENCODE_VECTORIZER_FILENAME - filename_lb = ENCODE_LABEL_ENCODER_FILENAME - elif self.schema == "FAIRTRACKS": - filename_vc = FAIRTRACKS_VECTORIZER_FILENAME - filename_lb = FAIRTRACKS_LABEL_ENCODER_FILENAME - elif self.schema == "BEDBASE": - filename_vc = BEDBASE_VECTORIZER_FILENAME - filename_lb = BEDBASE_LABEL_ENCODER_FILENAME - elif self.schema == "CUSTOM": - vc_path = self.custom_param["paths"]["vectorizer_pth"] - lb_path = self.custom_param["paths"]["label_encoder_pth"] - state_dict = torch.load(self.custom_param["paths"]["model_pth"]) - else: - raise ValueError(f"Schema not available: {self.schema}") - - if self.schema != "CUSTOM": - vc_path = hf_hub_download( - repo_id=REPO_ID, - filename=filename_vc, - ) - lb_path = hf_hub_download( - repo_id=REPO_ID, - filename=filename_lb, - ) - model = load_from_huggingface(self.schema) - state_dict = torch.load(model) + model_filename = f"model_{self.model_name}.pth" + label_encoder_filename = f"label_encoder_{self.model_name}.pkl" + vectorizer_filename = f"vectorizer_{self.model_name}.pkl" - with open(vc_path, "rb") as f: - vectorizer = pickle.load(f) + model_pth = hf_hub_download( + repo_id=self.repo_id, filename=os.path.join(self.model_name, model_filename) + ) - with open(lb_path, "rb") as f: - label_encoder = pickle.load(f) + vc_path = hf_hub_download( + repo_id=self.repo_id, + filename=os.path.join(self.model_name, vectorizer_filename), + ) - ( - input_size_values, - input_size_values_embeddings, - input_size_headers, - hidden_size, - output_size, - dropout_prob, - ) = self._get_parameters() - - model = BoWSTModel( - input_size_values, - input_size_values_embeddings, - input_size_headers, - hidden_size, - output_size, - dropout_prob, - ) - model.load_state_dict(state_dict) - model.eval() - return model, vectorizer, label_encoder + lb_path = hf_hub_download( + repo_id=self.repo_id, + filename=os.path.join(self.model_name, label_encoder_filename), + ) - except Exception as e: - logger.error(f"Error loading the model: {str(e)}") - raise + with open(vc_path, "rb") as f: + vectorizer = pickle.load(f) + + with open(lb_path, "rb") as f: + label_encoder = pickle.load(f) + + state_dict = torch.load(model_pth) + + ( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + output_size, + dropout_prob, + ) = self._get_parameters() + + model = BoWSTModel( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + output_size, + dropout_prob, + ) + + model.load_state_dict(state_dict) + model.eval() + + return model, vectorizer, label_encoder def standardize( self, pep: Union[str, peppy.Project] diff --git a/bedms/const.py b/bedms/const.py index e200fda..c36f5f4 100644 --- a/bedms/const.py +++ b/bedms/const.py @@ -6,25 +6,6 @@ AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE", "CUSTOM"] PEP_FILE_TYPES = ["yaml", "csv"] -REPO_ID = "databio/attribute-standardizer-model6" -MODEL_ENCODE = "model_encode.pth" -MODEL_FAIRTRACKS = "model_fairtracks.pth" -MODEL_BEDBASE = "model_bedbase.pth" -ENCODE_VECTORIZER_FILENAME = "vectorizer_encode.pkl" -FAIRTRACKS_VECTORIZER_FILENAME = "vectorizer_fairtracks.pkl" -BEDBASE_VECTORIZER_FILENAME = "vectorizer_bedbase.pkl" -ENCODE_LABEL_ENCODER_FILENAME = "label_encoder_encode.pkl" -FAIRTRACKS_LABEL_ENCODER_FILENAME = "label_encoder_fairtracks.pkl" -BEDBASE_LABEL_ENCODER_FILENAME = "label_encoder_bedbase.pkl" SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2" -HIDDEN_SIZE = 32 -DROPOUT_PROB = 0.113 
CONFIDENCE_THRESHOLD = 0.70 -EMBEDDING_SIZE = 384 -INPUT_SIZE_BOW_ENCODE = 10459 -INPUT_SIZE_BOW_FAIRTRACKS = 13617 -OUTPUT_SIZE_FAIRTRACKS = 15 -OUTPUT_SIZE_ENCODE = 18 NUM_CLUSTERS = 3 -INPUT_SIZE_BOW_BEDBASE = 13708 -OUTPUT_SIZE_BEDBASE = 12 diff --git a/bedms/train.py b/bedms/train.py index d2816fd..9bc3438 100644 --- a/bedms/train.py +++ b/bedms/train.py @@ -29,7 +29,7 @@ logger = logging.getLogger(PROJECT_NAME) -class TrainStandardizer: +class AttrStandardizerTrainer: """ This is the training class responsible for managing the training process for the standardizer model. diff --git a/bedms/utils.py b/bedms/utils.py index fb63805..20e7128 100644 --- a/bedms/utils.py +++ b/bedms/utils.py @@ -17,11 +17,7 @@ from sklearn.cluster import KMeans from sklearn.preprocessing import LabelEncoder from .const import ( - MODEL_BEDBASE, - MODEL_ENCODE, - MODEL_FAIRTRACKS, NUM_CLUSTERS, - REPO_ID, PEP_FILE_TYPES, PROJECT_NAME, ) @@ -50,22 +46,6 @@ def fetch_from_pephub(project: peppy.Project) -> pd.DataFrame: return csv_file_df -def load_from_huggingface(schema: str) -> Optional[Any]: - """ - Load a model from HuggingFace based on the schema of choice. - - :param str schema: Schema Type - :return Optional[Any]: Loaded model object - """ - if schema == "ENCODE": - model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_ENCODE) - elif schema == "FAIRTRACKS": - model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FAIRTRACKS) - elif schema == "BEDBASE": - model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_BEDBASE) - return model - - def data_preprocessing( df: pd.DataFrame, ) -> Tuple[List[List[str]], List[str], List[List[str]], int]: From 27df60707949d1d394875b349eee670317a3551d Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 11:32:12 -0400 Subject: [PATCH 10/18] const error solved --- bedms/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bedms/train.py b/bedms/train.py index 9bc3438..b50b9f5 100644 --- a/bedms/train.py +++ b/bedms/train.py @@ -21,7 +21,7 @@ plot_confusion_matrix, auc_roc_curve, ) -from .const import PROJECT_NAME, EMBEDDING_SIZE +from .const import PROJECT_NAME from .model import BoWSTModel From 3cd2e7d2c627d2ee687950793c6488f117c4602e Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 11:37:30 -0400 Subject: [PATCH 11/18] matplotlib --- requirements/requirements-all.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 3f373a4..397ce51 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -4,3 +4,4 @@ torch sentence-transformers pephubclient>=0.4.2 peppy>=0.40.6 +matplotlib \ No newline at end of file From 37d1fe690a9776a6fb3225af7795d97371d22da1 Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 11:40:57 -0400 Subject: [PATCH 12/18] req --- requirements/requirements-all.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 397ce51..daeede2 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -4,4 +4,5 @@ torch sentence-transformers pephubclient>=0.4.2 peppy>=0.40.6 -matplotlib \ No newline at end of file +matplotlib +seaborn \ No newline at end of file From 5edee67e4ee5121f2617c59d582a70caaffab9bf Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 11:47:17 -0400 Subject: [PATCH 13/18] updated test --- tests/test_bedms.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/tests/test_bedms.py b/tests/test_bedms.py index a47dfb1..b2d84d1 100755 --- a/tests/test_bedms.py +++ b/tests/test_bedms.py @@ -3,7 +3,7 @@ class TestBEDMES: def test_bedmes(self): - model = AttrStandardizer("ENCODE") + model = AttrStandardizer(repo_id='databio/attribute-standardizer-model6', model_name='encode') results = model.standardize(pep="geo/gse228634:default") assert results From e6ace1291ce06b36a957f0140b1309e9446c8fdd Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 11:48:36 -0400 Subject: [PATCH 14/18] lint --- tests/test_bedms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_bedms.py b/tests/test_bedms.py index b2d84d1..fc654a1 100755 --- a/tests/test_bedms.py +++ b/tests/test_bedms.py @@ -3,7 +3,9 @@ class TestBEDMES: def test_bedmes(self): - model = AttrStandardizer(repo_id='databio/attribute-standardizer-model6', model_name='encode') + model = AttrStandardizer( + repo_id="databio/attribute-standardizer-model6", model_name="encode" + ) results = model.standardize(pep="geo/gse228634:default") assert results From 732b5053314a9a3bffce31f888f2cad5fedddcc4 Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 12:04:46 -0400 Subject: [PATCH 15/18] README updated --- README.md | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index ac46292..a56e07a 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,15 @@ pip install git+https://github.com/databio/bedms.git ## Usage ### Standardizing based on available schemas + +To choose the schema you want to standardize according to, please refer to the [HuggingFace repository](https://huggingface.co/databio/attribute-standardizer-model6). Based on the schema design `.yaml` files, you can select which schema best represents your attributes. In the example below, we have chosen `encode` schema. + ```python from bedms import AttrStandardizer -model = AttrStandardizer("ENCODE") +model = AttrStandardizer( + repo_id="databio/attribute-standardizer-model6", model_name="encode" +) results = model.standardize(pep="geo/gse228634:default") assert results @@ -33,9 +38,9 @@ Training your custom schema is very easy with `BEDMS`. You would need two things To instantiate `TrainStandardizer` class: ```python -from bedms.train import TrainStandardizer +from bedms.train import AttrStandardizerTrainer -trainer = TrainStandardizer("training_config.yaml") +trainer = AttrStandardizerTrainer("training_config.yaml") ``` To load the datasets and encode them: @@ -63,26 +68,16 @@ trainer.plot_visualizations() ``` ### Standardizing based on custom schema -For standardizing based on custom schema, you would require a `custom_config.yaml`. + +For standardizing based on custom schema, your model should be on HuggingFace. The directory structure should follow the instructions mentioned on [HuggingFace](https://huggingface.co/databio/attribute-standardizer-model6). ```python from bedms import AttrStandardizer -model = AttrStandardizer("CUSTOM", "custom_config.yaml") - +model = AttrStandardizer( + repo_id="name/of/your/hf/repo", model_name="model/name" +) results = model.standardize(pep="geo/gse228634:default") assert results -``` - -### Available schemas -To see the available schemas, you can run: -``` -from bedms.const import AVAILABLE_SCHEMAS -print(AVAILABLE_SCHEMAS) - -# >> ['ENCODE', 'FAIRTRACKS', 'BEDBASE'] - -``` - -AVAILABLE_SCHEMAS is a list of available schemas that you can use to standardize your metadata. 
+``` \ No newline at end of file From dfa1a02008b1e4ac56508559209834237eda2f4f Mon Sep 17 00:00:00 2001 From: saanikat Date: Tue, 15 Oct 2024 11:27:02 -0400 Subject: [PATCH 16/18] reviewer request changes --- README.md | 13 +++-- bedms/train.py | 118 +++++++++++++++++++++++++++++++------------ bedms/utils_train.py | 49 +++++++++++------- training_config.yaml | 1 + 4 files changed, 127 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index a56e07a..a1e6789 100644 --- a/README.md +++ b/README.md @@ -46,27 +46,30 @@ trainer = AttrStandardizerTrainer("training_config.yaml") To load the datasets and encode them: ```python -trainer.load_encode_data() +train_data, val_data, test_data, label_encoder, vectorizer = trainer.load_data() ``` To train the custom model: ```python -trainer.training() +trainer.train() ``` To test the custom model: ```python -trainer.testing() +test_results_dict = trainer.test() ``` To generate visualizations such as Learning Curves, Confusion Matrices, and ROC Curve: ```python -trainer.plot_visualizations() +acc_fig, loss_fig, conf_fig, roc_fig = trainer.plot_visualizations() ``` +Where `acc_fig` is Accuracy Curve figure object, `loss_fig` is Loss Curve figure object, `conf_fig` is the Confusion Matrix figure object, and `roc_fig` is the ROC Curve figure object. + + ### Standardizing based on custom schema For standardizing based on custom schema, your model should be on HuggingFace. The directory structure should follow the instructions mentioned on [HuggingFace](https://huggingface.co/databio/attribute-standardizer-model6). @@ -79,5 +82,5 @@ model = AttrStandardizer( ) results = model.standardize(pep="geo/gse228634:default") -assert results +print(results) #Dictionary of suggested predictions with their confidence: {'attr_1':{'prediction_1': 0.70, 'prediction_2':0.30}} ``` \ No newline at end of file diff --git a/bedms/train.py b/bedms/train.py index b50b9f5..b7a5c77 100644 --- a/bedms/train.py +++ b/bedms/train.py @@ -4,14 +4,19 @@ import torch from torch import nn from torch import optim +from torch.utils.data import DataLoader from sklearn.metrics import ( precision_score, recall_score, f1_score, ) +from sklearn.preprocessing import LabelEncoder +from sklearn.feature_extraction.text import CountVectorizer +import matplotlib.pyplot as plt +from typing import List, Dict, Tuple import yaml from .utils_train import ( - load_from_dir, + load_training_files_from_dir, accumulate_data, training_encoding, data_loader, @@ -41,33 +46,56 @@ def __init__(self, config: str) -> None: :param str config: Path to the config file which has the training parameters provided by the user. 
""" - self.label_encoder = None - self.vectorizer = None - self.train_loader = None - self.val_loader = None - self.test_loader = None - self.output_size = None - self.criterion = None - self.train_accuracies = None - self.val_accuracies = None - self.train_losses = None - self.val_losses = None - self.model = None - self.fpr = None - self.tpr = None - self.roc_auc = None - self.all_labels = None - self.all_preds = None + self.label_encoder: LabelEncoder = None + self.vectorizer: CountVectorizer = None + self.train_loader: DataLoader = None + self.val_loader: DataLoader = None + self.test_loader: DataLoader = None + self.output_size: int = 0 + self.criterion: nn.Module = None + self.train_accuracies: List[float] = [] + self.val_accuracies: List[float] = [] + self.train_losses: List[float] = [] + self.val_losses: List[float] = [] + self.model: BoWSTModel = None + self.fpr: Dict[int, float] = {} + self.tpr: Dict[int, float] = {} + self.roc_auc: Dict[int, float] = {} + self.all_labels: List[int] = [] + self.all_preds: List[int] = [] with open(config, "r") as file: self.config = yaml.safe_load(file) - def load_encode_data(self) -> None: + def load_data( + self, + ) -> Tuple[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + LabelEncoder, + CountVectorizer, + ]: """ Loads and prepares the encoded training, testing and validation datasets. + :return Tuple[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + LabelEncoder, + CountVectorizer]: A tuple containing: + - training dataset tensor + - validation dataset tensor + - testing dataset tensor + - label encoder + - bag of words vectorizer """ - values_files_list = load_from_dir(self.config["dataset"]["values_dir_pth"]) - headers_files_list = load_from_dir(self.config["dataset"]["headers_dir_pth"]) + values_files_list = load_training_files_from_dir( + self.config["dataset"]["values_dir_pth"] + ) + headers_files_list = load_training_files_from_dir( + self.config["dataset"]["headers_dir_pth"] + ) if len(values_files_list) != len(headers_files_list): logger.error( @@ -149,13 +177,21 @@ def load_encode_data(self) -> None: logger.info("Loading Done.") - def training(self): + return ( + train_encoded_data, + val_encoded_data, + test_encoded_data, + self.label_encoder, + self.vectorizer, + ) + + def train(self) -> None: """ Trains the model. 
""" input_size_values = len(self.vectorizer.vocabulary_) - input_size_values_embeddings = EMBEDDING_SIZE - input_size_headers = EMBEDDING_SIZE + input_size_values_embeddings = self.config["training"]["embedding_size"] + input_size_headers = self.config["training"]["embedding_size"] hidden_size = self.config["model"]["hidden_size"] self.output_size = len(self.label_encoder.classes_) # Number of classes dropout_prob = self.config["model"]["dropout_prob"] @@ -175,7 +211,9 @@ def training(self): optimizer = optim.Adam( self.model.parameters(), lr=learning_rate, weight_decay=l2_reg_lambda ) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model.to(self.device) + # Training the model num_epochs = self.config["training"]["num_epochs"] @@ -196,7 +234,7 @@ def training(self): self.val_loader, self.criterion, optimizer, - device, + self.device, num_epochs, self.output_size, model_pth, @@ -205,29 +243,41 @@ def training(self): logger.info("Training Done.") - def testing(self): + def test(self) -> Dict[str, float]: """ Model testing. + + :return Dict[str, float]: Precision, Recall, and F1 values """ self.all_preds, self.all_labels = model_testing( - self.model, self.test_loader, self.criterion + self.model, self.device, self.test_loader, self.criterion ) precision = precision_score(self.all_labels, self.all_preds, average="macro") recall = recall_score(self.all_labels, self.all_preds, average="macro") f1 = f1_score(self.all_labels, self.all_preds, average="macro") logger.info(f"Precision:{precision}, Recall: {recall}, F1 Score: {f1}") + return {"precision": precision, "recall": recall, "f1": f1} - def plot_visualizations(self): + def plot_visualizations( + self, + ) -> Tuple[plt.Figure, plt.Figure, plt.Figure, plt.Figure]: """ Generates visualizations for training ( accuracy and loss curves) and testing( confusion matrix, roc curve) + + :return Tuple[plt.Figure, plt.Figure, plt.Figure, plt.Figure]: + A Tuple containing: + - accuracy figure + - loss figure + - confusion matrix figure + - ROC curve figure """ num_epochs = self.config["training"]["num_epochs"] accuracy_fig_pth = self.config["visualization"]["accuracy_fig_pth"] loss_fig_pth = self.config["visualization"]["loss_fig_pth"] cm_pth = self.config["visualization"]["confusion_matrix_fig_pth"] roc_pth = self.config["visualization"]["roc_fig_pth"] - plot_learning_curve( + acc_fig, loss_fig = plot_learning_curve( num_epochs, self.train_accuracies, self.val_accuracies, @@ -236,7 +286,11 @@ def plot_visualizations(self): accuracy_fig_pth, loss_fig_pth, ) - plot_confusion_matrix( + conf_fig = plot_confusion_matrix( self.all_labels, self.all_preds, self.label_encoder.classes_, cm_pth ) - auc_roc_curve(self.fpr, self.tpr, self.roc_auc, self.output_size, roc_pth) + roc_fig = auc_roc_curve( + self.fpr, self.tpr, self.roc_auc, self.output_size, roc_pth + ) + + return acc_fig, loss_fig, conf_fig, roc_fig diff --git a/bedms/utils_train.py b/bedms/utils_train.py index ab64566..9661988 100644 --- a/bedms/utils_train.py +++ b/bedms/utils_train.py @@ -39,7 +39,7 @@ ) -def load_from_dir(dir: str) -> List[str]: +def load_training_files_from_dir(dir: str) -> List[str]: """ Loads each file from the directory path. 
@@ -49,7 +49,7 @@ def load_from_dir(dir: str) -> List[str]: return glob(os.path.join(dir, "*.csv")) -def load_and_preprocess(file_path: str) -> pd.DataFrame: +def load_and_preprocess_files(file_path: str) -> pd.DataFrame: """ Loads and Preprocesses each csv file as a Pandas DataFrame. @@ -84,8 +84,8 @@ def accumulate_data( x_headers_list = [] y_list = [] for values_file, headers_file in files: - df_values = load_and_preprocess(values_file) - df_headers = load_and_preprocess(headers_file) + df_values = load_and_preprocess_files(values_file) + df_headers = load_and_preprocess_files(headers_file) df_values = df_values.fillna("") df_headers = df_headers.fillna("") y = df_values.columns @@ -129,8 +129,8 @@ def get_top_training_cluster_averaged( :return torch.Tensor: A tensor representing the average of embeddings in the most common cluster. """ - flattened_embeddings = [embedding.tolist() for embedding in embeddings] - kmeans = KMeans(n_clusters=num, random_state=0).fit(flattened_embeddings) + embeddings_list = [embedding.tolist() for embedding in embeddings] + kmeans = KMeans(n_clusters=num, random_state=0).fit(embeddings_list) labels_kmeans = kmeans.labels_ cluster_counts = Counter(labels_kmeans) most_common_cluster = max(cluster_counts, key=cluster_counts.get) @@ -198,13 +198,11 @@ def training_encoding( Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], LabelEncoder, - List[str], CountVectorizer]: Returns a tuple of - training dataset tensor - testing dataset tensor - validation dataset tensor - trained label encoder - - list of unique values encountered during training - Trained vectorizer for Bag of Words representation """ @@ -538,7 +536,10 @@ def train_model( def model_testing( - model: torch.nn.Module, test_loader: DataLoader, loss_fn: torch.nn.Module + model: torch.nn.Module, + device: torch.device, + test_loader: DataLoader, + loss_fn: torch.nn.Module, ) -> Tuple[List[int], List[int], torch.Tensor]: """ This functions tests the model. @@ -559,6 +560,10 @@ def model_testing( total_samples_test = 0 with torch.no_grad(): for values_batch, bow_batch, headers_batch, labels in test_loader: + values_batch = values_batch.to(device) + bow_batch = bow_batch.to(device) + headers_batch = headers_batch.to(device) + labels = labels.to(device) outputs = model(values_batch, bow_batch, headers_batch) loss = loss_fn(outputs, labels) total_loss_test += loss.item() @@ -583,7 +588,7 @@ def plot_learning_curve( val_losses: List[float], accuracy_fig_pth: str, loss_fig_pth: str, -) -> None: +) -> Tuple[plt.Figure, plt.Figure]: """ Plots the learning curves - accuracy and loss for Training and Validation of the model. @@ -594,10 +599,14 @@ def plot_learning_curve( :param List[float] val_losses: List of validation losses for each epoch. :param str accuracy_fig_pth: Path where the accuracy curve figure will be saved. :param str loss_fig_pth: Path where the loss curve figure will be saved. 
+ + :return Tuple[plt.Figure, plt.Figure]: Accuracy and Loss curves """ # accuracy - plt.plot(range(1, num_epochs + 1), train_accuracies, label="Training Accuracy") + acc = plt.plot( + range(1, num_epochs + 1), train_accuracies, label="Training Accuracy" + ) plt.plot(range(1, num_epochs + 1), val_accuracies, label="Validation Accuracy") plt.xlabel("Epoch") plt.ylabel("Accuracy") @@ -607,8 +616,9 @@ def plot_learning_curve( plt.savefig(accuracy_fig_pth, format="svg") plt.show() plt.close() + # loss - plt.plot(range(1, num_epochs + 1), train_losses, label="Training Loss") + loss = plt.plot(range(1, num_epochs + 1), train_losses, label="Training Loss") plt.plot(range(1, num_epochs + 1), val_losses, label="Validation Loss") plt.xlabel("Epoch") plt.ylabel("Loss") @@ -618,6 +628,7 @@ def plot_learning_curve( plt.savefig(loss_fig_pth, format="svg") plt.show() plt.close() + return acc, loss def plot_confusion_matrix( @@ -625,7 +636,7 @@ def plot_confusion_matrix( y_pred: List[int], unique_values_list: List[str], confusion_matrix_fig_pth: str, -) -> None: +) -> plt.Figure: """ Plots confusion matrix for the test data. @@ -633,6 +644,8 @@ def plot_confusion_matrix( :param List[int] y_pred: List of predictions made by the model. :param List[str] unique_values_list: List of all the classes that the model predicted. :param str confusion_matrix_fig_pth: Path where the confusion matrix figure will be saved. + + :return plt.Figure: Confusion matrix figure """ conf_matrix = confusion_matrix(y_true, y_pred) plt.figure(figsize=(12, 12)) @@ -653,6 +666,7 @@ def plot_confusion_matrix( class_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1) for i, acc in enumerate(class_accuracy): print(f"Accuracy for class {i}: {acc:.4f}") + return conf_matrix def auc_roc_curve( @@ -661,7 +675,7 @@ def auc_roc_curve( roc_auc: Dict[int, float], output_size: int, roc_fig_pth: str, -) -> None: +) -> plt.Figure: """ Plots the ROC Curve. @@ -670,8 +684,10 @@ def auc_roc_curve( :param Dict[int, float] roc_auc: Dictionary of Area Under Curve for ROC for different classes. :param int output_size: The number of classes the model predicted into. :param str roc_fig_pth: Path to where the ROC figure will be saved. + + :return plt.Figure: Figure for the ROC Curve. 
""" - plt.figure(figsize=(12, 12)) + fig = plt.figure(figsize=(12, 12)) for i in range(output_size): plt.plot( fpr[i], @@ -688,5 +704,4 @@ def auc_roc_curve( plt.title("Receiver Operating Characteristic (ROC) Curve") plt.legend(loc="lower right") plt.savefig(roc_fig_pth, format="svg") - plt.show() - plt.close() + return fig diff --git a/training_config.yaml b/training_config.yaml index 1236709..75910d2 100644 --- a/training_config.yaml +++ b/training_config.yaml @@ -21,6 +21,7 @@ training: vectorizer_pth: "path/to/vectorizer.pkl" #Path to where you want to save the Bag of Words vectorizer label_encoder_pth: "/path/to/label_encoder.pkl" #Path to where you want to save the Label Encoder sentence_transformer_model: "all-MiniLM-L6-v2" #Name of the sentence transformer model you wish to use fro HuggingFace + embedding_size: 384 #Dimensionality of the embedding produced by the chosen sentence transformer bow_drops: 2 #Number of Bag of Words columns you wish to drop out during the training process (Avoids overfitting, can be set to 0) visualization: From c009c70103a9160c98a71c9572128356fe3083d6 Mon Sep 17 00:00:00 2001 From: Oleksandr <41573628+khoroshevskyi@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:10:48 -0500 Subject: [PATCH 17/18] updated version --- bedms/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bedms/_version.py b/bedms/_version.py index 3dc1f76..d3ec452 100644 --- a/bedms/_version.py +++ b/bedms/_version.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.2.0" From fef2ed7bb0c0368f1bf38b87abe96716c7fddd0d Mon Sep 17 00:00:00 2001 From: Oleksandr <41573628+khoroshevskyi@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:11:23 -0500 Subject: [PATCH 18/18] updated changelog --- docs/changelog.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 747b21b..25fa872 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,7 +2,15 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. +## [0.2.0] - 2024-12-03 +### Added +- Added generic way of initialization of all schemas +- Added TrainStandardizer module for custom model creation + +### Fixed +- Fixed Typo in README #23 + ## [0.1.0] - 2024-09-16 ### Added -- initial project release \ No newline at end of file +- initial project release