From 15075821ba4eab3321ed02a4e7797ee7d5359c28 Mon Sep 17 00:00:00 2001
From: saanikat
Date: Tue, 17 Sep 2024 09:03:30 -0400
Subject: [PATCH 01/18] readme updated

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0f95dca..300606a 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ assert results
 
 To see the available schemas, you can run:
 ```
-from bedms.constants import AVAILABLE_SCHEMAS
+from bedms.const import AVAILABLE_SCHEMAS
 print(AVAILABLE_SCHEMAS)
 
 # >> ['ENCODE', 'FAIRTRACKS', 'BEDBASE']

From 50969451a450643de1445ed086b5d24ab8c100e9 Mon Sep 17 00:00:00 2001
From: saanikat
Date: Tue, 17 Sep 2024 09:07:15 -0400
Subject: [PATCH 02/18] updated readme

---
 README.md                  | 8 ++++++++
 bedms/attr_standardizer.py | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 300606a..5b07838 100644
--- a/README.md
+++ b/README.md
@@ -33,5 +33,13 @@ print(AVAILABLE_SCHEMAS)
 
 # >> ['ENCODE', 'FAIRTRACKS', 'BEDBASE']
 
+```
+Or you can run:
+
+```
+schemas = model.show_available_schemas()
+
+print(schemas)
+
 ```
 AVAILABLE_SCHEMAS is a list of available schemas that you can use to standardize your metadata.
diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py
index 6fa3b2e..a95fd83 100644
--- a/bedms/attr_standardizer.py
+++ b/bedms/attr_standardizer.py
@@ -248,7 +248,7 @@ def standardize(
         )
 
     @staticmethod
-    def get_available_schemas() -> list[str]:
+    def show_available_schemas() -> list[str]:
         """
         Stores a list of available schemas.

From 6a585aebf95d963f49c7e747ee447feb826a9752 Mon Sep 17 00:00:00 2001
From: saanikat
Date: Tue, 17 Sep 2024 13:38:39 -0400
Subject: [PATCH 03/18] changes

---
 README.md                  | 7 -------
 bedms/attr_standardizer.py | 2 +-
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 5b07838..3075eb6 100644
--- a/README.md
+++ b/README.md
@@ -34,12 +34,5 @@ print(AVAILABLE_SCHEMAS)
 # >> ['ENCODE', 'FAIRTRACKS', 'BEDBASE']
 
 ```
-Or you can run:
-
-```
-schemas = model.show_available_schemas()
-
-print(schemas)
-
 ```
 AVAILABLE_SCHEMAS is a list of available schemas that you can use to standardize your metadata.
diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py
index a95fd83..6fa3b2e 100644
--- a/bedms/attr_standardizer.py
+++ b/bedms/attr_standardizer.py
@@ -248,7 +248,7 @@ def standardize(
         )
 
     @staticmethod
-    def show_available_schemas() -> list[str]:
+    def get_available_schemas() -> list[str]:
         """
         Stores a list of available schemas.
From 273fa20dd627298ac1d3de0353741935da8076de Mon Sep 17 00:00:00 2001
From: saanikat
Date: Tue, 1 Oct 2024 16:49:18 -0400
Subject: [PATCH 04/18] example configs for custom training and std

---
 custom_config.yaml   | 10 ++++++++++
 training_config.yaml | 30 ++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 custom_config.yaml
 create mode 100644 training_config.yaml

diff --git a/custom_config.yaml b/custom_config.yaml
new file mode 100644
index 0000000..a7e06b9
--- /dev/null
+++ b/custom_config.yaml
@@ -0,0 +1,10 @@
+paths:
+  model_pth: "path/to/custom/trained/model.pth" #Path to where you saved the custom model
+  label_encoder_pth: "/path/to/label_encoder.pkl" #Path to where you saved the Label Encoder
+  vectorizer_pth: "path/to/vectorizer.pkl" #Path to where you saved the Bag of Words vectorizer
+model:
+  input_size_bow: 1857 #Size of the vocabulary for Bag of Words encoding
+  input_size_embeddings: 384 #Size of the input embeddings for values and attributes
+  hidden_size: 32 #Hidden size the model was trained on
+  output_size: 18 #Number of classes the model predicts into
+  dropout_prob: 0.113 #Dropout probability you had set for the model
\ No newline at end of file
diff --git a/training_config.yaml b/training_config.yaml
new file mode 100644
index 0000000..1236709
--- /dev/null
+++ b/training_config.yaml
@@ -0,0 +1,30 @@
+dataset:
+  values_dir_pth: "/path/to/training/values/directory" #Path to the values directory
+  headers_dir_pth: "path/to/training/headers/directory" #Path to the attributes directory
+
+data_split:
+  train_set: 8000 #Number of csv value-attribute file pairs for training set
+  test_set: 100 #Number of csv value-attribute file pairs for testing set
+  val_set: 100 #Number of csv value-attribute file pairs for validation set
+
+model:
+  hidden_size: 32 #Hidden size for training the model
+  dropout_prob: 0.113 #Dropout probability for training the model
+
+training:
+  batch_size: 32 #Batch size for training
+  num_epochs: 20 #Number of training epochs
+  learning_rate: 0.001 #Learning rate of the model
+  l2_regularization: 0.001 #L2 regularization strength applied to the optimizer (Avoids overfitting, can be set to 0)
+  model_pth: "path/to/custom/trained/model.pth" #Path to where you want to save the custom model
+  num_cluster: 3 #Number of clusters for KMeans
+  vectorizer_pth: "path/to/vectorizer.pkl" #Path to where you want to save the Bag of Words vectorizer
+  label_encoder_pth: "/path/to/label_encoder.pkl" #Path to where you want to save the Label Encoder
+  sentence_transformer_model: "all-MiniLM-L6-v2" #Name of the sentence transformer model you wish to use from HuggingFace
+  bow_drops: 2 #Number of Bag of Words columns you wish to drop out during the training process (Avoids overfitting, can be set to 0)
+
+visualization:
+  accuracy_fig_pth: "/path/to/accuracy_fig.svg" #Path to where you wish to save the Accuracy Curve image
+  loss_fig_pth: "/path/to/loss_fig.svg" #Path to where you wish to save the Loss Curve image
+  confusion_matrix_fig_pth: "/path/to/confusion_matrix.svg" #Path to where you wish to save the confusion matrix image
+  roc_fig_pth: "/path/to/roc_fig.svg" #Path to where you wish to save the ROC curve image
\ No newline at end of file

From 089755a900d826b8cee3695dbbe52a0e84eed219 Mon Sep 17 00:00:00 2001
From: saanikat
Date: Wed, 2 Oct 2024 11:29:01 -0400
Subject: [PATCH 05/18] training module

---
 bedms/attr_standardizer.py |  78 +++--
 bedms/const.py             |   4 +-
 bedms/train.py             | 242 +++++++++++++
 bedms/utils.py             |   7 +-
bedms/utils_train.py | 694 +++++++++++++++++++++++++++++++++++++ 5 files changed, 999 insertions(+), 26 deletions(-) create mode 100644 bedms/train.py create mode 100644 bedms/utils_train.py diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py index 6fa3b2e..2372f0a 100644 --- a/bedms/attr_standardizer.py +++ b/bedms/attr_standardizer.py @@ -3,13 +3,14 @@ """ import logging -from typing import Dict, Tuple, Union +from typing import Dict, Tuple, Union, Optional import pickle import peppy import torch from torch import nn import torch.nn.functional as torch_functional - +import yaml +from huggingface_hub import hf_hub_download from .const import ( AVAILABLE_SCHEMAS, CONFIDENCE_THRESHOLD, @@ -40,7 +41,7 @@ get_any_pep, load_from_huggingface, ) -from huggingface_hub import hf_hub_download + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(PROJECT_NAME) @@ -51,16 +52,37 @@ class AttrStandardizer: This is the AttrStandardizer class which holds the models for Attribute Standardization. """ - def __init__(self, schema: str, confidence: int = CONFIDENCE_THRESHOLD) -> None: + def __init__( + self, + schema: str, + custom_param: Optional[str] = None, + confidence: int = CONFIDENCE_THRESHOLD, + ) -> None: """ Initializes the attribute standardizer with user provided schema, loads the model. :param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS" + :param str custom_param: User provided config file for + custom parameters, if they choose "CUSTOM" schema. :param int confidence: Confidence threshold for the predictions. """ self.schema = schema - self.model, self.vectorizer, self.label_encoder = self._load_model() self.conf_threshold = confidence + self.custom_param = custom_param + + if self.schema == "CUSTOM" and self.custom_param: + self.custom_param = self._load_custom_param(self.custom_param) + self.model, self.vectorizer, self.label_encoder = self._load_model() + + def _load_custom_param(self, config_pth: str) -> Dict[str, Tuple]: + """ + Loads the custom parameters from the config file provided by the user. + + :param str config_pth: Path to the config file which has the custom parameters. + :return Dict[str, Tuple]: Custom Parameters dictionary. + """ + with open(config_pth, "r", encoding="utf-8") as file: + return yaml.safe_load(file) def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: """ @@ -95,9 +117,19 @@ def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: OUTPUT_SIZE_BEDBASE, DROPOUT_PROB, ) + if self.schema == "CUSTOM": + return ( + self.custom_param["model"]["input_size_bow"], + self.custom_param["model"]["input_size_embeddings"], + self.custom_param["model"]["input_size_embeddings"], + self.custom_param["model"]["hidden_size"], + self.custom_param["model"]["output_size"], + self.custom_param["model"]["dropout_prob"], + ) + raise ValueError( f"Schema not available: {self.schema}." 
- "Presently, three schemas are available: ENCODE , FAIRTRACKS, BEDBASE" + "Presently, four schemas are available: ENCODE , FAIRTRACKS, BEDBASE, CUSTOM" ) def _load_model(self) -> Tuple[nn.Module, object, object]: @@ -118,29 +150,31 @@ def _load_model(self) -> Tuple[nn.Module, object, object]: elif self.schema == "BEDBASE": filename_vc = BEDBASE_VECTORIZER_FILENAME filename_lb = BEDBASE_LABEL_ENCODER_FILENAME - - vectorizer = None - label_encoder = None - - vc_path = hf_hub_download( - repo_id=REPO_ID, - filename=filename_vc, - ) + elif self.schema == "CUSTOM": + vc_path = self.custom_param["paths"]["vectorizer_pth"] + lb_path = self.custom_param["paths"]["label_encoder_pth"] + state_dict = torch.load(self.custom_param["paths"]["model_pth"]) + else: + raise ValueError(f"Schema not available: {self.schema}") + + if self.schema != "CUSTOM": + vc_path = hf_hub_download( + repo_id=REPO_ID, + filename=filename_vc, + ) + lb_path = hf_hub_download( + repo_id=REPO_ID, + filename=filename_lb, + ) + model = load_from_huggingface(self.schema) + state_dict = torch.load(model) with open(vc_path, "rb") as f: vectorizer = pickle.load(f) - lb_path = hf_hub_download( - repo_id=REPO_ID, - filename=filename_lb, - ) - with open(lb_path, "rb") as f: label_encoder = pickle.load(f) - model = load_from_huggingface(self.schema) - state_dict = torch.load(model) - ( input_size_values, input_size_values_embeddings, diff --git a/bedms/const.py b/bedms/const.py index 86916c6..e200fda 100644 --- a/bedms/const.py +++ b/bedms/const.py @@ -2,9 +2,9 @@ This module contains constant values used in the 'bedms' package. """ -PROJECT_NAME = "bedmess" +PROJECT_NAME = "bedms" -AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE"] +AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE", "CUSTOM"] PEP_FILE_TYPES = ["yaml", "csv"] REPO_ID = "databio/attribute-standardizer-model6" MODEL_ENCODE = "model_encode.pth" diff --git a/bedms/train.py b/bedms/train.py new file mode 100644 index 0000000..d2816fd --- /dev/null +++ b/bedms/train.py @@ -0,0 +1,242 @@ +""" This is the training script with which the user can train their own models.""" + +import logging +import torch +from torch import nn +from torch import optim +from sklearn.metrics import ( + precision_score, + recall_score, + f1_score, +) +import yaml +from .utils_train import ( + load_from_dir, + accumulate_data, + training_encoding, + data_loader, + train_model, + plot_learning_curve, + model_testing, + plot_confusion_matrix, + auc_roc_curve, +) +from .const import PROJECT_NAME, EMBEDDING_SIZE +from .model import BoWSTModel + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(PROJECT_NAME) + + +class TrainStandardizer: + """ + This is the training class responsible for + managing the training process for the standardizer model. + """ + + def __init__(self, config: str) -> None: + """ + Initializes the TrainStandardizer object with the given configuration. + + :param str config: Path to the config file which has the training parameters provided by the user. 
+ """ + self.label_encoder = None + self.vectorizer = None + self.train_loader = None + self.val_loader = None + self.test_loader = None + self.output_size = None + self.criterion = None + self.train_accuracies = None + self.val_accuracies = None + self.train_losses = None + self.val_losses = None + self.model = None + self.fpr = None + self.tpr = None + self.roc_auc = None + self.all_labels = None + self.all_preds = None + + with open(config, "r") as file: + self.config = yaml.safe_load(file) + + def load_encode_data(self) -> None: + """ + Loads and prepares the encoded training, testing and validation datasets. + """ + values_files_list = load_from_dir(self.config["dataset"]["values_dir_pth"]) + headers_files_list = load_from_dir(self.config["dataset"]["headers_dir_pth"]) + + if len(values_files_list) != len(headers_files_list): + logger.error( + f"Mismatch in number of value files ({len(values_files_list)}) \ + and header files ({len(headers_files_list)})" + ) + return + + total_files = len(values_files_list) + + paired_files = list(zip(values_files_list, headers_files_list)) + + train_size = self.config["data_split"]["train_set"] + test_size = self.config["data_split"]["test_set"] + val_size = self.config["data_split"]["val_set"] + + if train_size + val_size + test_size > total_files: + logger.error( + f"Data split sizes exceed total number of files: " + f"train({train_size}) + val({val_size}) + \ + test({test_size}) > total_files({total_files})" + ) + return + + train_files = paired_files[:train_size] + val_files = paired_files[train_size : train_size + val_size] + test_files = paired_files[ + train_size + val_size : train_size + val_size + test_size + ] + + logger.info(f"Training on {len(train_files)} file sets") + logger.info(f"Validating on {len(val_files)} file sets") + logger.info(f"Testing on {len(test_files)} file sets") + + x_values_train_list, x_headers_train_list, y_train_list = accumulate_data( + train_files + ) + x_values_test_list, x_headers_test_list, y_test_list = accumulate_data( + test_files + ) + x_values_val_list, x_headers_val_list, y_val_list = accumulate_data(val_files) + + logger.info("Accumulation Done.") + + num_cluster = self.config["training"]["num_cluster"] + vectorizer_pth = self.config["training"]["vectorizer_pth"] + label_encoder_pth = self.config["training"]["label_encoder_pth"] + sentence_transformer_model = self.config["training"][ + "sentence_transformer_model" + ] + + ( + train_encoded_data, + test_encoded_data, + val_encoded_data, + self.label_encoder, + self.vectorizer, + ) = training_encoding( + x_values_train_list, + x_headers_train_list, + y_train_list, + x_values_test_list, + x_headers_test_list, + y_test_list, + x_values_val_list, + x_headers_val_list, + y_val_list, + num_cluster, + vectorizer_pth, + label_encoder_pth, + sentence_transformer_model, + ) + logger.info("Encoding Done.") + + batch_size = self.config["training"]["batch_size"] + self.train_loader = data_loader(train_encoded_data, batch_size) + self.test_loader = data_loader(test_encoded_data, batch_size) + self.val_loader = data_loader(val_encoded_data, batch_size) + + logger.info("Loading Done.") + + def training(self): + """ + Trains the model. 
+ """ + input_size_values = len(self.vectorizer.vocabulary_) + input_size_values_embeddings = EMBEDDING_SIZE + input_size_headers = EMBEDDING_SIZE + hidden_size = self.config["model"]["hidden_size"] + self.output_size = len(self.label_encoder.classes_) # Number of classes + dropout_prob = self.config["model"]["dropout_prob"] + + self.model = BoWSTModel( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + self.output_size, + dropout_prob, + ) + + learning_rate = self.config["training"]["learning_rate"] + self.criterion = nn.CrossEntropyLoss() + l2_reg_lambda = self.config["training"]["l2_regularization"] + optimizer = optim.Adam( + self.model.parameters(), lr=learning_rate, weight_decay=l2_reg_lambda + ) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # Training the model + num_epochs = self.config["training"]["num_epochs"] + + model_pth = self.config["training"]["model_pth"] + bow_drops = self.config["training"]["bow_drops"] + + ( + self.train_accuracies, + self.val_accuracies, + self.train_losses, + self.val_losses, + self.fpr, + self.tpr, + self.roc_auc, + ) = train_model( + self.model, + self.train_loader, + self.val_loader, + self.criterion, + optimizer, + device, + num_epochs, + self.output_size, + model_pth, + bow_drops, + ) + + logger.info("Training Done.") + + def testing(self): + """ + Model testing. + """ + self.all_preds, self.all_labels = model_testing( + self.model, self.test_loader, self.criterion + ) + precision = precision_score(self.all_labels, self.all_preds, average="macro") + recall = recall_score(self.all_labels, self.all_preds, average="macro") + f1 = f1_score(self.all_labels, self.all_preds, average="macro") + logger.info(f"Precision:{precision}, Recall: {recall}, F1 Score: {f1}") + + def plot_visualizations(self): + """ + Generates visualizations for training ( accuracy and loss curves) + and testing( confusion matrix, roc curve) + """ + num_epochs = self.config["training"]["num_epochs"] + accuracy_fig_pth = self.config["visualization"]["accuracy_fig_pth"] + loss_fig_pth = self.config["visualization"]["loss_fig_pth"] + cm_pth = self.config["visualization"]["confusion_matrix_fig_pth"] + roc_pth = self.config["visualization"]["roc_fig_pth"] + plot_learning_curve( + num_epochs, + self.train_accuracies, + self.val_accuracies, + self.train_losses, + self.val_losses, + accuracy_fig_pth, + loss_fig_pth, + ) + plot_confusion_matrix( + self.all_labels, self.all_preds, self.label_encoder.classes_, cm_pth + ) + auc_roc_curve(self.fpr, self.tpr, self.roc_auc, self.output_size, roc_pth) diff --git a/bedms/utils.py b/bedms/utils.py index 0dcb613..bd8c0a9 100644 --- a/bedms/utils.py +++ b/bedms/utils.py @@ -1,7 +1,7 @@ """ This module has all util functions for 'bedms' """ - +import logging import warnings from collections import Counter from typing import Any, List, Optional, Tuple, Union @@ -15,7 +15,6 @@ from sentence_transformers import SentenceTransformer from sklearn.cluster import KMeans from sklearn.preprocessing import LabelEncoder - from .const import ( MODEL_BEDBASE, MODEL_ENCODE, @@ -23,8 +22,12 @@ NUM_CLUSTERS, REPO_ID, PEP_FILE_TYPES, + PROJECT_NAME, ) +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(PROJECT_NAME) + # TODO : convert to single np array before converting to tensor warnings.filterwarnings( "ignore", diff --git a/bedms/utils_train.py b/bedms/utils_train.py new file mode 100644 index 0000000..f3adda4 --- /dev/null +++ b/bedms/utils_train.py @@ -0,0 +1,694 @@ +""" +This 
module has all training util functions for 'bedms' +""" + +import os +import logging +from glob import glob +import warnings +from collections import Counter +from typing import List, Tuple, Iterator, Dict +import pickle +import random + + +import numpy as np +import pandas as pd +import torch +from torch.utils.data import TensorDataset, DataLoader +from sentence_transformers import SentenceTransformer +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.cluster import KMeans +from sklearn.preprocessing import LabelEncoder, label_binarize +from sklearn.metrics import ( + confusion_matrix, + auc, + roc_curve, +) +import matplotlib.pyplot as plt +import seaborn as sns +from .const import PROJECT_NAME + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(PROJECT_NAME) + +warnings.filterwarnings( + "ignore", + category=UserWarning, + message="Creating a tensor from a list of numpy.ndarrays is extremely slow.", +) + +def load_from_dir(dir: str) -> List[str]: + """ + Loads each file from the directory path. + + :param str dir: Path to the directory. + :return: List:paths to each file in the directory. + """ + return glob(os.path.join(dir, "*.csv")) + + +def load_and_preprocess(file_path: str) -> pd.DataFrame: + """ + Loads and Preprocesses each csv file as a Pandas DataFrame. + + :param str file_path: Path to each csv file. + :return pandas.DataFrame: df of each csv file. + """ + df = pd.read_csv(file_path, sep=",") + df.replace("NA", np.nan, inplace=True) + for column in df.columns: + most_common_val = df[column].mode().iloc[0] + df[column] = df[column].fillna(most_common_val) + return df + + +def accumulate_data( + files: List[Tuple[str, str]] +) -> Tuple[List[List[List[str]]], List[List[List[str]]], List[pd.Index]]: + """ + Accumulates data from multiple files into lists. + + :param List[Tuple[str, str]] files: List containing + sublists of values or header files. + :return Tuple[List[List[List[str]]], List[List[List[str]]],[List[str]]: + Lists of values, headers, labels. + A tuple containing three lists: + - A nested list of values (list of tables where + each table is a list of lists for columns), + - A nested list of headers (similar structure to values), + - A list of Pandas Index objects containing column labels. + """ + x_values_list = [] + x_headers_list = [] + y_list = [] + for values_file, headers_file in files: + df_values = load_and_preprocess(values_file) + df_headers = load_and_preprocess(headers_file) + df_values = df_values.fillna("") + df_headers = df_headers.fillna("") + y = df_values.columns + table_list = [] + # values list + for col in df_values.columns: + sublist_list = df_values[col].tolist() + table_list.append(sublist_list) + x_values_list.append(table_list) + # headers list + table_list = [] + for col in df_headers.columns: + sublist_list = df_headers[col].tolist() + table_list.append(sublist_list) + x_headers_list.append(table_list) + # y list + y_list.append(y) + + return x_values_list, x_headers_list, y_list + + +def lazy_loading(data_list: List, batch_size: int) -> Iterator[List]: + """ + Lazy loading for data in batches. + + :param List data_list: List of data to be loaded lazily. + :param int batch_size: Size of batch. + """ + for i in range(0, len(data_list), batch_size): + yield data_list[i : i + batch_size] + + +def get_top_training_cluster_averaged( + embeddings: List[torch.tensor], num: int +) -> torch.Tensor: + """ + Computes the clutser-averaged top training embeddings using k-means clustering. 
+ + :param List[torch.tensor] embeddings: List of embedding tensors to cluster. + :param int num: Number of clusters to be created using k-means. + :return torch.Tensor: A tensor representing the + average of embeddings in the most common cluster. + """ + flattened_embeddings = [embedding.tolist() for embedding in embeddings] + kmeans = KMeans(n_clusters=num, random_state=0).fit(flattened_embeddings) + labels_kmeans = kmeans.labels_ + cluster_counts = Counter(labels_kmeans) + most_common_cluster = max(cluster_counts, key=cluster_counts.get) + most_common_indices = [ + idx for idx, label in enumerate(labels_kmeans) if label == most_common_cluster + ] + most_common_embeddings = [ + torch.tensor(embeddings[idx]) for idx in most_common_indices + ] + + if most_common_embeddings: + top_k_average = torch.mean( + torch.stack(most_common_embeddings), dim=0 + ).unsqueeze(0) + else: + top_k_average = torch.zeros_like(most_common_embeddings[0]).unsqueeze(0) + return top_k_average + + +def training_encoding( + x_values_train_list: List[List[List[str]]], + x_headers_train_list: List[List[List[str]]], + y_train_list: List[pd.Index], + x_values_test_list: List[List[List[str]]], + x_headers_test_list: List[List[List[str]]], + y_test_list: List[pd.Index], + x_values_val_list: List[List[List[str]]], + x_headers_val_list: List[List[List[str]]], + y_val_list: List[pd.Index], + num_cluster: int, + vectorizer_pth: str, + label_encoder_pth: str, + sentence_transformer_model: str, +) -> Tuple[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + LabelEncoder, + List[str], + CountVectorizer, +]: + """ + Generates encoded headers and values. + + :param List[List[List[str]]] x_values_train_list: + Nested list containing the training set for values. + :param List[List[List[str]]] x_headers_train_list: + Nested list containing the training set for headers. + :param List[pd.Index] y_train_list: + List of the column labels ( attributes) for training. + :param List[List[List[str]]] x_values_test_list: + Nested list containing the testing set for values. + :param List[List[List[str]]] x_headers_test_list: + Nested list containing the testing set for headers. + :param List[pd.Index] y_test_list: + List of the column labels ( attributes) for testing. + :param List[List[List[str]]] x_values_val_list: + Nested list containing the validation set for values. + :param List[List[List[str]]] x_headers_val_list: + Nested list containing the validation set for headers. + :param List[pd.Index] y_val_list: + List of the column labels ( attributes) for validation. 
+ :return Tuple[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + LabelEncoder, + List[str], + CountVectorizer]: Returns a tuple of + - training dataset tensor + - testing dataset tensor + - validation dataset tensor + - trained label encoder + - list of unique values encountered during training + - Trained vectorizer for Bag of Words representation + + """ + # Bag of Words + flattened_list = [ + item for sublist in x_values_train_list for col in sublist for item in col + ] + vectorizer = CountVectorizer() + vectorizer.fit(flattened_list) + with open(vectorizer_pth, "wb") as f: + pickle.dump(vectorizer, f) + vocabulary_size = len(vectorizer.vocabulary_) + logger.info(f"Vocabulary size: {vocabulary_size}") + + # Sentence Transformers + model_name = sentence_transformer_model + sentence_encoder = SentenceTransformer(model_name) + + # Label Encoders + label_encoder = LabelEncoder() + flat_y_train = [",".join(y) for y in y_train_list] + individual_values = [value.strip() for y in flat_y_train for value in y.split(",")] + unique_values = set(individual_values) + unique_values_list = list(unique_values) + label_encoder.fit(unique_values_list) + + with open(label_encoder_pth, "wb") as f: + pickle.dump(label_encoder, f) + + def encode_data( + x_values_list: List[List[List[str]]], + x_headers_list: List[List[List[str]]], + y_list: List[pd.Index], + num_cluster: int, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + This nested function encodes the values, headers and labels data. + It is called for thrice - training, testing, validation. + + :param List[List[List[str]]] x_values_list: Nested list containing values. + :param List[List[List[str]]] x_headers_list: Nested list containing headers. + :param List[pd.Index] y_list: Labels (attributes) list. + :param int num_cluster: Number of clusters to be generated. 
+ """ + x_values_bow_tensors = [] + x_values_embeddings_tensors = [] + x_headers_embeddings_tensors = [] + y_tensors = [] + + for x_values, x_headers, y in zip(x_values_list, x_headers_list, y_list): + + for i in range(len(x_values)): # Iterate over columns + # BoW Representation + x_values_bow = vectorizer.transform(x_values[i]).toarray() + x_values_bow_tensor = ( + torch.tensor(x_values_bow, dtype=torch.float32) + .mean(dim=0) + .unsqueeze(0) + .clone() + .detach() + ) + + # Embeddings for Values + embeddings_values = [ + sentence_encoder.encode(str(value), show_progress_bar=False) + for value in x_values[i] + ] + + top_k_average_values = get_top_training_cluster_averaged( + embeddings_values, num_cluster + ) # Average of all embeddings + x_values_embeddings_tensor = top_k_average_values.clone().detach() + + # Embeddings for Headers + embeddings_headers = [ + sentence_encoder.encode(str(header), show_progress_bar=False) + for header in x_headers[i] + ] + + top_k_average_headers = get_top_training_cluster_averaged( + embeddings_headers, num_cluster + ) # Average of all embeddings + x_headers_embeddings_tensor = top_k_average_headers.clone().detach() + + # Labels + y_col = label_encoder.transform([y[i]]) + y_col_tensor = torch.tensor(y_col, dtype=torch.long).clone().detach() + + x_values_bow_tensors.append(x_values_bow_tensor) + x_values_embeddings_tensors.append(x_values_embeddings_tensor) + x_headers_embeddings_tensors.append(x_headers_embeddings_tensor) + y_tensors.append(y_col_tensor) + + x_values_bow_tensor = torch.cat( + x_values_bow_tensors, dim=0 + ) # this has [num_cols, vocab_size] + x_values_embeddings_tensor = torch.cat( + x_values_embeddings_tensors, dim=0 + ) # [num_cols, embedding_dim] + x_headers_embeddings_tensor = torch.cat(x_headers_embeddings_tensors, dim=0) + y_tensor = torch.cat(y_tensors, dim=0) # [num_cols] + + return ( + x_values_bow_tensor, + x_values_embeddings_tensor, + x_headers_embeddings_tensor, + y_tensor, + ) + + train_data = encode_data( + x_values_train_list, x_headers_train_list, y_train_list, num_cluster + ) + test_data = encode_data( + x_values_test_list, x_headers_test_list, y_test_list, num_cluster + ) + val_data = encode_data( + x_values_val_list, x_headers_val_list, y_val_list, num_cluster + ) + + return ( + train_data, + test_data, + val_data, + label_encoder, + vectorizer, + ) + + +def data_loader( + encoded_data: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + batch_size: int, +) -> DataLoader: + """ + Creates a DataLoader from encoded tensor data. + + :param [torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] encoded_data: + Tuple containing tensors for + values bag of words, values embeddings, headers embeddings, and labels. + :param int batch_size: The number of samples per batch for the DataLoader. + :return DataLoader: A PyTorch DataLoader which yields + batches of data from the given tensors. + """ + ( + x_values_bow_tensor, + x_values_embeddings_tensor, + x_headers_embeddings_tensor, + y_tensor, + ) = encoded_data + # Convert data to TensorDataset + dataset = TensorDataset( + x_values_bow_tensor, + x_values_embeddings_tensor, + x_headers_embeddings_tensor, + y_tensor, + ) + # Create DataLoader + return DataLoader(dataset, batch_size=batch_size, shuffle=True) + + +def drop_bow(bow_tensor: torch.Tensor, num_drops: int) -> torch.Tensor: + """ + Randomly drops a specified number of columns in the + Bag of Words tensor for regularization. + + :param torch.Tensor bow_tensor: Bag of Words tensor. 
+ :param int num_drops: Number of columns to be randomly + dropped from the Bag of Words tensor. + :return torch.Tensor: Bag of Words tensor with dropped columns. + """ + num_columns = bow_tensor.size(0) + columns = list(range(num_columns)) + columns_to_drop = random.sample(columns, num_drops) + + mask = torch.ones(num_columns, dtype=torch.bool) + mask[columns_to_drop] = False + mask = mask.unsqueeze(1).expand_as(bow_tensor) + + # Apply the mask to the BoW tensor + dropped_bow_tensor = bow_tensor.clone() + dropped_bow_tensor[~mask] = 0.0 + + return dropped_bow_tensor + + +def train_model( + model: torch.nn.Module, + train_loader: DataLoader, + val_loader: DataLoader, + criterion: torch.nn.Module, + optimizer: torch.optim.Optimizer, + device: torch.device, + num_epochs: int, + output_size: int, + model_pth: str, + bow_drops: int, +) -> Tuple[ + List[float], + List[float], + List[float], + List[float], + Dict[int, np.ndarray], + Dict[int, np.ndarray], + Dict[int, float], +]: + """ + Trains and validates the neural network model. + + :param torch.nn.Module model: The neural network model to be trained. + :param DataLoader train_loader: DataLoader for the training set. + :param DataLoader val_loader: DataLoader for the validation set. + :param torch.nn.Module criterion: The loss function used to compute loss during training. + :param torch.optim.Optimizer optimizer: The optimizer to update the model parameters. + :param torch.device device: The device (CPU or GPU) on which the model will be trained. + :param int num_epochs: The number of epochs to train the model. + :param int output_size: The size of the model's output layer. + :param str model_pth: The file path to where the model would be saved. + :param int bow_drops: The number of Bag of Words columns to be dropped. + :return Tuple: + - List[float]: Train accuracy per epoch. + - List[float]: Validation accuracy per epoch. + - List[float]: Train loss per epoch. + - List[float]: Validation loss per epoch. + - Dict[int, np.ndarray]: Dictionary of False Positive Rates (FPR). + - Dict[int, np.ndarray]: Dictionary of True Positive Rates (TPR). + - Dict[int, float]: Dictionary of Area Under the ROC Curve for different classes. 
+ """ + patience = 3 + train_accuracies = [] + val_accuracies = [] + train_losses = [] + val_losses = [] + + best_val_loss = float("inf") + best_epoch = 0 + early_stop = False + + model.train() + + for epoch in range(num_epochs): + total_samples = 0 + correct_predictions = 0 + train_loss = 0.0 + for x_values_bow, x_values_embeddings, x_headers_embeddings, y in train_loader: + x_values_bow = x_values_bow.to(device) + x_values_embeddings = x_values_embeddings.to(device) + x_headers_embeddings = x_headers_embeddings.to(device) + y = y.to(device) + + x_values_bow = drop_bow(x_values_bow, bow_drops) + + optimizer.zero_grad() + outputs = model(x_values_bow, x_values_embeddings, x_headers_embeddings) + + loss = criterion(outputs, y) + loss.backward() + optimizer.step() + train_loss += loss.item() * x_values_bow.size(0) + + _, predicted = torch.max(outputs, 1) + total_samples += y.size(0) + correct_predictions += (predicted == y).sum().item() + + train_accuracy = correct_predictions / total_samples * 100 + train_accuracies.append(train_accuracy) + train_loss = train_loss / len(train_loader.dataset) + train_losses.append(train_loss) + + model.eval() + val_loss = 0.0 + correct_predictions_val = 0 + total_samples_val = 0 + y_true = [] + y_scores = [] + with torch.no_grad(): + for ( + x_values_bow, + x_values_embeddings, + x_headers_embeddings, + y, + ) in val_loader: + x_values_bow = x_values_bow.to(device) + x_values_embeddings = x_values_embeddings.to(device) + x_headers_embeddings = x_headers_embeddings.to(device) + y = y.to(device) + outputs = model(x_values_bow, x_values_embeddings, x_headers_embeddings) + loss = criterion(outputs, y) + val_loss += loss.item() * x_values_bow.size(0) + + _, predicted = torch.max(outputs, 1) + total_samples_val += y.size(0) + correct_predictions_val += (predicted == y).sum().item() + y_true.extend(y.cpu().numpy()) + y_scores.extend(outputs.cpu().numpy()) + + val_loss = val_loss / len(val_loader.dataset) + val_accuracy = correct_predictions_val / total_samples_val * 100 + val_accuracies.append(val_accuracy) + val_losses.append(val_loss) + + print( + f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss:.4f}, \ + Training Accuracy: {train_accuracy:.2f}%, Validation Loss: {val_loss:.4f}, \ + Validation Accuracy: {val_accuracy:.2f}%" + ) + + # Early stop + + if val_loss < best_val_loss: + best_val_loss = val_loss + best_epoch = epoch + torch.save(model.state_dict(), model_pth) + elif epoch - best_epoch >= patience: + early_stop = True + if early_stop: + print(f"Early stop at {best_epoch + 1} epoch.") + y_true = label_binarize(y_true, classes=list(range(output_size))) + + # Convert to numpy arrays + y_true = np.array(y_true) + y_scores = np.array(y_scores) + + # Calculate ROC curves and AUC + fpr = {} + tpr = {} + roc_auc = {} + + for i in range( + output_size + ): # Replace output_size with your actual number of classes + fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_scores[:, i]) + roc_auc[i] = auc(fpr[i], tpr[i]) + + return train_accuracies, val_accuracies, train_losses, val_losses, fpr, tpr, roc_auc + + +def model_testing( + model: torch.nn.Module, test_loader: DataLoader, loss_fn: torch.nn.Module +) -> Tuple[List[int], List[int], torch.Tensor]: + """ + This functions tests the model. + + :param torch.nn.Module model: The trained model. + :param DataLoader test_loader: DataLoader for the testing set. + :param torch.nn.Module loss_fn: The loss function used to compute loss. + :return Tuple: + - List[int]: List of all the predictions made by the model. 
+ - List[int]: List of all the true labels ( Ground truth) + - torch.Tensor: Logist from the model for the test dataset. + """ + all_preds = [] + all_labels = [] + model.eval() + total_loss_test = 0.0 + total_correct_test = 0 + total_samples_test = 0 + with torch.no_grad(): + for values_batch, bow_batch, headers_batch, labels in test_loader: + outputs = model(values_batch, bow_batch, headers_batch) + loss = loss_fn(outputs, labels) + total_loss_test += loss.item() + _, predicted_test = torch.max(outputs, 1) + correct_test = (predicted_test == labels).sum().item() + total_correct_test += correct_test + total_samples_test += labels.size(0) + all_preds.extend(predicted_test.cpu().numpy()) + all_labels.extend(labels.cpu().numpy()) + test_accuracy = total_correct_test / total_samples_test + test_loss = total_loss_test / len(test_loader) + logger.info(f"Test Accuracy: {test_accuracy}, Test Loss: {test_loss}") + + return all_preds, all_labels + + +def plot_learning_curve( + num_epochs: int, + train_accuracies: List[float], + val_accuracies: List[float], + train_losses: List[float], + val_losses: List[float], + accuracy_fig_pth: str, + loss_fig_pth: str, +) -> None: + """ + Plots the learning curves - accuracy and loss for Training and Validation of the model. + + :param int num_epochs: Number of epochs for which the model was trained. + :param List[float] train_accuracies: List of training accuracies for each epoch. + :param List[float] val_accuracies: List of validation accuracies for each epoch. + :param List[float] train_losses: List of training losses for each epoch. + :param List[float] val_losses: List of validation losses for each epoch. + :param str accuracy_fig_pth: Path where the accuracy curve figure will be saved. + :param str loss_fig_pth: Path where the loss curve figure will be saved. + """ + + # accuracy + plt.plot(range(1, num_epochs + 1), train_accuracies, label="Training Accuracy") + plt.plot(range(1, num_epochs + 1), val_accuracies, label="Validation Accuracy") + plt.xlabel("Epoch") + plt.ylabel("Accuracy") + plt.title("Learning Curve") + plt.legend() + plt.grid(True) + plt.savefig(accuracy_fig_pth, format="svg") + plt.show() + plt.close() + # loss + plt.plot(range(1, num_epochs + 1), train_losses, label="Training Loss") + plt.plot(range(1, num_epochs + 1), val_losses, label="Validation Loss") + plt.xlabel("Epoch") + plt.ylabel("Loss") + plt.title("Learning Curve") + plt.legend() + plt.grid(True) + plt.savefig(loss_fig_pth, format="svg") + plt.show() + plt.close() + + +def plot_confusion_matrix( + y_true: List[int], + y_pred: List[int], + unique_values_list: List[str], + confusion_matrix_fig_pth: str, +) -> None: + """ + Plots confusion matrix for the test data. + + :param List[int] y_true: List of true labels ( Ground Truth) + :param List[int] y_pred: List of predictions made by the model. + :param List[str] unique_values_list: List of all the classes that the model predicted. + :param str confusion_matrix_fig_pth: Path where the confusion matrix figure will be saved. 
+ """ + conf_matrix = confusion_matrix(y_true, y_pred) + plt.figure(figsize=(12, 12)) + sns.heatmap( + conf_matrix, + annot=True, + fmt="d", + cmap="Blues", + xticklabels=np.unique(unique_values_list), + yticklabels=np.unique(unique_values_list), + ) + plt.title("Confusion Matrix") + plt.xlabel("Predicted Label") + plt.ylabel("True Label") + plt.savefig(confusion_matrix_fig_pth, format="svg") + plt.show() + plt.close() + class_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1) + for i, acc in enumerate(class_accuracy): + print(f"Accuracy for class {i}: {acc:.4f}") + + +def auc_roc_curve( + fpr: Dict[int, np.ndarray], + tpr: Dict[int, np.ndarray], + roc_auc: Dict[int, float], + output_size: int, + roc_fig_pth: str, +) -> None: + """ + Plots the ROC Curve. + + :param Dict[int, np.ndarray] fpr: Dictionary of False Positive Rates + :param Dicr[int, np.ndarray] tpr: Dictionary of True Positive Rates + :param Dict[int, float] roc_auc: Dictionary of Area Under Curve for ROC for different classes. + :param int output_size: The number of classes the model predicted into. + :param str roc_fig_pth: Path to where the ROC figure will be saved. + """ + plt.figure(figsize=(12, 12)) + for i in range(output_size): + plt.plot( + fpr[i], + tpr[i], + lw=2, + label="ROC curve (class %d) (AUC = %0.2f)" % (i, roc_auc[i]), + ) + + plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--") + plt.xlim([0.0, 1.0]) + plt.ylim([0.0, 1.05]) + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + plt.title("Receiver Operating Characteristic (ROC) Curve") + plt.legend(loc="lower right") + plt.savefig(roc_fig_pth, format="svg") + plt.show() + plt.close() From a4863b080de024086789dd8b96d0954f27f022d3 Mon Sep 17 00:00:00 2001 From: saanikat Date: Wed, 2 Oct 2024 11:31:59 -0400 Subject: [PATCH 06/18] black --- bedms/attr_standardizer.py | 2 +- bedms/utils.py | 1 + bedms/utils_train.py | 37 +++++++++++++++++++------------------ 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py index 2372f0a..7b93f1d 100644 --- a/bedms/attr_standardizer.py +++ b/bedms/attr_standardizer.py @@ -62,7 +62,7 @@ def __init__( Initializes the attribute standardizer with user provided schema, loads the model. :param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS" - :param str custom_param: User provided config file for + :param str custom_param: User provided config file for custom parameters, if they choose "CUSTOM" schema. :param int confidence: Confidence threshold for the predictions. """ diff --git a/bedms/utils.py b/bedms/utils.py index bd8c0a9..fb63805 100644 --- a/bedms/utils.py +++ b/bedms/utils.py @@ -1,6 +1,7 @@ """ This module has all util functions for 'bedms' """ + import logging import warnings from collections import Counter diff --git a/bedms/utils_train.py b/bedms/utils_train.py index f3adda4..09f861b 100644 --- a/bedms/utils_train.py +++ b/bedms/utils_train.py @@ -38,6 +38,7 @@ message="Creating a tensor from a list of numpy.ndarrays is extremely slow.", ) + def load_from_dir(dir: str) -> List[str]: """ Loads each file from the directory path. @@ -69,12 +70,12 @@ def accumulate_data( """ Accumulates data from multiple files into lists. - :param List[Tuple[str, str]] files: List containing + :param List[Tuple[str, str]] files: List containing sublists of values or header files. 
- :return Tuple[List[List[List[str]]], List[List[List[str]]],[List[str]]: + :return Tuple[List[List[List[str]]], List[List[List[str]]],[List[str]]: Lists of values, headers, labels. A tuple containing three lists: - - A nested list of values (list of tables where + - A nested list of values (list of tables where each table is a list of lists for columns), - A nested list of headers (similar structure to values), - A list of Pandas Index objects containing column labels. @@ -125,7 +126,7 @@ def get_top_training_cluster_averaged( :param List[torch.tensor] embeddings: List of embedding tensors to cluster. :param int num: Number of clusters to be created using k-means. - :return torch.Tensor: A tensor representing the + :return torch.Tensor: A tensor representing the average of embeddings in the most common cluster. """ flattened_embeddings = [embedding.tolist() for embedding in embeddings] @@ -174,23 +175,23 @@ def training_encoding( """ Generates encoded headers and values. - :param List[List[List[str]]] x_values_train_list: + :param List[List[List[str]]] x_values_train_list: Nested list containing the training set for values. - :param List[List[List[str]]] x_headers_train_list: + :param List[List[List[str]]] x_headers_train_list: Nested list containing the training set for headers. - :param List[pd.Index] y_train_list: + :param List[pd.Index] y_train_list: List of the column labels ( attributes) for training. - :param List[List[List[str]]] x_values_test_list: + :param List[List[List[str]]] x_values_test_list: Nested list containing the testing set for values. - :param List[List[List[str]]] x_headers_test_list: + :param List[List[List[str]]] x_headers_test_list: Nested list containing the testing set for headers. - :param List[pd.Index] y_test_list: + :param List[pd.Index] y_test_list: List of the column labels ( attributes) for testing. - :param List[List[List[str]]] x_values_val_list: + :param List[List[List[str]]] x_values_val_list: Nested list containing the validation set for values. - :param List[List[List[str]]] x_headers_val_list: + :param List[List[List[str]]] x_headers_val_list: Nested list containing the validation set for headers. - :param List[pd.Index] y_val_list: + :param List[pd.Index] y_val_list: List of the column labels ( attributes) for validation. :return Tuple[ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], @@ -240,7 +241,7 @@ def encode_data( num_cluster: int, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ - This nested function encodes the values, headers and labels data. + This nested function encodes the values, headers and labels data. It is called for thrice - training, testing, validation. :param List[List[List[str]]] x_values_list: Nested list containing values. @@ -339,11 +340,11 @@ def data_loader( """ Creates a DataLoader from encoded tensor data. - :param [torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] encoded_data: + :param [torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] encoded_data: Tuple containing tensors for values bag of words, values embeddings, headers embeddings, and labels. :param int batch_size: The number of samples per batch for the DataLoader. - :return DataLoader: A PyTorch DataLoader which yields + :return DataLoader: A PyTorch DataLoader which yields batches of data from the given tensors. 
""" ( @@ -365,11 +366,11 @@ def data_loader( def drop_bow(bow_tensor: torch.Tensor, num_drops: int) -> torch.Tensor: """ - Randomly drops a specified number of columns in the + Randomly drops a specified number of columns in the Bag of Words tensor for regularization. :param torch.Tensor bow_tensor: Bag of Words tensor. - :param int num_drops: Number of columns to be randomly + :param int num_drops: Number of columns to be randomly dropped from the Bag of Words tensor. :return torch.Tensor: Bag of Words tensor with dropped columns. """ From e67e094153075866b779784cf1d90aa155d1a394 Mon Sep 17 00:00:00 2001 From: saanikat Date: Wed, 2 Oct 2024 11:33:53 -0400 Subject: [PATCH 07/18] README updated with custom training --- README.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3075eb6..ac46292 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # BEDMS -BEDMS (BED Metadata Standardizer) is a tool used to standardize genomics/epigenomics metadata based on a schema chosen by the user ( eg. ENCODE, FAIRTRACKS, BEDBASE). - +BEDMS (BED Metadata Standardizer) is a tool desgined to standardize genomics and epigenomics metadata attributes according to user-selected schemas such as `ENCODE`, `FAIRTRACKS` and `BEDBASE`. BEDMS ensures consistency and FAIRness of metadata across different platforms. Additionally, users have the option to train their own standardizer model using a custom schema (`CUSTOM`), allowing for the standardization of attributes based on users' specific research requirements. ## Installation @@ -16,6 +15,7 @@ pip install git+https://github.com/databio/bedms.git ## Usage +### Standardizing based on available schemas ```python from bedms import AttrStandardizer @@ -25,7 +25,57 @@ results = model.standardize(pep="geo/gse228634:default") assert results ``` +### Training custom schemas +Training your custom schema is very easy with `BEDMS`. You would need two things to get started: +1. Training Sets +2. `training_config.yaml` + +To instantiate `TrainStandardizer` class: + +```python +from bedms.train import TrainStandardizer + +trainer = TrainStandardizer("training_config.yaml") + +``` +To load the datasets and encode them: + +```python +trainer.load_encode_data() +``` + +To train the custom model: + +```python +trainer.training() +``` + +To test the custom model: + +```python +trainer.testing() +``` + +To generate visualizations such as Learning Curves, Confusion Matrices, and ROC Curve: + +```python +trainer.plot_visualizations() +``` + +### Standardizing based on custom schema +For standardizing based on custom schema, you would require a `custom_config.yaml`. 
+ +```python +from bedms import AttrStandardizer + +model = AttrStandardizer("CUSTOM", "custom_config.yaml") + +results = model.standardize(pep="geo/gse228634:default") + +assert results +``` +### Available schemas To see the available schemas, you can run: ``` from bedms.const import AVAILABLE_SCHEMAS From 24a5d77f3db667b26461a4db01048ccbb5a628d4 Mon Sep 17 00:00:00 2001 From: saanikat Date: Wed, 2 Oct 2024 15:10:43 -0400 Subject: [PATCH 08/18] minor changes in comments --- bedms/utils_train.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bedms/utils_train.py b/bedms/utils_train.py index 09f861b..ab64566 100644 --- a/bedms/utils_train.py +++ b/bedms/utils_train.py @@ -522,7 +522,6 @@ def train_model( print(f"Early stop at {best_epoch + 1} epoch.") y_true = label_binarize(y_true, classes=list(range(output_size))) - # Convert to numpy arrays y_true = np.array(y_true) y_scores = np.array(y_scores) @@ -531,9 +530,7 @@ def train_model( tpr = {} roc_auc = {} - for i in range( - output_size - ): # Replace output_size with your actual number of classes + for i in range(output_size): fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_scores[:, i]) roc_auc[i] = auc(fpr[i], tpr[i]) From 03ad055be5f86e9f1cffd808005b38b78457862e Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 11:25:41 -0400 Subject: [PATCH 09/18] separating schemas from bedms --- bedms/__init__.py | 3 + bedms/attr_standardizer.py | 209 +++++++++++++------------------------ bedms/const.py | 19 ---- bedms/train.py | 2 +- bedms/utils.py | 20 ---- 5 files changed, 79 insertions(+), 174 deletions(-) diff --git a/bedms/__init__.py b/bedms/__init__.py index d0d13a5..99bc695 100644 --- a/bedms/__init__.py +++ b/bedms/__init__.py @@ -3,3 +3,6 @@ """ from .attr_standardizer import AttrStandardizer +from .train import AttrStandardizerTrainer + +__all__ = ["AttrStandardizer", "AttrStandardizerTrainer"] diff --git a/bedms/attr_standardizer.py b/bedms/attr_standardizer.py index 7b93f1d..c823890 100644 --- a/bedms/attr_standardizer.py +++ b/bedms/attr_standardizer.py @@ -3,6 +3,9 @@ """ import logging +import glob +import os +import yaml from typing import Dict, Tuple, Union, Optional import pickle import peppy @@ -14,33 +17,11 @@ from .const import ( AVAILABLE_SCHEMAS, CONFIDENCE_THRESHOLD, - DROPOUT_PROB, - EMBEDDING_SIZE, - HIDDEN_SIZE, - INPUT_SIZE_BOW_BEDBASE, - INPUT_SIZE_BOW_ENCODE, - INPUT_SIZE_BOW_FAIRTRACKS, - OUTPUT_SIZE_BEDBASE, - OUTPUT_SIZE_ENCODE, - OUTPUT_SIZE_FAIRTRACKS, PROJECT_NAME, SENTENCE_TRANSFORMER_MODEL, - REPO_ID, - ENCODE_VECTORIZER_FILENAME, - ENCODE_LABEL_ENCODER_FILENAME, - FAIRTRACKS_VECTORIZER_FILENAME, - FAIRTRACKS_LABEL_ENCODER_FILENAME, - BEDBASE_VECTORIZER_FILENAME, - BEDBASE_LABEL_ENCODER_FILENAME, ) from .model import BoWSTModel -from .utils import ( - data_encoding, - data_preprocessing, - fetch_from_pephub, - get_any_pep, - load_from_huggingface, -) +from .utils import data_encoding, data_preprocessing, fetch_from_pephub, get_any_pep logging.basicConfig(level=logging.INFO) @@ -54,82 +35,53 @@ class AttrStandardizer: def __init__( self, - schema: str, + repo_id: str, + model_name: str, custom_param: Optional[str] = None, confidence: int = CONFIDENCE_THRESHOLD, ) -> None: """ Initializes the attribute standardizer with user provided schema, loads the model. 
- :param str schema: User provided schema, can be "ENCODE" or "FAIRTRACKS" + :param str repo_id: HuggingFace repository ID + :param str model_name: Name of the schema model :param str custom_param: User provided config file for custom parameters, if they choose "CUSTOM" schema. :param int confidence: Confidence threshold for the predictions. """ - self.schema = schema + self.repo_id = repo_id + self.model_name = model_name self.conf_threshold = confidence self.custom_param = custom_param - - if self.schema == "CUSTOM" and self.custom_param: - self.custom_param = self._load_custom_param(self.custom_param) self.model, self.vectorizer, self.label_encoder = self._load_model() - def _load_custom_param(self, config_pth: str) -> Dict[str, Tuple]: - """ - Loads the custom parameters from the config file provided by the user. - - :param str config_pth: Path to the config file which has the custom parameters. - :return Dict[str, Tuple]: Custom Parameters dictionary. - """ - with open(config_pth, "r", encoding="utf-8") as file: - return yaml.safe_load(file) - def _get_parameters(self) -> Tuple[int, int, int, int, int, float]: """ Get the model parameters as per the chosen schema. :return Tuple[int, int, int, int, int, int, float]: Tuple containing the model parameters. """ - if self.schema == "ENCODE": - return ( - INPUT_SIZE_BOW_ENCODE, - EMBEDDING_SIZE, - EMBEDDING_SIZE, - HIDDEN_SIZE, - OUTPUT_SIZE_ENCODE, - DROPOUT_PROB, - ) - if self.schema == "FAIRTRACKS": - return ( - INPUT_SIZE_BOW_FAIRTRACKS, - EMBEDDING_SIZE, - EMBEDDING_SIZE, - HIDDEN_SIZE, - OUTPUT_SIZE_FAIRTRACKS, - DROPOUT_PROB, - ) - if self.schema == "BEDBASE": - return ( - INPUT_SIZE_BOW_BEDBASE, - EMBEDDING_SIZE, - EMBEDDING_SIZE, - HIDDEN_SIZE, - OUTPUT_SIZE_BEDBASE, - DROPOUT_PROB, - ) - if self.schema == "CUSTOM": - return ( - self.custom_param["model"]["input_size_bow"], - self.custom_param["model"]["input_size_embeddings"], - self.custom_param["model"]["input_size_embeddings"], - self.custom_param["model"]["hidden_size"], - self.custom_param["model"]["output_size"], - self.custom_param["model"]["dropout_prob"], - ) - - raise ValueError( - f"Schema not available: {self.schema}." - "Presently, four schemas are available: ENCODE , FAIRTRACKS, BEDBASE, CUSTOM" + config_filename = f"config_{self.model_name}.yaml" + config_pth = hf_hub_download( + repo_id=self.repo_id, + filename=os.path.join(self.model_name, config_filename), + ) + with open(config_pth, "r") as file: + config = yaml.safe_load(file) + + input_size_bow = config["params"]["input_size_bow"] + embedding_size = config["params"]["embedding_size"] + hidden_size = config["params"]["hidden_size"] + output_size = config["params"]["output_size"] + dropout_prob = config["params"]["dropout_prob"] + + return ( + input_size_bow, + embedding_size, + embedding_size, + hidden_size, + output_size, + dropout_prob, ) def _load_model(self) -> Tuple[nn.Module, object, object]: @@ -140,65 +92,54 @@ def _load_model(self) -> Tuple[nn.Module, object, object]: :return object: The scikit learn vectorizer for bag of words encoding. :return object: Label encoder object for the labels (y). 
""" - try: - if self.schema == "ENCODE": - filename_vc = ENCODE_VECTORIZER_FILENAME - filename_lb = ENCODE_LABEL_ENCODER_FILENAME - elif self.schema == "FAIRTRACKS": - filename_vc = FAIRTRACKS_VECTORIZER_FILENAME - filename_lb = FAIRTRACKS_LABEL_ENCODER_FILENAME - elif self.schema == "BEDBASE": - filename_vc = BEDBASE_VECTORIZER_FILENAME - filename_lb = BEDBASE_LABEL_ENCODER_FILENAME - elif self.schema == "CUSTOM": - vc_path = self.custom_param["paths"]["vectorizer_pth"] - lb_path = self.custom_param["paths"]["label_encoder_pth"] - state_dict = torch.load(self.custom_param["paths"]["model_pth"]) - else: - raise ValueError(f"Schema not available: {self.schema}") - - if self.schema != "CUSTOM": - vc_path = hf_hub_download( - repo_id=REPO_ID, - filename=filename_vc, - ) - lb_path = hf_hub_download( - repo_id=REPO_ID, - filename=filename_lb, - ) - model = load_from_huggingface(self.schema) - state_dict = torch.load(model) + model_filename = f"model_{self.model_name}.pth" + label_encoder_filename = f"label_encoder_{self.model_name}.pkl" + vectorizer_filename = f"vectorizer_{self.model_name}.pkl" - with open(vc_path, "rb") as f: - vectorizer = pickle.load(f) + model_pth = hf_hub_download( + repo_id=self.repo_id, filename=os.path.join(self.model_name, model_filename) + ) - with open(lb_path, "rb") as f: - label_encoder = pickle.load(f) + vc_path = hf_hub_download( + repo_id=self.repo_id, + filename=os.path.join(self.model_name, vectorizer_filename), + ) - ( - input_size_values, - input_size_values_embeddings, - input_size_headers, - hidden_size, - output_size, - dropout_prob, - ) = self._get_parameters() - - model = BoWSTModel( - input_size_values, - input_size_values_embeddings, - input_size_headers, - hidden_size, - output_size, - dropout_prob, - ) - model.load_state_dict(state_dict) - model.eval() - return model, vectorizer, label_encoder + lb_path = hf_hub_download( + repo_id=self.repo_id, + filename=os.path.join(self.model_name, label_encoder_filename), + ) - except Exception as e: - logger.error(f"Error loading the model: {str(e)}") - raise + with open(vc_path, "rb") as f: + vectorizer = pickle.load(f) + + with open(lb_path, "rb") as f: + label_encoder = pickle.load(f) + + state_dict = torch.load(model_pth) + + ( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + output_size, + dropout_prob, + ) = self._get_parameters() + + model = BoWSTModel( + input_size_values, + input_size_values_embeddings, + input_size_headers, + hidden_size, + output_size, + dropout_prob, + ) + + model.load_state_dict(state_dict) + model.eval() + + return model, vectorizer, label_encoder def standardize( self, pep: Union[str, peppy.Project] diff --git a/bedms/const.py b/bedms/const.py index e200fda..c36f5f4 100644 --- a/bedms/const.py +++ b/bedms/const.py @@ -6,25 +6,6 @@ AVAILABLE_SCHEMAS = ["ENCODE", "FAIRTRACKS", "BEDBASE", "CUSTOM"] PEP_FILE_TYPES = ["yaml", "csv"] -REPO_ID = "databio/attribute-standardizer-model6" -MODEL_ENCODE = "model_encode.pth" -MODEL_FAIRTRACKS = "model_fairtracks.pth" -MODEL_BEDBASE = "model_bedbase.pth" -ENCODE_VECTORIZER_FILENAME = "vectorizer_encode.pkl" -FAIRTRACKS_VECTORIZER_FILENAME = "vectorizer_fairtracks.pkl" -BEDBASE_VECTORIZER_FILENAME = "vectorizer_bedbase.pkl" -ENCODE_LABEL_ENCODER_FILENAME = "label_encoder_encode.pkl" -FAIRTRACKS_LABEL_ENCODER_FILENAME = "label_encoder_fairtracks.pkl" -BEDBASE_LABEL_ENCODER_FILENAME = "label_encoder_bedbase.pkl" SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2" -HIDDEN_SIZE = 32 -DROPOUT_PROB = 0.113 
CONFIDENCE_THRESHOLD = 0.70 -EMBEDDING_SIZE = 384 -INPUT_SIZE_BOW_ENCODE = 10459 -INPUT_SIZE_BOW_FAIRTRACKS = 13617 -OUTPUT_SIZE_FAIRTRACKS = 15 -OUTPUT_SIZE_ENCODE = 18 NUM_CLUSTERS = 3 -INPUT_SIZE_BOW_BEDBASE = 13708 -OUTPUT_SIZE_BEDBASE = 12 diff --git a/bedms/train.py b/bedms/train.py index d2816fd..9bc3438 100644 --- a/bedms/train.py +++ b/bedms/train.py @@ -29,7 +29,7 @@ logger = logging.getLogger(PROJECT_NAME) -class TrainStandardizer: +class AttrStandardizerTrainer: """ This is the training class responsible for managing the training process for the standardizer model. diff --git a/bedms/utils.py b/bedms/utils.py index fb63805..20e7128 100644 --- a/bedms/utils.py +++ b/bedms/utils.py @@ -17,11 +17,7 @@ from sklearn.cluster import KMeans from sklearn.preprocessing import LabelEncoder from .const import ( - MODEL_BEDBASE, - MODEL_ENCODE, - MODEL_FAIRTRACKS, NUM_CLUSTERS, - REPO_ID, PEP_FILE_TYPES, PROJECT_NAME, ) @@ -50,22 +46,6 @@ def fetch_from_pephub(project: peppy.Project) -> pd.DataFrame: return csv_file_df -def load_from_huggingface(schema: str) -> Optional[Any]: - """ - Load a model from HuggingFace based on the schema of choice. - - :param str schema: Schema Type - :return Optional[Any]: Loaded model object - """ - if schema == "ENCODE": - model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_ENCODE) - elif schema == "FAIRTRACKS": - model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FAIRTRACKS) - elif schema == "BEDBASE": - model = hf_hub_download(repo_id=REPO_ID, filename=MODEL_BEDBASE) - return model - - def data_preprocessing( df: pd.DataFrame, ) -> Tuple[List[List[str]], List[str], List[List[str]], int]: From 27df60707949d1d394875b349eee670317a3551d Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 11:32:12 -0400 Subject: [PATCH 10/18] const error solved --- bedms/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bedms/train.py b/bedms/train.py index 9bc3438..b50b9f5 100644 --- a/bedms/train.py +++ b/bedms/train.py @@ -21,7 +21,7 @@ plot_confusion_matrix, auc_roc_curve, ) -from .const import PROJECT_NAME, EMBEDDING_SIZE +from .const import PROJECT_NAME from .model import BoWSTModel From 3cd2e7d2c627d2ee687950793c6488f117c4602e Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 11:37:30 -0400 Subject: [PATCH 11/18] matplotlib --- requirements/requirements-all.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 3f373a4..397ce51 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -4,3 +4,4 @@ torch sentence-transformers pephubclient>=0.4.2 peppy>=0.40.6 +matplotlib \ No newline at end of file From 37d1fe690a9776a6fb3225af7795d97371d22da1 Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 11:40:57 -0400 Subject: [PATCH 12/18] req --- requirements/requirements-all.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 397ce51..daeede2 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -4,4 +4,5 @@ torch sentence-transformers pephubclient>=0.4.2 peppy>=0.40.6 -matplotlib \ No newline at end of file +matplotlib +seaborn \ No newline at end of file From 5edee67e4ee5121f2617c59d582a70caaffab9bf Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 11:47:17 -0400 Subject: [PATCH 13/18] updated test --- tests/test_bedms.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/tests/test_bedms.py b/tests/test_bedms.py index a47dfb1..b2d84d1 100755 --- a/tests/test_bedms.py +++ b/tests/test_bedms.py @@ -3,7 +3,7 @@ class TestBEDMES: def test_bedmes(self): - model = AttrStandardizer("ENCODE") + model = AttrStandardizer(repo_id='databio/attribute-standardizer-model6', model_name='encode') results = model.standardize(pep="geo/gse228634:default") assert results From e6ace1291ce06b36a957f0140b1309e9446c8fdd Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 11:48:36 -0400 Subject: [PATCH 14/18] lint --- tests/test_bedms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_bedms.py b/tests/test_bedms.py index b2d84d1..fc654a1 100755 --- a/tests/test_bedms.py +++ b/tests/test_bedms.py @@ -3,7 +3,9 @@ class TestBEDMES: def test_bedmes(self): - model = AttrStandardizer(repo_id='databio/attribute-standardizer-model6', model_name='encode') + model = AttrStandardizer( + repo_id="databio/attribute-standardizer-model6", model_name="encode" + ) results = model.standardize(pep="geo/gse228634:default") assert results From 732b5053314a9a3bffce31f888f2cad5fedddcc4 Mon Sep 17 00:00:00 2001 From: saanikat Date: Mon, 7 Oct 2024 12:04:46 -0400 Subject: [PATCH 15/18] README updated --- README.md | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index ac46292..a56e07a 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,15 @@ pip install git+https://github.com/databio/bedms.git ## Usage ### Standardizing based on available schemas + +To choose the schema you want to standardize according to, please refer to the [HuggingFace repository](https://huggingface.co/databio/attribute-standardizer-model6). Based on the schema design `.yaml` files, you can select which schema best represents your attributes. In the example below, we have chosen `encode` schema. + ```python from bedms import AttrStandardizer -model = AttrStandardizer("ENCODE") +model = AttrStandardizer( + repo_id="databio/attribute-standardizer-model6", model_name="encode" +) results = model.standardize(pep="geo/gse228634:default") assert results @@ -33,9 +38,9 @@ Training your custom schema is very easy with `BEDMS`. You would need two things To instantiate `TrainStandardizer` class: ```python -from bedms.train import TrainStandardizer +from bedms.train import AttrStandardizerTrainer -trainer = TrainStandardizer("training_config.yaml") +trainer = AttrStandardizerTrainer("training_config.yaml") ``` To load the datasets and encode them: @@ -63,26 +68,16 @@ trainer.plot_visualizations() ``` ### Standardizing based on custom schema -For standardizing based on custom schema, you would require a `custom_config.yaml`. + +For standardizing based on custom schema, your model should be on HuggingFace. The directory structure should follow the instructions mentioned on [HuggingFace](https://huggingface.co/databio/attribute-standardizer-model6). ```python from bedms import AttrStandardizer -model = AttrStandardizer("CUSTOM", "custom_config.yaml") - +model = AttrStandardizer( + repo_id="name/of/your/hf/repo", model_name="model/name" +) results = model.standardize(pep="geo/gse228634:default") assert results -``` - -### Available schemas -To see the available schemas, you can run: -``` -from bedms.const import AVAILABLE_SCHEMAS -print(AVAILABLE_SCHEMAS) - -# >> ['ENCODE', 'FAIRTRACKS', 'BEDBASE'] - -``` - -AVAILABLE_SCHEMAS is a list of available schemas that you can use to standardize your metadata. 
+``` \ No newline at end of file From dfa1a02008b1e4ac56508559209834237eda2f4f Mon Sep 17 00:00:00 2001 From: saanikat Date: Tue, 15 Oct 2024 11:27:02 -0400 Subject: [PATCH 16/18] reviewer request changes --- README.md | 13 +++-- bedms/train.py | 118 +++++++++++++++++++++++++++++++------------ bedms/utils_train.py | 49 +++++++++++------- training_config.yaml | 1 + 4 files changed, 127 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index a56e07a..a1e6789 100644 --- a/README.md +++ b/README.md @@ -46,27 +46,30 @@ trainer = AttrStandardizerTrainer("training_config.yaml") To load the datasets and encode them: ```python -trainer.load_encode_data() +train_data, val_data, test_data, label_encoder, vectorizer = trainer.load_data() ``` To train the custom model: ```python -trainer.training() +trainer.train() ``` To test the custom model: ```python -trainer.testing() +test_results_dict = trainer.test() ``` To generate visualizations such as Learning Curves, Confusion Matrices, and ROC Curve: ```python -trainer.plot_visualizations() +acc_fig, loss_fig, conf_fig, roc_fig = trainer.plot_visualizations() ``` +Where `acc_fig` is Accuracy Curve figure object, `loss_fig` is Loss Curve figure object, `conf_fig` is the Confusion Matrix figure object, and `roc_fig` is the ROC Curve figure object. + + ### Standardizing based on custom schema For standardizing based on custom schema, your model should be on HuggingFace. The directory structure should follow the instructions mentioned on [HuggingFace](https://huggingface.co/databio/attribute-standardizer-model6). @@ -79,5 +82,5 @@ model = AttrStandardizer( ) results = model.standardize(pep="geo/gse228634:default") -assert results +print(results) #Dictionary of suggested predictions with their confidence: {'attr_1':{'prediction_1': 0.70, 'prediction_2':0.30}} ``` \ No newline at end of file diff --git a/bedms/train.py b/bedms/train.py index b50b9f5..b7a5c77 100644 --- a/bedms/train.py +++ b/bedms/train.py @@ -4,14 +4,19 @@ import torch from torch import nn from torch import optim +from torch.utils.data import DataLoader from sklearn.metrics import ( precision_score, recall_score, f1_score, ) +from sklearn.preprocessing import LabelEncoder +from sklearn.feature_extraction.text import CountVectorizer +import matplotlib.pyplot as plt +from typing import List, Dict, Tuple import yaml from .utils_train import ( - load_from_dir, + load_training_files_from_dir, accumulate_data, training_encoding, data_loader, @@ -41,33 +46,56 @@ def __init__(self, config: str) -> None: :param str config: Path to the config file which has the training parameters provided by the user. 
""" - self.label_encoder = None - self.vectorizer = None - self.train_loader = None - self.val_loader = None - self.test_loader = None - self.output_size = None - self.criterion = None - self.train_accuracies = None - self.val_accuracies = None - self.train_losses = None - self.val_losses = None - self.model = None - self.fpr = None - self.tpr = None - self.roc_auc = None - self.all_labels = None - self.all_preds = None + self.label_encoder: LabelEncoder = None + self.vectorizer: CountVectorizer = None + self.train_loader: DataLoader = None + self.val_loader: DataLoader = None + self.test_loader: DataLoader = None + self.output_size: int = 0 + self.criterion: nn.Module = None + self.train_accuracies: List[float] = [] + self.val_accuracies: List[float] = [] + self.train_losses: List[float] = [] + self.val_losses: List[float] = [] + self.model: BoWSTModel = None + self.fpr: Dict[int, float] = {} + self.tpr: Dict[int, float] = {} + self.roc_auc: Dict[int, float] = {} + self.all_labels: List[int] = [] + self.all_preds: List[int] = [] with open(config, "r") as file: self.config = yaml.safe_load(file) - def load_encode_data(self) -> None: + def load_data( + self, + ) -> Tuple[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + LabelEncoder, + CountVectorizer, + ]: """ Loads and prepares the encoded training, testing and validation datasets. + :return Tuple[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], + LabelEncoder, + CountVectorizer]: A tuple containing: + - training dataset tensor + - validation dataset tensor + - testing dataset tensor + - label encoder + - bag of words vectorizer """ - values_files_list = load_from_dir(self.config["dataset"]["values_dir_pth"]) - headers_files_list = load_from_dir(self.config["dataset"]["headers_dir_pth"]) + values_files_list = load_training_files_from_dir( + self.config["dataset"]["values_dir_pth"] + ) + headers_files_list = load_training_files_from_dir( + self.config["dataset"]["headers_dir_pth"] + ) if len(values_files_list) != len(headers_files_list): logger.error( @@ -149,13 +177,21 @@ def load_encode_data(self) -> None: logger.info("Loading Done.") - def training(self): + return ( + train_encoded_data, + val_encoded_data, + test_encoded_data, + self.label_encoder, + self.vectorizer, + ) + + def train(self) -> None: """ Trains the model. 
""" input_size_values = len(self.vectorizer.vocabulary_) - input_size_values_embeddings = EMBEDDING_SIZE - input_size_headers = EMBEDDING_SIZE + input_size_values_embeddings = self.config["training"]["embedding_size"] + input_size_headers = self.config["training"]["embedding_size"] hidden_size = self.config["model"]["hidden_size"] self.output_size = len(self.label_encoder.classes_) # Number of classes dropout_prob = self.config["model"]["dropout_prob"] @@ -175,7 +211,9 @@ def training(self): optimizer = optim.Adam( self.model.parameters(), lr=learning_rate, weight_decay=l2_reg_lambda ) - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model.to(self.device) + # Training the model num_epochs = self.config["training"]["num_epochs"] @@ -196,7 +234,7 @@ def training(self): self.val_loader, self.criterion, optimizer, - device, + self.device, num_epochs, self.output_size, model_pth, @@ -205,29 +243,41 @@ def training(self): logger.info("Training Done.") - def testing(self): + def test(self) -> Dict[str, float]: """ Model testing. + + :return Dict[str, float]: Precision, Recall, and F1 values """ self.all_preds, self.all_labels = model_testing( - self.model, self.test_loader, self.criterion + self.model, self.device, self.test_loader, self.criterion ) precision = precision_score(self.all_labels, self.all_preds, average="macro") recall = recall_score(self.all_labels, self.all_preds, average="macro") f1 = f1_score(self.all_labels, self.all_preds, average="macro") logger.info(f"Precision:{precision}, Recall: {recall}, F1 Score: {f1}") + return {"precision": precision, "recall": recall, "f1": f1} - def plot_visualizations(self): + def plot_visualizations( + self, + ) -> Tuple[plt.Figure, plt.Figure, plt.Figure, plt.Figure]: """ Generates visualizations for training ( accuracy and loss curves) and testing( confusion matrix, roc curve) + + :return Tuple[plt.Figure, plt.Figure, plt.Figure, plt.Figure]: + A Tuple containing: + - accuracy figure + - loss figure + - confusion matrix figure + - ROC curve figure """ num_epochs = self.config["training"]["num_epochs"] accuracy_fig_pth = self.config["visualization"]["accuracy_fig_pth"] loss_fig_pth = self.config["visualization"]["loss_fig_pth"] cm_pth = self.config["visualization"]["confusion_matrix_fig_pth"] roc_pth = self.config["visualization"]["roc_fig_pth"] - plot_learning_curve( + acc_fig, loss_fig = plot_learning_curve( num_epochs, self.train_accuracies, self.val_accuracies, @@ -236,7 +286,11 @@ def plot_visualizations(self): accuracy_fig_pth, loss_fig_pth, ) - plot_confusion_matrix( + conf_fig = plot_confusion_matrix( self.all_labels, self.all_preds, self.label_encoder.classes_, cm_pth ) - auc_roc_curve(self.fpr, self.tpr, self.roc_auc, self.output_size, roc_pth) + roc_fig = auc_roc_curve( + self.fpr, self.tpr, self.roc_auc, self.output_size, roc_pth + ) + + return acc_fig, loss_fig, conf_fig, roc_fig diff --git a/bedms/utils_train.py b/bedms/utils_train.py index ab64566..9661988 100644 --- a/bedms/utils_train.py +++ b/bedms/utils_train.py @@ -39,7 +39,7 @@ ) -def load_from_dir(dir: str) -> List[str]: +def load_training_files_from_dir(dir: str) -> List[str]: """ Loads each file from the directory path. 
@@ -49,7 +49,7 @@ def load_from_dir(dir: str) -> List[str]: return glob(os.path.join(dir, "*.csv")) -def load_and_preprocess(file_path: str) -> pd.DataFrame: +def load_and_preprocess_files(file_path: str) -> pd.DataFrame: """ Loads and Preprocesses each csv file as a Pandas DataFrame. @@ -84,8 +84,8 @@ def accumulate_data( x_headers_list = [] y_list = [] for values_file, headers_file in files: - df_values = load_and_preprocess(values_file) - df_headers = load_and_preprocess(headers_file) + df_values = load_and_preprocess_files(values_file) + df_headers = load_and_preprocess_files(headers_file) df_values = df_values.fillna("") df_headers = df_headers.fillna("") y = df_values.columns @@ -129,8 +129,8 @@ def get_top_training_cluster_averaged( :return torch.Tensor: A tensor representing the average of embeddings in the most common cluster. """ - flattened_embeddings = [embedding.tolist() for embedding in embeddings] - kmeans = KMeans(n_clusters=num, random_state=0).fit(flattened_embeddings) + embeddings_list = [embedding.tolist() for embedding in embeddings] + kmeans = KMeans(n_clusters=num, random_state=0).fit(embeddings_list) labels_kmeans = kmeans.labels_ cluster_counts = Counter(labels_kmeans) most_common_cluster = max(cluster_counts, key=cluster_counts.get) @@ -198,13 +198,11 @@ def training_encoding( Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], LabelEncoder, - List[str], CountVectorizer]: Returns a tuple of - training dataset tensor - testing dataset tensor - validation dataset tensor - trained label encoder - - list of unique values encountered during training - Trained vectorizer for Bag of Words representation """ @@ -538,7 +536,10 @@ def train_model( def model_testing( - model: torch.nn.Module, test_loader: DataLoader, loss_fn: torch.nn.Module + model: torch.nn.Module, + device: torch.device, + test_loader: DataLoader, + loss_fn: torch.nn.Module, ) -> Tuple[List[int], List[int], torch.Tensor]: """ This functions tests the model. @@ -559,6 +560,10 @@ def model_testing( total_samples_test = 0 with torch.no_grad(): for values_batch, bow_batch, headers_batch, labels in test_loader: + values_batch = values_batch.to(device) + bow_batch = bow_batch.to(device) + headers_batch = headers_batch.to(device) + labels = labels.to(device) outputs = model(values_batch, bow_batch, headers_batch) loss = loss_fn(outputs, labels) total_loss_test += loss.item() @@ -583,7 +588,7 @@ def plot_learning_curve( val_losses: List[float], accuracy_fig_pth: str, loss_fig_pth: str, -) -> None: +) -> Tuple[plt.Figure, plt.Figure]: """ Plots the learning curves - accuracy and loss for Training and Validation of the model. @@ -594,10 +599,14 @@ def plot_learning_curve( :param List[float] val_losses: List of validation losses for each epoch. :param str accuracy_fig_pth: Path where the accuracy curve figure will be saved. :param str loss_fig_pth: Path where the loss curve figure will be saved. 
+ + :return Tuple[plt.Figure, plt.Figure]: Accuracy and Loss curves """ # accuracy - plt.plot(range(1, num_epochs + 1), train_accuracies, label="Training Accuracy") + acc = plt.plot( + range(1, num_epochs + 1), train_accuracies, label="Training Accuracy" + ) plt.plot(range(1, num_epochs + 1), val_accuracies, label="Validation Accuracy") plt.xlabel("Epoch") plt.ylabel("Accuracy") @@ -607,8 +616,9 @@ def plot_learning_curve( plt.savefig(accuracy_fig_pth, format="svg") plt.show() plt.close() + # loss - plt.plot(range(1, num_epochs + 1), train_losses, label="Training Loss") + loss = plt.plot(range(1, num_epochs + 1), train_losses, label="Training Loss") plt.plot(range(1, num_epochs + 1), val_losses, label="Validation Loss") plt.xlabel("Epoch") plt.ylabel("Loss") @@ -618,6 +628,7 @@ def plot_learning_curve( plt.savefig(loss_fig_pth, format="svg") plt.show() plt.close() + return acc, loss def plot_confusion_matrix( @@ -625,7 +636,7 @@ def plot_confusion_matrix( y_pred: List[int], unique_values_list: List[str], confusion_matrix_fig_pth: str, -) -> None: +) -> plt.Figure: """ Plots confusion matrix for the test data. @@ -633,6 +644,8 @@ def plot_confusion_matrix( :param List[int] y_pred: List of predictions made by the model. :param List[str] unique_values_list: List of all the classes that the model predicted. :param str confusion_matrix_fig_pth: Path where the confusion matrix figure will be saved. + + :return plt.Figure: Confusion matrix figure """ conf_matrix = confusion_matrix(y_true, y_pred) plt.figure(figsize=(12, 12)) @@ -653,6 +666,7 @@ def plot_confusion_matrix( class_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1) for i, acc in enumerate(class_accuracy): print(f"Accuracy for class {i}: {acc:.4f}") + return conf_matrix def auc_roc_curve( @@ -661,7 +675,7 @@ def auc_roc_curve( roc_auc: Dict[int, float], output_size: int, roc_fig_pth: str, -) -> None: +) -> plt.Figure: """ Plots the ROC Curve. @@ -670,8 +684,10 @@ def auc_roc_curve( :param Dict[int, float] roc_auc: Dictionary of Area Under Curve for ROC for different classes. :param int output_size: The number of classes the model predicted into. :param str roc_fig_pth: Path to where the ROC figure will be saved. + + :return plt.Figure: Figure for the ROC Curve. 
""" - plt.figure(figsize=(12, 12)) + fig = plt.figure(figsize=(12, 12)) for i in range(output_size): plt.plot( fpr[i], @@ -688,5 +704,4 @@ def auc_roc_curve( plt.title("Receiver Operating Characteristic (ROC) Curve") plt.legend(loc="lower right") plt.savefig(roc_fig_pth, format="svg") - plt.show() - plt.close() + return fig diff --git a/training_config.yaml b/training_config.yaml index 1236709..75910d2 100644 --- a/training_config.yaml +++ b/training_config.yaml @@ -21,6 +21,7 @@ training: vectorizer_pth: "path/to/vectorizer.pkl" #Path to where you want to save the Bag of Words vectorizer label_encoder_pth: "/path/to/label_encoder.pkl" #Path to where you want to save the Label Encoder sentence_transformer_model: "all-MiniLM-L6-v2" #Name of the sentence transformer model you wish to use fro HuggingFace + embedding_size: 384 #Dimensionality of the embedding produced by the chosen sentence transformer bow_drops: 2 #Number of Bag of Words columns you wish to drop out during the training process (Avoids overfitting, can be set to 0) visualization: From c009c70103a9160c98a71c9572128356fe3083d6 Mon Sep 17 00:00:00 2001 From: Oleksandr <41573628+khoroshevskyi@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:10:48 -0500 Subject: [PATCH 17/18] updated version --- bedms/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bedms/_version.py b/bedms/_version.py index 3dc1f76..d3ec452 100644 --- a/bedms/_version.py +++ b/bedms/_version.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.2.0" From fef2ed7bb0c0368f1bf38b87abe96716c7fddd0d Mon Sep 17 00:00:00 2001 From: Oleksandr <41573628+khoroshevskyi@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:11:23 -0500 Subject: [PATCH 18/18] updated changelog --- docs/changelog.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 747b21b..25fa872 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,7 +2,15 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. +## [0.2.0] - 2024-12-03 +### Added +- Added generic way of initialization of all schemas +- Added TrainStandardizer module for custom model creation + +### Fixed +- Fixed Typo in README #23 + ## [0.1.0] - 2024-09-16 ### Added -- initial project release \ No newline at end of file +- initial project release