From 8c3aec082e29b815a194168b3e9da49c17351b27 Mon Sep 17 00:00:00 2001
From: "Lea CHABOT lea-chabot@orange.fr" <lchabot@login01.cluster>
Date: Wed, 3 Jan 2024 16:01:01 +0100
Subject: [PATCH 01/15] arlotta

---
 03_Script/arlotta.R | 269 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 209 insertions(+), 60 deletions(-)

diff --git a/03_Script/arlotta.R b/03_Script/arlotta.R
index de41153..8ca4d32 100755
--- a/03_Script/arlotta.R
+++ b/03_Script/arlotta.R
@@ -3,28 +3,27 @@
 # Paola Arlotta reference :
 # https://www.nature.com/articles/s41586-021-03670-5
 #..............................................................
-
+###########################################################################
 #-------------------------------------
 # Library
 #-------------------------------------
 library(Matrix)
 library(data.table)
+library(Seurat)
 
 #-------------------------------------
-# Path to files / folders
+# Path to files / folderss
 #-------------------------------------
 DIRECTORY = getwd()
 TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]]
 OUTPUTDIR = file.path((DIRECTORY), "05_Output")
 REF = file.path((DIRECTORY), "01_Reference")
 
-PATTERN_TO_KEEP = snakemake@params[["pattern_to_keep"]]
+DEVELOPMENTAL_TIME = snakemake@params[["developmental_time"]]
 SAMPLE_ID = snakemake@params[["sample_id"]]
 
-ARLOTTA_METADATA = snakemake@params[["arlotta_metadata"]]
-ARLOTTA_MATRIX = snakemake@params[["arlotta_matrix"]]
-ARLOTTA_CELLS = snakemake@params[["arlotta_cells"]]
-ARLOTTA_FEATURES = snakemake@params[["arlotta_features"]]
+NORM_METHOD = snakemake@params[["norm_method"]]
+NORM_SCALE_FACTOR = snakemake@params[["norm_scale_factor"]]
 
 OUTPUT_NAME_REF_METADATA = snakemake@params[["output_name_ref_metadata"]]
 OUTPUT_NAME_REF_MATRIX = snakemake@params[["output_name_ref_matrix"]]
@@ -36,7 +35,17 @@ STEP3 = "03_sims/"
 #-------------------------------------
 # Load the metadata of the reference
 #-------------------------------------
-reference_metadata <- fread(file = file.path(REF, ARLOTTA_METADATA))
+reference_metadata <- fread(file = file.path(REF, "arlotta/metaData_scDevSC.txt"))
+reference_metadata <- reference_metadata[-1, ]
+reference_metadata <- reference_metadata[, NEW_NAME := gsub("[^ATCG]", "", NAME)] ### Keep only the A,T,G and C in the original name
+
+#-------------------------------------
+# For the training we need at least
+# two cells per labels
+#-------------------------------------
+occurence <- table(reference_metadata$New_cellType)
+one_cell_per_label <- names(occurence[occurence == 1])
+reference_metadata <- reference_metadata[!(reference_metadata$New_cellType %in% one_cell_per_label), ]
 
 #-------------------------------------
 # Erase some labels in the metadata :
@@ -47,73 +56,85 @@ patterns_to_delete <- c("Low quality cells", "Doublet", "Red blood cells", "grou
 rows_to_delete <- apply(sapply(patterns_to_delete, grepl, reference_metadata$New_cellType), 1, any)
 reference_metadata <- subset(reference_metadata, !rows_to_delete)
 
-########################################## PROPRE AU PROJET ##########################################
 #-------------------------------------
-# Rename some subclass to match the
-# Allen reference
+# Load the matrix of reference 
+# (Lot of developmental time points)
 #-------------------------------------
-reference_metadata$New_cellType[reference_metadata$New_cellType == "UL CPN"] <- "IT"
-reference_metadata$New_cellType[reference_metadata$New_cellType == "Layer 4"] <- "IT"
-reference_metadata$New_cellType[reference_metadata$New_cellType == "DL CPN"] <- "DL CPN"
-reference_metadata$New_cellType[reference_metadata$New_cellType == "SCPN"] <- "L5 PT"
-reference_metadata$New_cellType[reference_metadata$New_cellType == "NP"] <- "L5 NP"
-reference_metadata$New_cellType[reference_metadata$New_cellType == "CThPN"] <- "L6 CT"
-reference_metadata$New_cellType[reference_metadata$New_cellType == "Layer 6b"] <- "L6b"
-reference_metadata$New_cellType[reference_metadata$New_cellType == "Cajal Retzius cells"] <- "CR"
+# Create a list to store al the file path
+list_files <- list(
+E10 <- file.path(REF, "arlotta/GSM5277843_E10_v1_filtered_feature_bc_matrix.h5"),
+E11 <- file.path(REF, "arlotta/GSM4635072_E11_5_filtered_gene_bc_matrices_h5.h5"),
+E12 <- file.path(REF, "arlotta/GSM4635073_E12_5_filtered_gene_bc_matrices_h5.h5"),
+E13 <- file.path(REF, "arlotta/GSM4635074_E13_5_filtered_gene_bc_matrices_h5.h5"),
+E14 <- file.path(REF, "arlotta/GSM4635075_E14_5_filtered_gene_bc_matrices_h5.h5"),
+E15 <- file.path(REF, "arlotta/GSM4635076_E15_5_S1_filtered_gene_bc_matrices_h5.h5"),
+E16 <- file.path(REF, "arlotta/GSM4635077_E16_filtered_gene_bc_matrices_h5.h5"),
+E17 <- file.path(REF, "arlotta/GSM5277844_E17_5_filtered_feature_bc_matrix.h5"),
+E18_S1 <- file.path(REF, "arlotta/GSM4635078_E18_5_S1_filtered_gene_bc_matrices_h5.h5"),
+E18_S3 <- file.path(REF, "arlotta/GSM4635079_E18_S3_filtered_gene_bc_matrices_h5.h5"),
+P1 <- file.path(REF, "arlotta/GSM4635081_P1_S2_filtered_gene_bc_matrices_h5.h5"),
+P1_S1 <- file.path(REF, "arlotta/GSM4635080_P1_S1_filtered_gene_bc_matrices_h5.h5"),
+P4 <- file.path(REF, "arlotta/GSM5277845_P4_filtered_feature_bc_matrix.h5")
+)
 
-######################################################################################################
+list_files <- setNames(list_files, c("E10", "E11", "E12", "E13", "E14", "E15", "E16", "E17", "E18_S1", "E18_S3", "P1", "P1_S1", "P4"))
 
-#-------------------------------------
-# Load the matrix of reference
-#-------------------------------------
-reference_matrix <- readMM(file = file.path(REF, ARLOTTA_MATRIX))
+# file we want to open (depending on what is written in the config.yaml)
+file_to_open <- DEVELOPMENTAL_TIME
 
-feature.names = read.delim(file.path(REF, ARLOTTA_FEATURES),
-                           header = FALSE,
-                           stringsAsFactors = FALSE)
+list_reference_matrix <- list()
 
-cells.names = read.delim(file.path(REF, ARLOTTA_CELLS),
-                           header = FALSE,
-                           stringsAsFactors = FALSE)
+for (x in file_to_open){
+    filepath <- list_files[names(list_files) == x]
+    list_reference_matrix[[x]] <- Read10X_h5(filepath[[x]])
+}
 
-colnames(reference_matrix) = cells.names$V1
-rownames(reference_matrix) = feature.names$V1
-
-# #-------------------------------------
-# # Transpose the matrix :
-# # Cells in row and genes in columns +
-# # Transform rownames ans colnames
-# # as characters (for sims).
-# #-------------------------------------
-reference_matrix <- t(reference_matrix)
-reference_matrix <- as.matrix(reference_matrix)
+#-------------------------------------
+# Normalization + Transpose them
+# (cell in rows) + Transform them
+# in matrix
+#-------------------------------------
+list_reference_matrix <- lapply(list_reference_matrix, function(x){
+    normalized_matrix <- NormalizeData( object = x, normalization.method = NORM_METHOD, scale.factor = NORM_SCALE_FACTOR, verbose = FALSE)
+})
 
-colnames(reference_matrix) <- as.character(colnames(reference_matrix)) # Genes
-rownames(reference_matrix) <- as.character(rownames(reference_matrix)) # Cells
+list_reference_matrix <-  lapply(list_reference_matrix, function(x) as.matrix(x))
+list_reference_matrix <-  lapply(list_reference_matrix, function(x) t(x))
 
 #-------------------------------------
-# Write a new matrix : Keep only
-# mouse age that are interesting
-# for the project
+# Keep only A,T,G and C in every 
+# matrix for the cells name (to match
+# the metadata)
 #-------------------------------------
-pattern_to_keep <- c(PATTERN_TO_KEEP) 
-rows_to_keep <- grepl(paste(pattern_to_keep, collapse = "|"), rownames(reference_matrix))
-reference_matrix <- reference_matrix[rows_to_keep, ]
+clean_rownames <- function(x){
+  current_rownames <- rownames(x)
+  modified_rownames <- gsub("[^ATCG]", "", current_rownames)
+  rownames(x) <- modified_rownames
+  return(x)
+}
+list_reference_matrix <- lapply(list_reference_matrix, clean_rownames)
 
 #-------------------------------------
 # Keep cells that match both the 
 # metadata an matrix of reference
 #-------------------------------------
-column_metadata <- reference_metadata$NAME
-matching_cells <- intersect(rownames(reference_matrix), column_metadata)
-reference_metadata <- reference_metadata[reference_metadata$NAME %in% matching_cells, ]
-reference_matrix <- reference_matrix[matching_cells, ]
+metadata <- data.table()
 
-#-------------------------------------
-# Order metadata according to matrix
-# (Necessary ?)
-#-------------------------------------
-reference_metadata <- reference_metadata[match(rownames(reference_matrix),reference_metadata$NAME),]
+for(x in names(list_reference_matrix)){
+    if(x %in% reference_metadata$orig_ident){
+        rows_metadata <- reference_metadata[orig_ident == x] # Look in the metadata and keep only the rows that corespond to the matrix (Keep P4 rows for the P4 matrix for exemple)
+        cell_metadata <- rows_metadata$NEW_NAME # Cells in the metadata
+        cell_reference_matrix <- rownames(list_reference_matrix[[x]]) # Cells in the metadata
+        matching_cells <- intersect(cell_metadata, cell_reference_matrix) # Look at cells that match
+        temporary_metadata <- subset(rows_metadata, NEW_NAME %in% matching_cells) # Create a temporary metadata for each matrix whith only the cells that match and remove the others
+        temporary_metadata$NEW_NAME <- paste(x, temporary_metadata$NEW_NAME, sep="_") # Add the develomental age in front of cell name (some diferent developmental point have same name for cells, that is why)
+        metadata <- rbind(metadata, temporary_metadata) # Append the for each matrix the new metadata to the final one
+        list_reference_matrix[[x]] <- list_reference_matrix[[x]][matching_cells, ] # Now keep only the mathing cell in the matrix and remove the others
+        rownames(list_reference_matrix[[x]]) <- paste(x, rownames(list_reference_matrix[[x]]), sep = "_") # Add the develomental age in front of cell name (Like this also names still match metadata)
+    }
+}
+
+reference_matrix <- do.call(rbind, list_reference_matrix)
 
 #-------------------------------------
 # Load our matrix to annotate
@@ -142,11 +163,139 @@ write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT
 class(matrix) <- "numeric"
 write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
 
-fwrite(x = reference_metadata, file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_METADATA,".csv")))
+fwrite(x = metadata, file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_METADATA,".csv")))
 
 #-------------------------------------
 # create the output file
 #-------------------------------------
 output_file<-file(TEXT_OUTPUT)
 writeLines(c("Files preparation for the Arlotta reference for SIMS finished (CSV format)"), output_file)
-close(output_file)
\ No newline at end of file
+close(output_file)
+
+######################################################################################################################################################################################################################################################
+# Old code
+
+# #-------------------------------------
+# # Path to files / folders
+# #-------------------------------------
+# DIRECTORY = getwd()
+# TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]]
+# OUTPUTDIR = file.path((DIRECTORY), "05_Output")
+# REF = file.path((DIRECTORY), "01_Reference")
+
+# PATTERN_TO_KEEP = snakemake@params[["pattern_to_keep"]]
+# SAMPLE_ID = snakemake@params[["sample_id"]]
+
+# ARLOTTA_METADATA = snakemake@params[["arlotta_metadata"]]
+# ARLOTTA_MATRIX = snakemake@params[["arlotta_matrix"]]
+# ARLOTTA_CELLS = snakemake@params[["arlotta_cells"]]
+# ARLOTTA_FEATURES = snakemake@params[["arlotta_features"]]
+
+# OUTPUT_NAME_REF_METADATA = snakemake@params[["output_name_ref_metadata"]]
+# OUTPUT_NAME_REF_MATRIX = snakemake@params[["output_name_ref_matrix"]]
+# OUTPUT_NAME_MATRIX = snakemake@params[["output_name_matrix"]]
+
+# STEP2 = "02_seurat/"
+# STEP3 = "03_sims/"
+
+# #-------------------------------------
+# # Load the metadata of the reference
+# #-------------------------------------
+# reference_metadata <- fread(file = file.path(REF, ARLOTTA_METADATA))
+
+# #-------------------------------------
+# # Erase some labels in the metadata :
+# # Low quality cells, Doublet,
+# # Red blood cells, group
+# #-------------------------------------
+# patterns_to_delete <- c("Low quality cells", "Doublet", "Red blood cells", "group") 
+# rows_to_delete <- apply(sapply(patterns_to_delete, grepl, reference_metadata$New_cellType), 1, any)
+# reference_metadata <- subset(reference_metadata, !rows_to_delete)
+
+# #-------------------------------------
+# # Load the matrix of reference
+# #-------------------------------------
+# reference_matrix <- readMM(file = file.path(REF, ARLOTTA_MATRIX))
+
+# feature.names = read.delim(file.path(REF, ARLOTTA_FEATURES),
+#                            header = FALSE,
+#                            stringsAsFactors = FALSE)
+
+# cells.names = read.delim(file.path(REF, ARLOTTA_CELLS),
+#                            header = FALSE,
+#                            stringsAsFactors = FALSE)
+
+# colnames(reference_matrix) = cells.names$V1
+# rownames(reference_matrix) = feature.names$V1
+
+# # #-------------------------------------
+# # # Transpose the matrix :
+# # # Cells in row and genes in columns +
+# # # Transform rownames and colnames
+# # # as characters (for sims).
+# # #-------------------------------------
+# reference_matrix <- t(reference_matrix)
+# reference_matrix <- as.matrix(reference_matrix)
+
+# colnames(reference_matrix) <- as.character(colnames(reference_matrix)) # Genes
+# rownames(reference_matrix) <- as.character(rownames(reference_matrix)) # Cells
+
+# #-------------------------------------
+# # Write a new matrix : Keep only
+# # mouse age that are interesting
+# # for the project
+# #-------------------------------------
+# pattern_to_keep <- c(PATTERN_TO_KEEP) 
+# rows_to_keep <- grepl(paste(pattern_to_keep, collapse = "|"), rownames(reference_matrix))
+# reference_matrix <- reference_matrix[rows_to_keep, ]
+
+# #-------------------------------------
+# # Keep cells that match both the 
+# # metadata an matrix of reference
+# #-------------------------------------
+# column_metadata <- reference_metadata$NAME
+# matching_cells <- intersect(rownames(reference_matrix), column_metadata)
+# reference_metadata <- reference_metadata[reference_metadata$NAME %in% matching_cells, ]
+# reference_matrix <- reference_matrix[matching_cells, ]
+
+# #-------------------------------------
+# # Order metadata according to matrix
+# # (Necessary ?)
+# #-------------------------------------
+# reference_metadata <- reference_metadata[match(rownames(reference_matrix),reference_metadata$NAME),]
+
+# #-------------------------------------
+# # Load our matrix to annotate
+# #-------------------------------------
+# matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv"))) # The normalized matrix is used for this reference.
+# matrix <- as.matrix(matrix)
+# rownames(matrix) <- matrix[,"V1"]
+
+# #-------------------------------------
+# # Look at genes who are common in both
+# # matrix and subset genes who are not
+# #-------------------------------------
+# gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix))
+# reference_matrix <- reference_matrix[, gene_intersection]
+# matrix <- matrix[, gene_intersection]
+
+# #-------------------------------------
+# # Wtrite the two new matrix +
+# # create directory
+# #-------------------------------------
+# dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
+
+# class(reference_matrix) <- "numeric"
+# write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_MATRIX,".csv")))
+
+# class(matrix) <- "numeric"
+# write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
+
+# fwrite(x = reference_metadata, file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_METADATA,".csv")))
+
+# #-------------------------------------
+# # create the output file
+# #-------------------------------------
+# output_file<-file(TEXT_OUTPUT)
+# writeLines(c("Files preparation for the Arlotta reference for SIMS finished (CSV format)"), output_file)
+# close(output_file)
\ No newline at end of file

From 8391f19204082a354040b8c2c85ca3c962a24a6f Mon Sep 17 00:00:00 2001
From: "Lea CHABOT lea-chabot@orange.fr" <lchabot@login01.cluster>
Date: Wed, 3 Jan 2024 16:09:34 +0100
Subject: [PATCH 02/15] preparation_sims

---
 02_Container/preparation_sims.yaml | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/02_Container/preparation_sims.yaml b/02_Container/preparation_sims.yaml
index a8d356a..48dd025 100644
--- a/02_Container/preparation_sims.yaml
+++ b/02_Container/preparation_sims.yaml
@@ -4,20 +4,11 @@ channels :
     - conda-forge
     - defaults
 dependencies :
-    - r-base =4.3.0
-    - r-data.table
-    - r-matrix
-    # - conda-forge::anndata = 0.9.2
-    # - conda-forge::pandas = 2.1.1
-    - r-biocmanager = 1.30.22
-    - r-magick
-    - bioconductor-scran
-    - r-igraph
-    - bioconductor-scater =1.28.0
-    # - bioconductor-biostrings
-    # - bioconductor-keggrest
-    # - bioconductor-annotationdbi
-    # - bioconductor-annotationhub
-    - r-reshape2 =1.4.4
-
-    
\ No newline at end of file
+    - r-data.table = 1.14.2
+    - conda-forge::r-matrix = 1.5.4.1
+    - conda-forge::anndata = 0.9.2
+    - conda-forge::pandas = 2.1.1
+    - r-seurat = 4.0.3
+    - r-dplyr = 1.0.9
+    - conda-forge::r-hdf5r = 1.3.8
+    - bioconductor-rhdf5 = 2.38.1
\ No newline at end of file

From c110f4a2f220476e7457f631ad5e71ebc34edb8c Mon Sep 17 00:00:00 2001
From: "Lea CHABOT lea-chabot@orange.fr" <lchabot@login01.cluster>
Date: Wed, 3 Jan 2024 16:33:18 +0100
Subject: [PATCH 03/15] test

---
 02_Container/preparation_sims.yaml | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/02_Container/preparation_sims.yaml b/02_Container/preparation_sims.yaml
index 48dd025..1544613 100644
--- a/02_Container/preparation_sims.yaml
+++ b/02_Container/preparation_sims.yaml
@@ -1,14 +1,17 @@
-channels :
-    - r
-    - bioconda
-    - conda-forge
     - defaults
 dependencies :
-    - r-data.table = 1.14.2
-    - conda-forge::r-matrix = 1.5.4.1
-    - conda-forge::anndata = 0.9.2
-    - conda-forge::pandas = 2.1.1
-    - r-seurat = 4.0.3
-    - r-dplyr = 1.0.9
-    - conda-forge::r-hdf5r = 1.3.8
-    - bioconductor-rhdf5 = 2.38.1
\ No newline at end of file
+    - r-base =4.3.0
+    - r-data.table
+    - r-matrix
+    # - conda-forge::anndata = 0.9.2
+    # - conda-forge::pandas = 2.1.1
+    - r-biocmanager = 1.30.22
+    - r-magick
+    - bioconductor-scran
+    - r-igraph
+    - bioconductor-scater =1.28.0
+    # - bioconductor-biostrings
+    # - bioconductor-keggrest
+    # - bioconductor-annotationdbi
+    # - bioconductor-annotationhub
+    - r-reshape2 =1.4.4
\ No newline at end of file

From ddc8e791d8e64162675d8d258a14a6e602534003 Mon Sep 17 00:00:00 2001
From: "Lea CHABOT lea-chabot@orange.fr" <lchabot@login01.cluster>
Date: Wed, 3 Jan 2024 16:49:02 +0100
Subject: [PATCH 04/15] Container knnor

---
 02_Container/knnor.yaml | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 02_Container/knnor.yaml

diff --git a/02_Container/knnor.yaml b/02_Container/knnor.yaml
new file mode 100644
index 0000000..556eb30
--- /dev/null
+++ b/02_Container/knnor.yaml
@@ -0,0 +1,12 @@
+channels :
+    - main
+    - defaults
+    - bioconda
+    - conda-forge
+dependencies :
+    - python = 3.9
+    - pandas
+    - numpy
+    - pip
+    - pip :
+        - knnor

From 79a49914c211a0d0425b9636f3b9fa68e8025ea5 Mon Sep 17 00:00:00 2001
From: "Lea CHABOT lea-chabot@orange.fr" <lchabot@login01.cluster>
Date: Wed, 3 Jan 2024 16:50:59 +0100
Subject: [PATCH 05/15] Scripts + gitignore

---
 .gitignore                           |   5 +-
 03_Script/allen_mouse_whole_cortex.R |  73 ++++++++--
 03_Script/arlotta.R                  |   2 +-
 03_Script/imbalanced_datasets.py     | 200 +++++++++++++++++++++++++++
 03_Script/sims.py                    | 116 ++++++++--------
 03_Script/to_anndata_file.py         |  21 +--
 6 files changed, 327 insertions(+), 90 deletions(-)
 create mode 100644 03_Script/imbalanced_datasets.py

diff --git a/.gitignore b/.gitignore
index feb3cad..adbbf72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,6 @@
 02_Container/cellranger.sif
 02_Container/mambaforge:23.1.0-1.sif
 05_Output
-05_Output_june
 .vscode/
 .bash_history/
 .cache/
@@ -18,4 +17,6 @@ test
 S004647_to_S004650_unoptimized_ref/
 S004647_to_S004650/
 __pycache__
-
+.netrc
+*.errs
+*.out
diff --git a/03_Script/allen_mouse_whole_cortex.R b/03_Script/allen_mouse_whole_cortex.R
index f455df2..4eac2e5 100644
--- a/03_Script/allen_mouse_whole_cortex.R
+++ b/03_Script/allen_mouse_whole_cortex.R
@@ -8,21 +8,24 @@
 #-------------------------------------
 library(rhdf5)
 library(data.table)
+library(Seurat)
+library(dplyr)
 
 #-------------------------------------
 # Path to files / folders
 #-------------------------------------
 DIRECTORY = getwd()
 TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]]
-OUTPUTDIR = file.path(dirname(DIRECTORY), "05_Output")
-REF = file.path(dirname(DIRECTORY), "01_Reference")
+OUTPUTDIR = file.path((DIRECTORY), "05_Output")
+REF = file.path((DIRECTORY), "01_Reference")
 
 SAMPLE_ID = snakemake@params[["sample_id"]]
 
+NORM_METHOD = snakemake@params[["norm_method"]]
+NORM_SCALE_FACTOR = snakemake@params[["norm_scale_factor"]]
+
 ALLEN_METADATA = snakemake@params[["allen_metadata"]]
 ALLEN_MATRIX = snakemake@params[["allen_matrix"]]
-ALLEN_GENES = snakemake@params[["allen_genes"]]
-ALLEN_CELLS =  snakemake@params[["allen_cells"]]
 
 OUTPUT_NAME_REF_METADATA = snakemake@params[["output_name_ref_metadata"]]
 OUTPUT_NAME_REF_MATRIX = snakemake@params[["output_name_ref_matrix"]]
@@ -35,22 +38,36 @@ STEP3 = "03_sims/"
 # Load the metadata of the reference
 #-------------------------------------
 reference_metadata <- fread(file = file.path(REF, ALLEN_METADATA))
+print("Reference metadata loaded. It's size is :")
+print(dim(reference_metadata))
 
 ########################################## PROPRE AU PROJET ##########################################
 #-------------------------------------
 # Keep only some part of the cortex
 # we want to study: Ssp
 #-------------------------------------
-reference_metadata <- reference_metadata[reference_metadata$region_label %in% "Ssp",]
+reference_metadata <- reference_metadata[reference_metadata$region_label %in% "SSp",]
+print("Only SSp are kept in the metadata: The new size of this one is :")
+print(dim(reference_metadata))
 
 ######################################################################################################
 
+#-------------------------------------
+# For the training we need at least
+# two cells per labels
+#-------------------------------------
+occurence <- table(reference_metadata$subclass_label)
+one_cell_per_label <- names(occurence[occurence == 1])
+reference_metadata <- reference_metadata[!(reference_metadata$subclass_label %in% one_cell_per_label), ]
+print("The new size after removing labels thaht only appear once :")
+print(dim(reference_metadata))
+
 #-------------------------------------
 # Load the matrix of reference
 #-------------------------------------
 reference_matrix <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/counts")
-reference_genes <- h5read(file = file.path(REF, ALLEN_GENES), "/data/gene")
-reference_cells <- h5read(file = file.path(REF, ALLEN_CELLS), "/data/samples")
+reference_genes <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/gene")
+reference_cells <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/samples")
 
 rownames(reference_matrix) <- as.character(reference_cells)
 colnames(reference_matrix) <- as.character(reference_genes)
@@ -58,34 +75,56 @@ colnames(reference_matrix) <- as.character(reference_genes)
 rm(list = c("reference_genes","reference_cells"))
 gc()
 
+print("Reference matrix loaded. The size is :")
+print(dim(reference_matrix))
+
 #-------------------------------------
-# Keep cells thaht match both the 
+# Keep cells that match both the 
 # metadata an matrix of reference
 #-------------------------------------
 reference_matrix <- reference_matrix[rownames(reference_matrix) %in% reference_metadata$sample_name,]
 reference_metadata <- reference_metadata[reference_metadata$sample_name %in% rownames(reference_matrix),]
 
+print("The metadata and matrix of reference have the same cells now : The new size of them tow :")
+print(dim(reference_matrix))
+print(dim(reference_metadata))
+
 #-------------------------------------
 # Order metadata according to matrix
 # (Necessary ?)
 #-------------------------------------
 reference_metadata <- reference_metadata[match(rownames(reference_matrix),reference_metadata$sample_name),]
+print("Cells are in the same order in booth files")
 
 #-------------------------------------
-# Wtrite the the metadata as csv   
+# Write the the metadata as csv   
 #-------------------------------------
+dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
+
 fwrite(x = reference_metadata, file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_METADATA, ".csv")))
 
 rm("reference_metadata")
 gc()
 
+print("Metadata is written")
+#-------------------------------------
+# Normalization of the matrix of
+# reference
+#-------------------------------------
+reference_matrix <-  NormalizeData( object = reference_matrix, normalization.method = NORM_METHOD, scale.factor = NORM_SCALE_FACTOR, verbose = FALSE)
+print("The matrix of reference is now normalized the same way as our matrix to analyze: Here the first 20 rows and 10 columns :")
+print(reference_matrix[1:20, 1:10])
+
 #-------------------------------------
 # Load our matrix to annotate
 #-------------------------------------
-matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_count_matrix.csv"))) # The count matrix is used for this reference.
+matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv")))
 matrix <- as.matrix(matrix)
 rownames(matrix) <- matrix[,"V1"]
 
+print("Our matrix to annotate is loaded, see the dimension below :")
+print(dim(matrix))
+
 #-------------------------------------
 # Look at genes who are common in both
 # matrix and subset genes who are nots
@@ -94,19 +133,25 @@ gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix))
 reference_matrix <- reference_matrix[, gene_intersection]
 matrix <- matrix[, gene_intersection]
 
+print("Only the gene in common in booth our matrix are kept : The new dimmension :")
+print(dim(reference_matrix))
+print(dim(matrix))
+
 #-------------------------------------
 # Wtrite the two new matrix
 #-------------------------------------
-dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
-
 class(matrix) <- "numeric"
 write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
+print("Our matrix is written")
 
 rm("matrix")
 gc()
 
-class(reference_matrix) <- "numeric"
+# reference_matrix <- as.matrix(reference_matrix)
+# rownames(reference_matrix) <- reference_matrix[,"sample_name"]
+# class(reference_matrix) <- "numeric"
 write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_MATRIX,".csv")))
+print("Our matrix of reference is written")
 
 rm("reference_matrix")
 gc()
@@ -115,5 +160,5 @@ gc()
 # create the output file
 #-------------------------------------
 output_file<-file(TEXT_OUTPUT)
-writeLines(c("Files preparation for the Arlotta reference for SIMS finished (CSV format)"), output_file)
+writeLines(c("Files preparation for the Allen reference for SIMS finished (CSV format)"), output_file)
 close(output_file)
diff --git a/03_Script/arlotta.R b/03_Script/arlotta.R
index 8ca4d32..8874d8c 100755
--- a/03_Script/arlotta.R
+++ b/03_Script/arlotta.R
@@ -3,7 +3,7 @@
 # Paola Arlotta reference :
 # https://www.nature.com/articles/s41586-021-03670-5
 #..............................................................
-###########################################################################
+
 #-------------------------------------
 # Library
 #-------------------------------------
diff --git a/03_Script/imbalanced_datasets.py b/03_Script/imbalanced_datasets.py
new file mode 100644
index 0000000..71d5b78
--- /dev/null
+++ b/03_Script/imbalanced_datasets.py
@@ -0,0 +1,200 @@
+#..............................................................
+# KNNOR : An oversampling technique for imalanced datasets
+#..............................................................
+
+#-------------------------------------
+# Import
+#-------------------------------------
+from knnor import data_augment
+from sklearn.preprocessing import LabelEncoder
+import pandas as pd
+import numpy as np
+import sys
+import os
+
+#-------------------------------------
+# Path to files / folders
+#-------------------------------------
+DIRECTORY = os.getcwd()
+OUTPUTDIR = os.path.join(DIRECTORY, "05_Output")
+
+SAMPLE_ID = sys.argv[1]
+MATRIX = sys.argv[2]
+METADATA = sys.argv[3]
+CLASS_LABEL = sys.argv[4]
+MAX_OVERSAMPLING = sys.argv[5]
+CELLS_COLUMN = sys.argv[6]
+
+STEP3 = "03_sims/"
+
+#-------------------------------------
+# Open metadata and matrix 
+#-------------------------------------
+print("****************************************************************************************************************************************************************************************")
+matrix = pd.read_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".csv"))
+print("The original matrix :")
+print(matrix)
+print("")
+
+metadata = pd.read_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, METADATA + ".csv"))
+print("The original metadata :")
+print(metadata)
+print("")
+
+#-------------------------------------
+# Occurence of labels in metadata
+#-------------------------------------
+max_labels = metadata[CLASS_LABEL].value_counts()
+print("The data will be increased to the level of the largest occurrence (see table below): ")
+print(max_labels)
+print("")
+
+#-------------------------------------
+# Supress the top occurences in
+# metadata if needed !
+#-------------------------------------
+MAX_OVERSAMPLING = MAX_OVERSAMPLING.split(',')
+MAX_OVERSAMPLING = '|'.join(MAX_OVERSAMPLING)
+
+new_metadata = metadata[~metadata[CLASS_LABEL].str.contains(MAX_OVERSAMPLING)]
+print("The new metadata after supression of top occurence(s) :")
+print(new_metadata)
+print("")
+
+#-------------------------------------
+# Delete cells in matrix that are not
+# anymore in the metadata
+#-------------------------------------
+new_matrix = matrix[matrix.iloc[:, 0].isin(new_metadata[CELLS_COLUMN])]
+print("The new matrix that match the new metadata :")
+print(new_matrix)
+print("")
+
+#-------------------------------------
+# Check if cells are in the same order
+# in booth file
+#-------------------------------------
+are_identical = new_matrix.iloc[:, 0].equals(new_metadata[CELLS_COLUMN])
+
+if are_identical:
+    print("Column are in the same order")
+else:
+    print("Column are not in the same order : it will be reorder") ### A faire !!!
+
+
+#-------------------------------------
+# Add the label column from the 
+# metadata at the end of the matrix
+#-------------------------------------
+new_matrix['Label'] = new_metadata[CLASS_LABEL].tolist()
+
+#-------------------------------------
+# Put the label into numeric class
+#-------------------------------------
+label_encoder = LabelEncoder()
+new_matrix['Label'] = label_encoder.fit_transform(new_matrix['Label'])
+
+# Create an asociate dictionary
+label_dictionnary = {index: label for index, label in enumerate(label_encoder.classes_)}
+print("Labels dictionnary : ")
+print(label_dictionnary)
+print("")
+
+# Put the first column as rownames
+new_matrix = new_matrix.set_index(new_matrix.columns[0])
+new_matrix.index.name = None
+
+#-------------------------------------
+# Convert the matrix to a numpy array
+#-------------------------------------
+new_matrix = np.array(new_matrix)
+
+# X : the matrix, y : the labels (numeric values)
+X = new_matrix[:,:-1]
+y = new_matrix[:,-1]
+
+# -------------------------------------
+# Use KNNOR
+# -------------------------------------
+knnor = data_augment.KNNOR()
+
+# X_new : complete data with augmented datapoints
+# y_new : Labels including the augmented ones
+# X_aug_min : Just the augmented minority points
+# y_aug_min : Labels for only augmented minority points
+X_new, y_new, X_aug_min, y_aug_min = knnor.fit_resample(X, y)
+y_new = y_new.reshape(-1,1)
+
+# new_data contains the augmented data (Matrix + numeric labels )
+new_data = np.append(X_new, y_new, axis=1)
+
+# Transform value numeric back to the labels name
+y_aug_min = np.array([label_dictionnary[label] for label in y_aug_min])
+
+# Look at label augmentation
+print("Labels augmentation : ")
+unique_values, counts = np.unique(y_aug_min, return_counts=True)
+for value, count in zip(unique_values, counts):
+    print(f"{value} is repeated {count} times.")
+print("")
+
+# -------------------------------------
+# Create final metadata
+# -------------------------------------
+# Create a dictionary to store new data
+metadata_knnor = {
+    CELLS_COLUMN: ["KNNOR" + str(i) for i in range(1, len(y_aug_min) + 1)],
+    CLASS_LABEL: list(y_aug_min)
+}
+
+# Create a DataFrame from the dictionary
+metadata_knnor = pd.DataFrame(metadata_knnor)
+print("The knnor metadata only with augmented points :")
+print(metadata_knnor)
+print("")
+
+# Add the new DataFrame to the metadata
+metadata = pd.concat([metadata, metadata_knnor], ignore_index=True)
+
+# Replace all NaN values with -100
+metadata.fillna(-100, inplace = True)
+print("The metadata with augmented values :")
+print(metadata)
+print("")
+
+# Occurence of labels with new data created
+max_labels = metadata[CLASS_LABEL].value_counts()
+print("Occurence of labels with new data created: ")
+print(max_labels)
+print("")
+
+# Write the new metadata with augmented values
+metadata.to_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, "KNNOR_" + METADATA + ".csv"), index = False)
+
+# -------------------------------------
+# Create final matrix
+# -------------------------------------
+# Recover new cell names created for new data
+cell_names = list(metadata_knnor[CELLS_COLUMN])
+
+# Add a new column (first column) to the numpy array (for the concat step at the end)
+num_rows = X_aug_min.shape[0]
+empty_column = np.zeros((num_rows, 1), dtype=int)
+X_aug_min = np.hstack((empty_column, X_aug_min))
+
+# Transform the numpy matrix with augmented data to a DataFrame
+X_aug_min = pd.DataFrame(X_aug_min, columns = matrix.columns)
+
+# Add the cell names to the first column
+X_aug_min.iloc[:, 0] = cell_names
+print("The knnor matrix only with augmented points :")
+print(X_aug_min)
+print("")
+
+# Add the augmented data matrix to our matrix 
+matrix = pd.concat([matrix, X_aug_min], ignore_index=True)
+print("The matrix with augmented values :")
+print(matrix)
+
+# Write the new matrix with augmented data
+matrix.to_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, "KNNOR_" + MATRIX + ".csv"), index = False)
\ No newline at end of file
diff --git a/03_Script/sims.py b/03_Script/sims.py
index eb381bf..51d54c1 100644
--- a/03_Script/sims.py
+++ b/03_Script/sims.py
@@ -1,5 +1,7 @@
 #..............................................................
-# 
+# SIMS : Scalable, Interpretable Machine Learning for 
+# Single-Cell : A machine learning tool for scRNA-Seq label
+# transfer in neuroscience
 #..............................................................
 
 #-------------------------------------
@@ -11,6 +13,7 @@
 import os
 import sys
 import anndata as an
+import wandb
 
 #-------------------------------------
 # Path to files / folders
@@ -28,72 +31,75 @@
 PATIENCE = sys.argv[7]
 MAX_EPOCH = sys.argv[8]
 MATRIX = sys.argv[9]
+KEY = sys.argv[10]
 
 STEP3 = "03_sims/"
 
-# #-----------------------------------------------------------------------------------------------
-# #                                       To train the model 
-# #-----------------------------------------------------------------------------------------------
+#-----------------------------------------------------------------------------------------------
+#                                       To train the model 
+#-----------------------------------------------------------------------------------------------
 
-# #-------------------------------------
-# # Logger : Allows you to visualize 
-# # data training.
-# #-------------------------------------
-# logger = WandbLogger(project = PROJECT_NAME, name = PROJECT_NAME, save_dir = os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID), version = "")
+# -------------------------------------
+# Logger : Allows you to visualize 
+# data training.
+# -------------------------------------
+wandb.login(key = KEY)
 
-# #-------------------------------------
-# # Load the anndata file
-# #-------------------------------------
-# reference_matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, REFERENCE_MATRIX + "_join_metadata.h5ad"))
+logger = WandbLogger(project = PROJECT_NAME, name = PROJECT_NAME, save_dir = os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID), version = "")
 
-# #-------------------------------------
-# # Custom training jobs
-# #-------------------------------------
-# sims = SIMS(data = reference_matrix, class_label = CLASS_LABEL, num_workers = 12, stratify = True) 
-
-# # weighting loss inversely proportional by label freq, helps learn rare cell types (recommended)
-# sims.setup_model(n_a=64, n_d=64, weights=sims.weights)
-
-# sims.setup_trainer(
-#     logger = logger,
-#     callbacks = [
-#         EarlyStopping(
-#             monitor = MONITOR,
-#             patience = 25,
-#             verbose = 1
-#         ),
-#         LearningRateMonitor(logging_interval = "epoch"),
-#         ModelCheckpoint(
-#             dirpath = os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID),
-#             filename = "checkpoint"
-#         ),
-#     ],
-#     max_epochs = 100,
-#     devices = 1
-# )
-
-# sims.train()
+#-------------------------------------
+# Load the anndata file
+#-------------------------------------
+reference_matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, REFERENCE_MATRIX + "_join_metadata.h5ad"))
+
+#-------------------------------------
+# Custom training jobs
+#-------------------------------------
+sims = SIMS(data = reference_matrix, class_label = CLASS_LABEL, num_workers = int(NUM_WORKERS), stratify = True) 
+
+# weighting loss inversely proportional by label freq, helps learn rare cell types (recommended)
+sims.setup_model(n_a=64, n_d=64, weights=sims.weights)
+
+sims.setup_trainer(
+    logger = logger,
+    callbacks = [
+        EarlyStopping(
+            monitor = MONITOR,
+            patience = int(PATIENCE),
+            verbose = 1,
+            # mode = 'max'
+        ),
+        LearningRateMonitor(logging_interval = "epoch"),
+        ModelCheckpoint(
+            dirpath = os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID),
+            # filename = "checkpoint_" + PROJECT_NAME + ".ckpt"
+        ),
+    ],
+    max_epochs = int(MAX_EPOCH)
+)
+
+sims.train()
 
 #-----------------------------------------------------------------------------------------------
 #                                To predict labels on our data 
 #-----------------------------------------------------------------------------------------------
 
-#-------------------------------------
-# Load the model
-#-------------------------------------
-model = os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, "checkpoint-v1.ckpt")
+# #-------------------------------------
+# # Load the model
+# #-------------------------------------
+# model = os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, "epoch=129-step=88010.ckpt")
 
-#-------------------------------------
-# Prediction
-#-------------------------------------
-sims = SIMS(weights_path = model)
-cell_prediction = sims.predict(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".h5ad"))
+# #-------------------------------------
+# # Prediction
+# #-------------------------------------
+# sims = SIMS(weights_path = model)
+# cell_prediction = sims.predict(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".h5ad"))
 
-#-------------------------------------
-# Write prediction in a csv file
-#-------------------------------------
-matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".h5ad"))
+# #-------------------------------------
+# # Write prediction in a csv file
+# #-------------------------------------
+# matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".h5ad"))
 
-cell_prediction.insert(0,"Cells", matrix.obs_names)
+# cell_prediction.insert(0,"Cells", matrix.obs_names)
 
-cell_prediction.to_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + "_prediction.csv"), index = False)
\ No newline at end of file
+# cell_prediction.to_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + "_prediction.csv"), index = False)
\ No newline at end of file
diff --git a/03_Script/to_anndata_file.py b/03_Script/to_anndata_file.py
index 2b1efc1..e277d61 100644
--- a/03_Script/to_anndata_file.py
+++ b/03_Script/to_anndata_file.py
@@ -21,12 +21,14 @@
 OUTPUT_NAME_REF_MATRIX = sys.argv[2]
 OUTPUT_NAME_MATRIX = sys.argv[3]
 OUTPUT_NAME_REF_METADATA = sys.argv[4]
+CELLS_COLUMN = sys.argv[5]
 
 STEP3 = "03_sims/"
 
 #-------------------------------------
 # Load reference matrix
 #-------------------------------------
+
 reference_matrix = an.read_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_MATRIX + ".csv"))
 
 #-------------------------------------
@@ -48,7 +50,7 @@
 # Write matrix as anndata with
 # metadata inside
 #-------------------------------------
-reference_metadata = pd.read_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_METADATA + ".csv"), index_col = "NAME")
+reference_metadata = pd.read_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_METADATA + ".csv"), index_col = CELLS_COLUMN )
 anndata_ref_matrix.obs = pd.concat([anndata_ref_matrix.obs, reference_metadata], axis=1, join='inner')
 
 anndata_ref_matrix.write_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_MATRIX + "_join_metadata.h5ad"))
@@ -62,23 +64,6 @@
 print(anndata_ref_matrix.obs_names)
 print("-------------------------------------------------------------------")
 
-cell_name_to_observe = "P4_AAACCTGAGATCGATA-1"
-
-cell_data = anndata_ref_matrix.obs.loc[cell_name_to_observe]
-print("")
-print("")
-print("-------------------------------------------------------------------")
-print(cell_data)
-
-
-gene_name_to_observe = "mt-Cytb"
-
-gene_data = anndata_ref_matrix.var.loc[gene_name_to_observe ]
-print("")
-print("")
-print("-------------------------------------------------------------------")
-print(gene_data.head(5))
-
 #-------------------------------------
 # Load the matrix to annotate
 #-------------------------------------

From fd7f172a2fd271b46373b3743a83fc55bf798c6e Mon Sep 17 00:00:00 2001
From: "Lea CHABOT lea-chabot@orange.fr" <lchabot@login01.cluster>
Date: Wed, 3 Jan 2024 17:12:38 +0100
Subject: [PATCH 06/15] test

---
 02_Container/preparation_sims.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/02_Container/preparation_sims.yaml b/02_Container/preparation_sims.yaml
index 1544613..e86cc7e 100644
--- a/02_Container/preparation_sims.yaml
+++ b/02_Container/preparation_sims.yaml
@@ -14,4 +14,4 @@ dependencies :
     # - bioconductor-keggrest
     # - bioconductor-annotationdbi
     # - bioconductor-annotationhub
-    - r-reshape2 =1.4.4
\ No newline at end of file
+    - r-reshape2 =1.4.4

From fddbb1b74bab105c40a8bde5562eb1388c64b33f Mon Sep 17 00:00:00 2001
From: "Lea CHABOT lea-chabot@orange.fr" <lchabot@login01.cluster>
Date: Wed, 3 Jan 2024 17:42:10 +0100
Subject: [PATCH 07/15] commit

---
 04_Workflow/SIMS.smk              |  9 ++++----
 04_Workflow/knnor.smk             | 38 +++++++++++++++++++++++++++++++
 04_Workflow/prepare_data_sims.smk | 25 ++++++++++----------
 Snakefile                         | 18 +++++++++++----
 run_slurm.sh                      | 21 ++++++++++-------
 5 files changed, 81 insertions(+), 30 deletions(-)
 create mode 100644 04_Workflow/knnor.smk

diff --git a/04_Workflow/SIMS.smk b/04_Workflow/SIMS.smk
index 4597f3e..7decb73 100644
--- a/04_Workflow/SIMS.smk
+++ b/04_Workflow/SIMS.smk
@@ -17,11 +17,12 @@ rule sims:
         reference_matrix = config["reference_sims"]["output_name_ref_matrix"],
         project_name = config["sims"]["project_name"],
         class_label = config["sims"]["class_label"],
-        num_workers = int(config["sims"]["num_workers"]),
+        num_workers = config["sims"]["num_workers"],
         monitor = config["sims"]["monitor"],
-        patience = int(config["sims"]["patience"]),
-        max_epoch = int(config["sims"]["max_epoch"]),
+        patience = config["sims"]["patience"],
+        max_epoch = config["sims"]["max_epoch"],
         matrix = config["reference_sims"]["output_name_matrix"],
+        key = config["sims"]["key"],
 
     message:
         "Run SIMS for cell annotation"
@@ -34,7 +35,7 @@ rule sims:
         else
             pip install --use-pep517 git+https://github.com/braingeneers/SIMS.git
             python 03_Script/sims.py {params.sample_id} {params.reference_matrix} {params.project_name} {params.class_label} \
-            {params.num_workers} {params.monitor} {params.patience} {params.max_epoch} {params.matrix}
+            {params.num_workers} {params.monitor} {params.patience} {params.max_epoch} {params.matrix} {params.key}
             touch {output.sims_output}
         fi
         """
diff --git a/04_Workflow/knnor.smk b/04_Workflow/knnor.smk
new file mode 100644
index 0000000..0620545
--- /dev/null
+++ b/04_Workflow/knnor.smk
@@ -0,0 +1,38 @@
+#----------------------------------------
+# KNNOR : K-Nearest Neighbor
+# OveRsampliing method : For imbalanced
+# datasets
+#----------------------------------------
+rule knnor:
+    input: 
+        data_for_sims_output = expand(OUTPUTDIR + "03_sims/data_for_sims_output.txt"),
+    
+    output:
+        knnor_output = expand(OUTPUTDIR +"03_sims/knnor_output.txt"),
+
+    params:
+        knnor_rule = config["rules"]["knnor_rule"],
+        sample_id = config["reference_sims"]["sample_id"],
+        matrix = config["reference_sims"]["output_name_ref_matrix"],
+        metadata = config["reference_sims"]["output_name_ref_metadata"],
+        class_label = config["knnor"]["class_label"],
+        max_oversampling = config["knnor"]["max_oversampling"],
+        cells_column = config["reference_sims"]["cells_column"],
+    
+    conda: 
+        CONTAINER + "knnor.yaml"
+
+    message:
+        "Oversampling data with KNNOR"
+
+    shell:
+        """
+        if [ {params.knnor_rule} = "FALSE" ]
+        then
+            touch {output.knnor_output}
+        else
+            
+            python 03_Script/imbalenced_datasets.py {params.sample_id} {params.matrix} {params.metadata} {params.class_label} "{params.max_oversampling}" {params.cells_column}
+            touch {output.knnor_output}
+        fi
+        """
\ No newline at end of file
diff --git a/04_Workflow/prepare_data_sims.smk b/04_Workflow/prepare_data_sims.smk
index 0c2f8bc..051dc73 100644
--- a/04_Workflow/prepare_data_sims.smk
+++ b/04_Workflow/prepare_data_sims.smk
@@ -11,25 +11,23 @@ rule data_for_sims:
 
     params:
         # sims_rule = config["rules"]["sims_rule"],
-        # General
+        ### General
         r_script = config["reference_sims"]["r_script"],
         sample_id = config["reference_sims"]["sample_id"],
         output_name_ref_metadata = config["reference_sims"]["output_name_ref_metadata"],
         output_name_ref_matrix = config["reference_sims"]["output_name_ref_matrix"],
         output_name_matrix = config["reference_sims"]["output_name_matrix"],
-        # Reference Arlotta
-        arlotta_metadata = config["reference_sims"]["arlotta_metadata"],
-        arlotta_matrix = config["reference_sims"]["arlotta_matrix"],
-        arlotta_cells = config["reference_sims"]["arlotta_cells"],
-        arlotta_features = config["reference_sims"]["arlotta_features"],
-        pattern_to_keep = config["reference_sims"]["pattern_to_keep"],
-        # Reference Allen mouse whole cortex
+        norm_method = config["seurat"]["norm_method"],
+        norm_scale_factor = config["seurat"]["norm_scale_factor"],
+        ### Reference Arlotta
+        # arlotta_metadata = config["reference_sims"]["arlotta_metadata"],
+        # arlotta_matrix = config["reference_sims"]["arlotta_matrix"],
+        # arlotta_cells = config["reference_sims"]["arlotta_cells"],
+        # arlotta_features = config["reference_sims"]["arlotta_features"],
+        developmental_time = config["reference_sims"]["developmental_time"],
+        ### Reference Allen mouse whole cortex
         allen_metadata = config["reference_sims"]["allen_metadata"],
         allen_matrix = config["reference_sims"]["allen_matrix"],
-        allen_genes = config["reference_sims"]["allen_genes"],
-        allen_cells = config["reference_sims"]["allen_cells"],
-        # Reference MouseGastrulation
-        mousegastrulation_samples = config["reference_sims"]["mousegastrulation_samples"].split(','),
 
     conda:
         CONTAINER + "preparation_sims.yaml"
@@ -58,6 +56,7 @@ rule anndata_for_sims:
         output_name_ref_matrix = config["reference_sims"]["output_name_ref_matrix"],
         output_name_matrix = config["reference_sims"]["output_name_matrix"],
         output_name_ref_metadata = config["reference_sims"]["output_name_ref_metadata"],
+        cells_column = config["reference_sims"]["cells_column"],
 
     conda:
         CONTAINER + "preparation_sims.yaml"
@@ -71,7 +70,7 @@ rule anndata_for_sims:
         then
             touch {output.anndata_for_sims_output}
         else
-            python 03_Script/to_anndata_file.py {params.sample_id} {params.output_name_ref_matrix} {params.output_name_matrix} {params.output_name_ref_metadata}
+            python 03_Script/to_anndata_file.py {params.sample_id} {params.output_name_ref_matrix} {params.output_name_matrix} {params.output_name_ref_metadata} {params.cells_column}
             touch {output.anndata_for_sims_output}
         fi
         """
\ No newline at end of file
diff --git a/Snakefile b/Snakefile
index 2c3ed2b..cd47d36 100644
--- a/Snakefile
+++ b/Snakefile
@@ -76,10 +76,15 @@ rule all:
 		# Seurat
 		# seurat_report = expand(OUTPUTDIR + "02_seurat/{sample_id}/{sample_id}_seurat_report.html", sample_id = SAMPLE_ID),
 		### Prepare data for SIMS
-		data_for_sims_output = expand(OUTPUTDIR + "03_sims/data_for_sims_output.txt"),
+		# data_for_sims_output = expand(OUTPUTDIR + "03_sims/data_for_sims_output.txt"),
+		# anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"),
+		### KNNOR
+		# knnor_output = expand(OUTPUTDIR +"03_sims/knnor_output.txt"),
 		# anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"),
 		### SIMS
-		# sims_output = expand(OUTPUTDIR + "03_sims/output_sims.txt"),
+		sims_output = expand(OUTPUTDIR + "03_sims/output_sims.txt"),
+
+
 
 		## Differential expression analyses
 		# violinplot = expand(OUTPUTDIR + "03_diffexp/violin_plot/{features}_violin_plot.pdf", features=FEATURES),
@@ -132,13 +137,16 @@ if run_demultiplex:
 	include: ENVDIR + "seurat.smk"
 	include: ENVDIR + "prepare_data_sims.smk"
 	include: ENVDIR + "SIMS.smk"
+	include: ENVDIR + "knnor.smk"
 
 if run_multiplex:	
-	# include: ENVDIR + "clean.smk"
-	# include: ENVDIR + "cellranger.smk"
+	include: ENVDIR + "clean.smk"
+	include: ENVDIR + "cellranger.smk"
 	include: ENVDIR + "seurat.smk"
 	include: ENVDIR + "prepare_data_sims.smk"
-	# include: ENVDIR + "SIMS.smk"
+	include: ENVDIR + "SIMS.smk"
+	include: ENVDIR + "knnor.smk"
+
 
 # include: ENVDIR + "demuxlet.smk"
 # include: ENVDIR + "seurat.smk"
diff --git a/run_slurm.sh b/run_slurm.sh
index d7cd545..910cc71 100644
--- a/run_slurm.sh
+++ b/run_slurm.sh
@@ -1,12 +1,12 @@
 #!/bin/sh
-#SBATCH -J Snakemake_run
-#SBATCH -p skylake
-#SBATCH -N 1
-#SBATCH -n 12
+
+#SBATCH -J allen_SIMS
+#SBATCH -p kepler
+#SBATCH --ntasks-per-node 18
 #SBATCH -A b324
-#SBATCH -t 2:00:00
+#SBATCH -t 20:00:00
 #SBATCH -o ./%N.%x.out
-#SBATCH -e ./%N.%x.err
+#SBATCH -e ./%N.%x.errs
 
 # This script needs to be started from the run directory
 
@@ -16,10 +16,15 @@ module load userspace/all
 module load python3/3.6.3
 module load singularity/3.5.1
 pip install snakemake==6.3.0
-pip install pandas
+# module load cuda/11.6
 
 # ================================================
 # Run the workflow
 # ================================================
 
-snakemake --snakefile Snakefile --use-singularity --use-conda --conda-frontend conda --conda-not-block-search-path-envvars --singularity-args="-B /scratch/$SLURM_JOB_USER/ibdm_lenne_scrnaseq_gastruloids/" --cores 12
\ No newline at end of file
+# For the worflow in general
+snakemake --unlock
+# snakemake --snakefile Snakefile --use-singularity --use-conda --conda-frontend conda --conda-not-block-search-path-envvars --singularity-args="-B /scratch/$SLURM_JOB_USER/scRNAseq_analysis_workflow/" --cores 18
+
+### For sims
+snakemake --snakefile Snakefile --use-singularity --singularity-args="-B /scratch/$SLURM_JOB_USER/scRNAseq_analysis_workflow/" --cores 18
\ No newline at end of file

From 38a61840dbdeb17385e5e826a1171b1bde00ad75 Mon Sep 17 00:00:00 2001
From: "Lea CHABOT lea-chabot@orange.fr" <lchabot@login01.cluster>
Date: Wed, 3 Jan 2024 16:01:01 +0100
Subject: [PATCH 08/15] arlotta

---
 03_Script/arlotta.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/03_Script/arlotta.R b/03_Script/arlotta.R
index 8874d8c..593fa66 100755
--- a/03_Script/arlotta.R
+++ b/03_Script/arlotta.R
@@ -3,7 +3,7 @@
 # Paola Arlotta reference :
 # https://www.nature.com/articles/s41586-021-03670-5
 #..............................................................
-
+###########################################################################
 #-------------------------------------
 # Library
 #-------------------------------------
@@ -12,7 +12,7 @@ library(data.table)
 library(Seurat)
 
 #-------------------------------------
-# Path to files / folderss
+# Path to files / folders
 #-------------------------------------
 DIRECTORY = getwd()
 TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]]

From 12b31d660fc5f96f768962e868b27588d011d104 Mon Sep 17 00:00:00 2001
From: "Lea CHABOT lea-chabot@orange.fr" <lchabot@login01.cluster>
Date: Fri, 16 Feb 2024 15:32:36 +0100
Subject: [PATCH 09/15] umapCellAssignation

---
 03_Script/umapCellAssignation.R     | 79 +++++++++++++++++++++++++++++
 04_Workflow/umapCellAssignation.smk | 19 +++++++
 Snakefile                           | 11 +++-
 3 files changed, 107 insertions(+), 2 deletions(-)
 create mode 100644 03_Script/umapCellAssignation.R
 create mode 100644 04_Workflow/umapCellAssignation.smk

diff --git a/03_Script/umapCellAssignation.R b/03_Script/umapCellAssignation.R
new file mode 100644
index 0000000..7139e4b
--- /dev/null
+++ b/03_Script/umapCellAssignation.R
@@ -0,0 +1,79 @@
+#..............................................................
+# 
+#..............................................................
+
+#-------------------------------------
+# Library
+#-------------------------------------
+library(Seurat)
+library(ggplot2)
+library(data.table)
+library(dplyr)
+
+#-------------------------------------
+# Path to files / folders
+#-------------------------------------
+DIRECTORY = getwd()
+TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]]
+OUTPUTDIR = file.path((DIRECTORY), "05_Output")
+
+SAMPLE_ID = snakemake@params[["sample_id"]]
+
+MATRIX = snakemake@params[["matrix"]]
+
+STEP2 = "02_seurat/"
+STEP3 = "03_sims/"
+
+#-------------------------------------
+# Load  the perdiction file and the 
+# seurat file
+#-------------------------------------
+data <- readRDS(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID,"_filtered_seurat_object.rds")))
+pred <- fread(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(MATRIX,"_prediction.csv")))
+
+#-------------------------------------
+# Add the first prediction in the 
+# seurat object
+#-------------------------------------
+data$labels <- pred$first_pred
+Idents(data) <- data$labels
+
+#-------------------------------------
+# Create a list for each label
+#-------------------------------------
+liste <- unique(pred$first_pred)
+
+#-------------------------------------
+# Create a list to stock each UMAP
+#-------------------------------------
+plots_list <- list()
+
+#-------------------------------------
+# To create a UMAP for each label
+#-------------------------------------
+for (i in liste) {
+  cells <- WhichCells(data, idents = c(i))
+  umap <- DimPlot(data, reduction = "umap", group.by = "labels", cells.highlight= list(cells), cols.highlight = c("lightblue"), cols= "grey") + ggtitle(i)
+  plots_list[[i]] <- umap
+}
+
+#-------------------------------------
+# Save each figure on a pdf
+#-------------------------------------
+num_graphiques_par_page <- 1
+pages <- split(plots_list, ceiling(seq_along(plots_list) / num_graphiques_par_page))
+pdf(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, "umap_per_labels.csv"))
+for (i in seq_along(pages)) {
+    for(plot in pages[[i]]){
+        print(plot)
+    }
+}
+dev.off()
+
+#-------------------------------------
+# Save the globa figure on a pdf
+#-------------------------------------
+pdf(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, "umap.csv"))
+UMAP <- DimPlot(data, group.by = "labels", repel = TRUE) + theme(legend.position = "right")
+print(UMAP)
+dev.off()
\ No newline at end of file
diff --git a/04_Workflow/umapCellAssignation.smk b/04_Workflow/umapCellAssignation.smk
new file mode 100644
index 0000000..11975ce
--- /dev/null
+++ b/04_Workflow/umapCellAssignation.smk
@@ -0,0 +1,19 @@
+rule umapCellAssignation:
+    input:
+        sims_output = expand(OUTPUTDIR + "03_sims/output_sims.txt"),
+
+    output:
+        umapAssignation_output = expand(OUTPUTDIR + "03_sims/umapAssignation_output.txt"),
+
+    params:
+        sample_id = config["reference_sims"]["sample_id"],
+        matrix = config["reference_sims"]["output_name_matrix"],
+    
+    conda :
+        CONTAINER + "seurat.yaml"
+
+    message:
+        "Cells assignation on UMAP"
+
+    script:
+        SCRIPTDIR + "umapCellAssignation.R"
diff --git a/Snakefile b/Snakefile
index cd47d36..737300a 100644
--- a/Snakefile
+++ b/Snakefile
@@ -77,12 +77,15 @@ rule all:
 		# seurat_report = expand(OUTPUTDIR + "02_seurat/{sample_id}/{sample_id}_seurat_report.html", sample_id = SAMPLE_ID),
 		### Prepare data for SIMS
 		# data_for_sims_output = expand(OUTPUTDIR + "03_sims/data_for_sims_output.txt"),
-		# anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"),
+		anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"),
 		### KNNOR
 		# knnor_output = expand(OUTPUTDIR +"03_sims/knnor_output.txt"),
 		# anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"),
 		### SIMS
-		sims_output = expand(OUTPUTDIR + "03_sims/output_sims.txt"),
+		# sims_output = expand(OUTPUTDIR + "03_sims/output_sims.txt"),
+		# umapAssignation_output = expand(OUTPUTDIR + "03_sims/umapAssignation_output.txt"),
+		################### REMI
+		# remi_output = expand(OUTPUTDIR + "remi_output.txt"),
 
 
 
@@ -138,6 +141,8 @@ if run_demultiplex:
 	include: ENVDIR + "prepare_data_sims.smk"
 	include: ENVDIR + "SIMS.smk"
 	include: ENVDIR + "knnor.smk"
+	include: ENVDIR + "remi.smk"
+	include: ENVDIR + "umapCellAssignation.smk"
 
 if run_multiplex:	
 	include: ENVDIR + "clean.smk"
@@ -146,6 +151,8 @@ if run_multiplex:
 	include: ENVDIR + "prepare_data_sims.smk"
 	include: ENVDIR + "SIMS.smk"
 	include: ENVDIR + "knnor.smk"
+	include: ENVDIR + "remi.smk"
+	include: ENVDIR + "umapCellAssignation.smk"
 
 
 # include: ENVDIR + "demuxlet.smk"

From 1687de3a71a9683b45878ebaa96e8aec1dedbebe Mon Sep 17 00:00:00 2001
From: "Lea CHABOT lea-chabot@orange.fr" <lchabot@login01.cluster>
Date: Fri, 5 Apr 2024 16:38:34 +0200
Subject: [PATCH 10/15] previous work

---
 02_Container/remi.yaml               |  10 ++
 03_Script/allen_mouse_whole_cortex.R | 116 ++++++++-----
 03_Script/arlotta.R                  | 249 +++++++++------------------
 3 files changed, 161 insertions(+), 214 deletions(-)
 create mode 100644 02_Container/remi.yaml

diff --git a/02_Container/remi.yaml b/02_Container/remi.yaml
new file mode 100644
index 0000000..d9d5e69
--- /dev/null
+++ b/02_Container/remi.yaml
@@ -0,0 +1,10 @@
+channels :
+    - r
+    - bioconda
+    - conda-forge
+    - defaults
+dependencies :
+    - r-seurat = 4.0.3
+    - r-dplyr = 1.0.9
+    - r-ggplot2 =3.3.5
+    - r-cowplot = 1.1.1
diff --git a/03_Script/allen_mouse_whole_cortex.R b/03_Script/allen_mouse_whole_cortex.R
index 4eac2e5..baab9ca 100644
--- a/03_Script/allen_mouse_whole_cortex.R
+++ b/03_Script/allen_mouse_whole_cortex.R
@@ -20,6 +20,7 @@ OUTPUTDIR = file.path((DIRECTORY), "05_Output")
 REF = file.path((DIRECTORY), "01_Reference")
 
 SAMPLE_ID = snakemake@params[["sample_id"]]
+REFERENCE_NAME = snakemake@params[["reference_name"]]
 
 NORM_METHOD = snakemake@params[["norm_method"]]
 NORM_SCALE_FACTOR = snakemake@params[["norm_scale_factor"]]
@@ -33,11 +34,24 @@ OUTPUT_NAME_MATRIX = snakemake@params[["output_name_matrix"]]
  
 STEP2 = "02_seurat/"
 STEP3 = "03_sims/"
+STEP_REMI = "03_remi/"
+
+METADATA_REMI <- TRUE
 
 #-------------------------------------
 # Load the metadata of the reference
 #-------------------------------------
-reference_metadata <- fread(file = file.path(REF, ALLEN_METADATA))
+
+if(METADATA_REMI){
+    reference_metadata <- fread(file = file.path(OUTPUTDIR, STEP_REMI, "final_allen.csv"))
+    cells_names <- "sample_name"
+    celltypes <- "celltype_label"
+} else {
+    reference_metadata <- fread(file = file.path(REF, ALLEN_METADATA))
+    cells_names <- "sample_name"
+    celltypes <- "subclass_label"
+}
+
 print("Reference metadata loaded. It's size is :")
 print(dim(reference_metadata))
 
@@ -46,19 +60,21 @@ print(dim(reference_metadata))
 # Keep only some part of the cortex
 # we want to study: Ssp
 #-------------------------------------
-reference_metadata <- reference_metadata[reference_metadata$region_label %in% "SSp",]
-print("Only SSp are kept in the metadata: The new size of this one is :")
-print(dim(reference_metadata))
-
+if(!METADATA_REMI){
+    reference_metadata <- reference_metadata[reference_metadata$region_label %in% "SSp",]
+    print("Only SSp are kept in the metadata: The new size of this one is :")
+    print(dim(reference_metadata))
+}
 ######################################################################################################
 
 #-------------------------------------
 # For the training we need at least
 # two cells per labels
 #-------------------------------------
-occurence <- table(reference_metadata$subclass_label)
+occurence <- table(reference_metadata[[celltypes]])
 one_cell_per_label <- names(occurence[occurence == 1])
-reference_metadata <- reference_metadata[!(reference_metadata$subclass_label %in% one_cell_per_label), ]
+reference_metadata <- reference_metadata[!(reference_metadata[[celltypes]] %in% one_cell_per_label), ]
+
 print("The new size after removing labels thaht only appear once :")
 print(dim(reference_metadata))
 
@@ -82,10 +98,10 @@ print(dim(reference_matrix))
 # Keep cells that match both the 
 # metadata an matrix of reference
 #-------------------------------------
-reference_matrix <- reference_matrix[rownames(reference_matrix) %in% reference_metadata$sample_name,]
-reference_metadata <- reference_metadata[reference_metadata$sample_name %in% rownames(reference_matrix),]
+reference_matrix <- reference_matrix[rownames(reference_matrix) %in% reference_metadata[[cells_names]],]
+reference_metadata <- reference_metadata[reference_metadata[[cells_names]] %in% rownames(reference_matrix),]
 
-print("The metadata and matrix of reference have the same cells now : The new size of them tow :")
+print("The metadata and matrix of reference have the same cells now : The new size of them two :")
 print(dim(reference_matrix))
 print(dim(reference_metadata))
 
@@ -93,68 +109,76 @@ print(dim(reference_metadata))
 # Order metadata according to matrix
 # (Necessary ?)
 #-------------------------------------
-reference_metadata <- reference_metadata[match(rownames(reference_matrix),reference_metadata$sample_name),]
-print("Cells are in the same order in booth files")
+reference_metadata <- reference_metadata[match(rownames(reference_matrix),reference_metadata[[cells_names]]),]
+print("Cells are in the same order in both files")
 
 #-------------------------------------
 # Write the the metadata as csv   
 #-------------------------------------
-dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
+dir.create(file.path(OUTPUTDIR, STEP3, REFERENCE_NAME))
 
-fwrite(x = reference_metadata, file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_METADATA, ".csv")))
+fwrite(x = reference_metadata, file = file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_METADATA, ".csv")))
 
 rm("reference_metadata")
 gc()
 
 print("Metadata is written")
+
 #-------------------------------------
 # Normalization of the matrix of
 # reference
 #-------------------------------------
-reference_matrix <-  NormalizeData( object = reference_matrix, normalization.method = NORM_METHOD, scale.factor = NORM_SCALE_FACTOR, verbose = FALSE)
-print("The matrix of reference is now normalized the same way as our matrix to analyze: Here the first 20 rows and 10 columns :")
-print(reference_matrix[1:20, 1:10])
+### Seurat change the names of some genes at this step
+cell_before_norm <- as.data.table(colnames(reference_matrix))
+write.csv(cell_before_norm, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, "cells_before_norm.csv"))
 
-#-------------------------------------
-# Load our matrix to annotate
-#-------------------------------------
-matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv")))
-matrix <- as.matrix(matrix)
-rownames(matrix) <- matrix[,"V1"]
+reference_matrix <- NormalizeData( object = reference_matrix, normalization.method = NORM_METHOD, scale.factor = NORM_SCALE_FACTOR, verbose = FALSE)
 
-print("Our matrix to annotate is loaded, see the dimension below :")
-print(dim(matrix))
+cell_after_norm <- as.data.table(colnames(reference_matrix))
+write.csv(cell_after_norm, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, "cells_after_norm.csv"))
 
-#-------------------------------------
-# Look at genes who are common in both
-# matrix and subset genes who are nots
-#-------------------------------------
-gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix))
-reference_matrix <- reference_matrix[, gene_intersection]
-matrix <- matrix[, gene_intersection]
+print("The matrix of reference is now normalized the same way as our matrix to analyze: Here the first 10 rows and 10 columns :")
+print(reference_matrix[1:10, 1:10])
 
-print("Only the gene in common in booth our matrix are kept : The new dimmension :")
-print(dim(reference_matrix))
-print(dim(matrix))
+# #-------------------------------------
+# # Load our matrix to annotate
+# #-------------------------------------
+# matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv")))
+# matrix <- as.matrix(matrix)
+# rownames(matrix) <- matrix[,"V1"]
+
+# print("Our matrix to annotate is loaded, see the dimension below :")
+# print(dim(matrix))
+
+# #-------------------------------------
+# # Look at genes who are common in both
+# # matrix and subset genes who are nots
+# #-------------------------------------
+# gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix))
+# reference_matrix <- reference_matrix[, gene_intersection]
+# matrix <- matrix[, gene_intersection]
+
+# print("Only the gene in common in booth our matrix are kept : The new dimmension :")
+# print(dim(reference_matrix))
+# print(colnames(reference_matrix)[1:35])
+# print(dim(matrix))
+# print(colnames(matrix)[1:35])
 
 #-------------------------------------
 # Wtrite the two new matrix
 #-------------------------------------
-class(matrix) <- "numeric"
-write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
-print("Our matrix is written")
+# class(matrix) <- "numeric"
+# write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
+# print("Our matrix is written")
 
-rm("matrix")
-gc()
+# rm("matrix")
+# gc()
 
-# reference_matrix <- as.matrix(reference_matrix)
-# rownames(reference_matrix) <- reference_matrix[,"sample_name"]
-# class(reference_matrix) <- "numeric"
-write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_MATRIX,".csv")))
+write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_MATRIX,".csv")))
 print("Our matrix of reference is written")
 
-rm("reference_matrix")
-gc()
+# rm("reference_matrix")
+# gc()
 
 #-------------------------------------
 # create the output file
diff --git a/03_Script/arlotta.R b/03_Script/arlotta.R
index 593fa66..d747eba 100755
--- a/03_Script/arlotta.R
+++ b/03_Script/arlotta.R
@@ -3,7 +3,7 @@
 # Paola Arlotta reference :
 # https://www.nature.com/articles/s41586-021-03670-5
 #..............................................................
-###########################################################################
+
 #-------------------------------------
 # Library
 #-------------------------------------
@@ -21,6 +21,7 @@ REF = file.path((DIRECTORY), "01_Reference")
 
 DEVELOPMENTAL_TIME = snakemake@params[["developmental_time"]]
 SAMPLE_ID = snakemake@params[["sample_id"]]
+REFERENCE_NAME = snakemake@params[["reference_name"]]
 
 NORM_METHOD = snakemake@params[["norm_method"]]
 NORM_SCALE_FACTOR = snakemake@params[["norm_scale_factor"]]
@@ -31,30 +32,52 @@ OUTPUT_NAME_MATRIX = snakemake@params[["output_name_matrix"]]
 
 STEP2 = "02_seurat/"
 STEP3 = "03_sims/"
+STEP_REMI = "03_remi/"
+
+METADATA_REMI <- TRUE
 
 #-------------------------------------
 # Load the metadata of the reference
 #-------------------------------------
-reference_metadata <- fread(file = file.path(REF, "arlotta/metaData_scDevSC.txt"))
-reference_metadata <- reference_metadata[-1, ]
-reference_metadata <- reference_metadata[, NEW_NAME := gsub("[^ATCG]", "", NAME)] ### Keep only the A,T,G and C in the original name
+if(METADATA_REMI){
+    reference_metadata <- fread(file = file.path(OUTPUTDIR, STEP_REMI, "final_arlotta.csv"))
+    cells_names <- "sample_name"
+    celltypes <- "celltype_label"
+    column_id <- "biosample_id"
+} else {
+    reference_metadata <- fread(file = file.path(REF, "arlotta/metaData_scDevSC.txt"))
+    reference_metadata <- reference_metadata[-1, ]
+    cells_names <- "NAME"
+    celltypes <- "New_cellType"
+    column_id <- "orig_ident"
+}
+
+reference_metadata <- reference_metadata[, NEW_NAME := gsub("[^ATCG]", "", get(cells_names))] ### Keep only the A,T,G and C in the original name
+print("The metadata is loaded, seee his dimension below :")
+print(dim(reference_metadata))
+print(head(reference_metadata))
 
 #-------------------------------------
 # For the training we need at least
 # two cells per labels
 #-------------------------------------
-occurence <- table(reference_metadata$New_cellType)
+occurence <- table(reference_metadata[[celltypes]])
 one_cell_per_label <- names(occurence[occurence == 1])
-reference_metadata <- reference_metadata[!(reference_metadata$New_cellType %in% one_cell_per_label), ]
+reference_metadata <- reference_metadata[!(reference_metadata[[celltypes]] %in% one_cell_per_label), ]
+
+print("The new dimension after removing the labels taht only appear once :")
+print(dim(reference_metadata))
 
 #-------------------------------------
 # Erase some labels in the metadata :
 # Low quality cells, Doublet,
 # Red blood cells, group
 #-------------------------------------
-patterns_to_delete <- c("Low quality cells", "Doublet", "Red blood cells", "group") 
-rows_to_delete <- apply(sapply(patterns_to_delete, grepl, reference_metadata$New_cellType), 1, any)
-reference_metadata <- subset(reference_metadata, !rows_to_delete)
+if (!METADATA_REMI){
+    patterns_to_delete <- c("Low quality cells", "Doublet", "Red blood cells", "group") 
+    rows_to_delete <- apply(sapply(patterns_to_delete, grepl, reference_metadata[[celltypes]]), 1, any)
+    reference_metadata <- subset(reference_metadata, !rows_to_delete)
+}
 
 #-------------------------------------
 # Load the matrix of reference 
@@ -89,6 +112,8 @@ for (x in file_to_open){
     list_reference_matrix[[x]] <- Read10X_h5(filepath[[x]])
 }
 
+print("The diferent matrix for diferent developmental time have been loaded in a list")
+
 #-------------------------------------
 # Normalization + Transpose them
 # (cell in rows) + Transform them
@@ -101,6 +126,8 @@ list_reference_matrix <- lapply(list_reference_matrix, function(x){
 list_reference_matrix <-  lapply(list_reference_matrix, function(x) as.matrix(x))
 list_reference_matrix <-  lapply(list_reference_matrix, function(x) t(x))
 
+print("The matrix have been normalized")
+
 #-------------------------------------
 # Keep only A,T,G and C in every 
 # matrix for the cells name (to match
@@ -121,49 +148,62 @@ list_reference_matrix <- lapply(list_reference_matrix, clean_rownames)
 metadata <- data.table()
 
 for(x in names(list_reference_matrix)){
-    if(x %in% reference_metadata$orig_ident){
-        rows_metadata <- reference_metadata[orig_ident == x] # Look in the metadata and keep only the rows that corespond to the matrix (Keep P4 rows for the P4 matrix for exemple)
-        cell_metadata <- rows_metadata$NEW_NAME # Cells in the metadata
-        cell_reference_matrix <- rownames(list_reference_matrix[[x]]) # Cells in the metadata
-        matching_cells <- intersect(cell_metadata, cell_reference_matrix) # Look at cells that match
-        temporary_metadata <- subset(rows_metadata, NEW_NAME %in% matching_cells) # Create a temporary metadata for each matrix whith only the cells that match and remove the others
-        temporary_metadata$NEW_NAME <- paste(x, temporary_metadata$NEW_NAME, sep="_") # Add the develomental age in front of cell name (some diferent developmental point have same name for cells, that is why)
-        metadata <- rbind(metadata, temporary_metadata) # Append the for each matrix the new metadata to the final one
-        list_reference_matrix[[x]] <- list_reference_matrix[[x]][matching_cells, ] # Now keep only the mathing cell in the matrix and remove the others
-        rownames(list_reference_matrix[[x]]) <- paste(x, rownames(list_reference_matrix[[x]]), sep = "_") # Add the develomental age in front of cell name (Like this also names still match metadata)
+  print("The developmental time :")
+  print(x)
+  if(x %in% reference_metadata[[column_id]]){
+    rows_metadata <- reference_metadata[reference_metadata[[column_id]] == x] # Look in the metadata and keep only the rows that corespond to the matrix (Keep P4 rows for the P4 matrix for exemple)
+    print("The dimension of the metadata for this develompental time :")
+    print(dim(rows_metadata))
+    cell_metadata <- rows_metadata$NEW_NAME # Cells in the metadata
+    cell_reference_matrix <- rownames(list_reference_matrix[[x]]) # Cells in the matrix
+    matching_cells <- intersect(cell_metadata, cell_reference_matrix) # Look at cells that match
+    temporary_metadata <- subset(rows_metadata, NEW_NAME %in% matching_cells) # Create a temporary metadata for each matrix whith only the cells that match and remove the others
+    temporary_metadata$NEW_NAME <- paste(x, temporary_metadata$NEW_NAME, sep="_")# Add the develomental age in front of cell name (some diferent developmental point have same name for cells, that is why)
+    print("The new dimension of the metadata after keeping only cells that match with the matrix :")
+    print(dim(temporary_metadata))
+    metadata <- rbind(metadata, temporary_metadata) # Append the for each matrix the new metadata to the final one
+    list_reference_matrix[[x]] <- list_reference_matrix[[x]][matching_cells, ] # Now keep only the mathing cell in the matrix and remove the others
+    rownames(list_reference_matrix[[x]]) <- paste(x, rownames(list_reference_matrix[[x]]), sep = "_") # Add the develomental age in front of cell name (Like this also names still match metadata)
     }
 }
 
+print("The final size of the metadata : ")
+print(dim(metadata))
+
 reference_matrix <- do.call(rbind, list_reference_matrix)
+print("The size of the matrix of reference : ")
+print(dim(reference_matrix))
+print("Here the first 10 rows and 10 columns :")
+print(reference_matrix[1:10, 1:10])
 
-#-------------------------------------
-# Load our matrix to annotate
-#-------------------------------------
-matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv"))) # The normalized matrix is used for this reference.
-matrix <- as.matrix(matrix)
-rownames(matrix) <- matrix[,"V1"]
+# # #-------------------------------------
+# # # Load our matrix to annotate
+# # #-------------------------------------
+# # matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_count_matrix.csv"))) # The normalized matrix is used for this reference.
+# # matrix <- as.matrix(matrix)
+# # rownames(matrix) <- matrix[,"V1"]
 
-#-------------------------------------
-# Look at genes who are common in both
-# matrix and subset genes who are not
-#-------------------------------------
-gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix))
-reference_matrix <- reference_matrix[, gene_intersection]
-matrix <- matrix[, gene_intersection]
+# # #-------------------------------------
+# # # Look at genes who are common in both
+# # # matrix and subset genes who are not
+# # #-------------------------------------
+# # gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix))
+# # reference_matrix <- reference_matrix[, gene_intersection]
+# # matrix <- matrix[, gene_intersection]
 
-#-------------------------------------
-# Wtrite the two new matrix +
-# create directory
-#-------------------------------------
-dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
+# #-------------------------------------
+# # Wtrite the two new matrix +
+# # create directory
+# #-------------------------------------
+dir.create(file.path(OUTPUTDIR, STEP3, REFERENCE_NAME))
 
 class(reference_matrix) <- "numeric"
-write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_MATRIX,".csv")))
+write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_MATRIX,".csv")))
 
-class(matrix) <- "numeric"
-write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
+# # class(matrix) <- "numeric"
+# # write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
 
-fwrite(x = metadata, file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_METADATA,".csv")))
+fwrite(x = metadata, file = file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_METADATA,".csv")))
 
 #-------------------------------------
 # create the output file
@@ -172,130 +212,3 @@ output_file<-file(TEXT_OUTPUT)
 writeLines(c("Files preparation for the Arlotta reference for SIMS finished (CSV format)"), output_file)
 close(output_file)
 
-######################################################################################################################################################################################################################################################
-# Old code
-
-# #-------------------------------------
-# # Path to files / folders
-# #-------------------------------------
-# DIRECTORY = getwd()
-# TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]]
-# OUTPUTDIR = file.path((DIRECTORY), "05_Output")
-# REF = file.path((DIRECTORY), "01_Reference")
-
-# PATTERN_TO_KEEP = snakemake@params[["pattern_to_keep"]]
-# SAMPLE_ID = snakemake@params[["sample_id"]]
-
-# ARLOTTA_METADATA = snakemake@params[["arlotta_metadata"]]
-# ARLOTTA_MATRIX = snakemake@params[["arlotta_matrix"]]
-# ARLOTTA_CELLS = snakemake@params[["arlotta_cells"]]
-# ARLOTTA_FEATURES = snakemake@params[["arlotta_features"]]
-
-# OUTPUT_NAME_REF_METADATA = snakemake@params[["output_name_ref_metadata"]]
-# OUTPUT_NAME_REF_MATRIX = snakemake@params[["output_name_ref_matrix"]]
-# OUTPUT_NAME_MATRIX = snakemake@params[["output_name_matrix"]]
-
-# STEP2 = "02_seurat/"
-# STEP3 = "03_sims/"
-
-# #-------------------------------------
-# # Load the metadata of the reference
-# #-------------------------------------
-# reference_metadata <- fread(file = file.path(REF, ARLOTTA_METADATA))
-
-# #-------------------------------------
-# # Erase some labels in the metadata :
-# # Low quality cells, Doublet,
-# # Red blood cells, group
-# #-------------------------------------
-# patterns_to_delete <- c("Low quality cells", "Doublet", "Red blood cells", "group") 
-# rows_to_delete <- apply(sapply(patterns_to_delete, grepl, reference_metadata$New_cellType), 1, any)
-# reference_metadata <- subset(reference_metadata, !rows_to_delete)
-
-# #-------------------------------------
-# # Load the matrix of reference
-# #-------------------------------------
-# reference_matrix <- readMM(file = file.path(REF, ARLOTTA_MATRIX))
-
-# feature.names = read.delim(file.path(REF, ARLOTTA_FEATURES),
-#                            header = FALSE,
-#                            stringsAsFactors = FALSE)
-
-# cells.names = read.delim(file.path(REF, ARLOTTA_CELLS),
-#                            header = FALSE,
-#                            stringsAsFactors = FALSE)
-
-# colnames(reference_matrix) = cells.names$V1
-# rownames(reference_matrix) = feature.names$V1
-
-# # #-------------------------------------
-# # # Transpose the matrix :
-# # # Cells in row and genes in columns +
-# # # Transform rownames and colnames
-# # # as characters (for sims).
-# # #-------------------------------------
-# reference_matrix <- t(reference_matrix)
-# reference_matrix <- as.matrix(reference_matrix)
-
-# colnames(reference_matrix) <- as.character(colnames(reference_matrix)) # Genes
-# rownames(reference_matrix) <- as.character(rownames(reference_matrix)) # Cells
-
-# #-------------------------------------
-# # Write a new matrix : Keep only
-# # mouse age that are interesting
-# # for the project
-# #-------------------------------------
-# pattern_to_keep <- c(PATTERN_TO_KEEP) 
-# rows_to_keep <- grepl(paste(pattern_to_keep, collapse = "|"), rownames(reference_matrix))
-# reference_matrix <- reference_matrix[rows_to_keep, ]
-
-# #-------------------------------------
-# # Keep cells that match both the 
-# # metadata an matrix of reference
-# #-------------------------------------
-# column_metadata <- reference_metadata$NAME
-# matching_cells <- intersect(rownames(reference_matrix), column_metadata)
-# reference_metadata <- reference_metadata[reference_metadata$NAME %in% matching_cells, ]
-# reference_matrix <- reference_matrix[matching_cells, ]
-
-# #-------------------------------------
-# # Order metadata according to matrix
-# # (Necessary ?)
-# #-------------------------------------
-# reference_metadata <- reference_metadata[match(rownames(reference_matrix),reference_metadata$NAME),]
-
-# #-------------------------------------
-# # Load our matrix to annotate
-# #-------------------------------------
-# matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv"))) # The normalized matrix is used for this reference.
-# matrix <- as.matrix(matrix)
-# rownames(matrix) <- matrix[,"V1"]
-
-# #-------------------------------------
-# # Look at genes who are common in both
-# # matrix and subset genes who are not
-# #-------------------------------------
-# gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix))
-# reference_matrix <- reference_matrix[, gene_intersection]
-# matrix <- matrix[, gene_intersection]
-
-# #-------------------------------------
-# # Wtrite the two new matrix +
-# # create directory
-# #-------------------------------------
-# dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
-
-# class(reference_matrix) <- "numeric"
-# write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_MATRIX,".csv")))
-
-# class(matrix) <- "numeric"
-# write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
-
-# fwrite(x = reference_metadata, file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_METADATA,".csv")))
-
-# #-------------------------------------
-# # create the output file
-# #-------------------------------------
-# output_file<-file(TEXT_OUTPUT)
-# writeLines(c("Files preparation for the Arlotta reference for SIMS finished (CSV format)"), output_file)
-# close(output_file)
\ No newline at end of file

From 58b1dd016a8b676509ca0d117b56ffc996414624 Mon Sep 17 00:00:00 2001
From: "Lea CHABOT lea-chabot@orange.fr" <lchabot@login02.cluster>
Date: Mon, 15 Apr 2024 10:54:08 +0200
Subject: [PATCH 11/15] adapt code for new version of sims

---
 02_Container/preparation_sims.yaml      |  27 ++-
 02_Container/sims.yaml                  |   3 +-
 03_Script/allen_mouse_whole_cortex.R    | 221 ++++++++++++------------
 03_Script/arlotta.R                     |  60 ++++---
 03_Script/remi.R                        | 103 +++++++++++
 03_Script/sims_prediction.py            |  53 ++++++
 03_Script/{sims.py => sims_training.py} |  39 +----
 03_Script/to_anndata_file.py            |  65 +++----
 03_Script/umapCellAssignation.R         | 127 +++++++++-----
 04_Workflow/SIMS.smk                    |  60 ++++---
 04_Workflow/knnor.smk                   |   2 +-
 04_Workflow/prepare_data_sims.smk       |   9 +-
 04_Workflow/remi.smk                    |  18 ++
 04_Workflow/umapCellAssignation.smk     |  14 +-
 Snakefile                               | 101 ++++-------
 config.yaml                             | 139 ++++++++-------
 feature_config.csv                      |  16 ++
 multi_config.csv                        |  13 +-
 run_slurm.sh                            |  12 +-
 19 files changed, 648 insertions(+), 434 deletions(-)
 create mode 100644 03_Script/remi.R
 create mode 100644 03_Script/sims_prediction.py
 rename 03_Script/{sims.py => sims_training.py} (62%)
 create mode 100644 04_Workflow/remi.smk
 create mode 100644 feature_config.csv

diff --git a/02_Container/preparation_sims.yaml b/02_Container/preparation_sims.yaml
index e86cc7e..48dd025 100644
--- a/02_Container/preparation_sims.yaml
+++ b/02_Container/preparation_sims.yaml
@@ -1,17 +1,14 @@
+channels :
+    - r
+    - bioconda
+    - conda-forge
     - defaults
 dependencies :
-    - r-base =4.3.0
-    - r-data.table
-    - r-matrix
-    # - conda-forge::anndata = 0.9.2
-    # - conda-forge::pandas = 2.1.1
-    - r-biocmanager = 1.30.22
-    - r-magick
-    - bioconductor-scran
-    - r-igraph
-    - bioconductor-scater =1.28.0
-    # - bioconductor-biostrings
-    # - bioconductor-keggrest
-    # - bioconductor-annotationdbi
-    # - bioconductor-annotationhub
-    - r-reshape2 =1.4.4
+    - r-data.table = 1.14.2
+    - conda-forge::r-matrix = 1.5.4.1
+    - conda-forge::anndata = 0.9.2
+    - conda-forge::pandas = 2.1.1
+    - r-seurat = 4.0.3
+    - r-dplyr = 1.0.9
+    - conda-forge::r-hdf5r = 1.3.8
+    - bioconductor-rhdf5 = 2.38.1
\ No newline at end of file
diff --git a/02_Container/sims.yaml b/02_Container/sims.yaml
index afd0061..b431007 100644
--- a/02_Container/sims.yaml
+++ b/02_Container/sims.yaml
@@ -8,4 +8,5 @@ dependencies :
   - pip
   - pip:
     - --upgrade pip setuptools wheel
-    - --use-pep517 git+https://github.com/braingeneers/SIMS.git
\ No newline at end of file
+    # Sims version Nov 17, 2023
+    - --use-pep517 git+https://github.com/braingeneers/SIMS.git@c3cc547e9223e979fdc70f9af2ae932b729da88e
\ No newline at end of file
diff --git a/03_Script/allen_mouse_whole_cortex.R b/03_Script/allen_mouse_whole_cortex.R
index baab9ca..b3a6daf 100644
--- a/03_Script/allen_mouse_whole_cortex.R
+++ b/03_Script/allen_mouse_whole_cortex.R
@@ -38,117 +38,143 @@ STEP_REMI = "03_remi/"
 
 METADATA_REMI <- TRUE
 
-#-------------------------------------
-# Load the metadata of the reference
-#-------------------------------------
+# #-------------------------------------
+# # Load the metadata of the reference
+# #-------------------------------------
 
-if(METADATA_REMI){
-    reference_metadata <- fread(file = file.path(OUTPUTDIR, STEP_REMI, "final_allen.csv"))
-    cells_names <- "sample_name"
-    celltypes <- "celltype_label"
-} else {
-    reference_metadata <- fread(file = file.path(REF, ALLEN_METADATA))
-    cells_names <- "sample_name"
-    celltypes <- "subclass_label"
-}
+# if(METADATA_REMI){
+#     reference_metadata <- fread(file = file.path(OUTPUTDIR, STEP_REMI, "final_allen.csv"))
+#     cells_names <- "sample_name"
+#     celltypes <- "celltype_label"
+# } else {
+#     reference_metadata <- fread(file = file.path(REF, ALLEN_METADATA))
+#     cells_names <- "sample_name"
+#     celltypes <- "subclass_label"
+# }
 
-print("Reference metadata loaded. It's size is :")
-print(dim(reference_metadata))
+# print("Reference metadata loaded. It's size is :")
+# print(dim(reference_metadata))
 
-########################################## PROPRE AU PROJET ##########################################
-#-------------------------------------
-# Keep only some part of the cortex
-# we want to study: Ssp
-#-------------------------------------
-if(!METADATA_REMI){
-    reference_metadata <- reference_metadata[reference_metadata$region_label %in% "SSp",]
-    print("Only SSp are kept in the metadata: The new size of this one is :")
-    print(dim(reference_metadata))
-}
-######################################################################################################
+# ########################################## PROPRE AU PROJET ##########################################
+# #-------------------------------------
+# # Keep only some part of the cortex
+# # we want to study: Ssp
+# #-------------------------------------
+# if(!METADATA_REMI){
+#     reference_metadata <- reference_metadata[reference_metadata$region_label %in% "SSp",]
+#     print("Only SSp are kept in the metadata: The new size of this one is :")
+#     print(dim(reference_metadata))
+# }
+# ######################################################################################################
 
-#-------------------------------------
-# For the training we need at least
-# two cells per labels
-#-------------------------------------
-occurence <- table(reference_metadata[[celltypes]])
-one_cell_per_label <- names(occurence[occurence == 1])
-reference_metadata <- reference_metadata[!(reference_metadata[[celltypes]] %in% one_cell_per_label), ]
+# #-------------------------------------
+# # For the training we need at least
+# # two cells per labels
+# #-------------------------------------
+# occurence <- table(reference_metadata[[celltypes]])
+# one_cell_per_label <- names(occurence[occurence == 1])
+# reference_metadata <- reference_metadata[!(reference_metadata[[celltypes]] %in% one_cell_per_label), ]
 
-print("The new size after removing labels thaht only appear once :")
-print(dim(reference_metadata))
+# print("The new size after removing labels thaht only appear once :")
+# print(dim(reference_metadata))
 
-#-------------------------------------
-# Load the matrix of reference
-#-------------------------------------
-reference_matrix <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/counts")
-reference_genes <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/gene")
-reference_cells <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/samples")
+# #-------------------------------------
+# # Load the matrix of reference
+# #-------------------------------------
+# reference_matrix <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/counts")
+# reference_genes <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/gene")
+# reference_cells <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/samples")
 
-rownames(reference_matrix) <- as.character(reference_cells)
-colnames(reference_matrix) <- as.character(reference_genes)
+# rownames(reference_matrix) <- as.character(reference_cells)
+# colnames(reference_matrix) <- as.character(reference_genes)
 
-rm(list = c("reference_genes","reference_cells"))
-gc()
+# rm(list = c("reference_genes","reference_cells"))
+# gc()
 
-print("Reference matrix loaded. The size is :")
-print(dim(reference_matrix))
+# print("Reference matrix loaded. The size is :")
+# print(dim(reference_matrix))
 
-#-------------------------------------
-# Keep cells that match both the 
-# metadata an matrix of reference
-#-------------------------------------
-reference_matrix <- reference_matrix[rownames(reference_matrix) %in% reference_metadata[[cells_names]],]
-reference_metadata <- reference_metadata[reference_metadata[[cells_names]] %in% rownames(reference_matrix),]
+# #-------------------------------------
+# # Keep cells that match both the 
+# # metadata and matrix of reference
+# #-------------------------------------
+# reference_matrix <- reference_matrix[rownames(reference_matrix) %in% reference_metadata[[cells_names]],]
+# reference_metadata <- reference_metadata[reference_metadata[[cells_names]] %in% rownames(reference_matrix),]
 
-print("The metadata and matrix of reference have the same cells now : The new size of them two :")
-print(dim(reference_matrix))
-print(dim(reference_metadata))
+# print("The metadata and matrix of reference have the same cells now : The new size of them two :")
+# print(dim(reference_matrix))
+# print(dim(reference_metadata))
+# print(reference_matrix[1:5, 1:25])
 
-#-------------------------------------
-# Order metadata according to matrix
-# (Necessary ?)
-#-------------------------------------
-reference_metadata <- reference_metadata[match(rownames(reference_matrix),reference_metadata[[cells_names]]),]
-print("Cells are in the same order in both files")
+# #-------------------------------------
+# # Order metadata according to matrix
+# # (Necessary ?)
+# #-------------------------------------
+# reference_metadata <- reference_metadata[match(rownames(reference_matrix),reference_metadata[[cells_names]]),]
+# print("Cells are in the same order in both files")
+
+# #-------------------------------------
+# # Write the the metadata as csv   
+# #-------------------------------------
+# dir.create(file.path(OUTPUTDIR, STEP3, REFERENCE_NAME))
+
+# fwrite(x = reference_metadata, file = file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_METADATA, ".csv")))
+
+# rm("reference_metadata")
+# gc()
+
+# print("Metadata is written")
+
+# #-------------------------------------
+# # Normalization of the matrix of
+# # reference
+# #-------------------------------------
+# reference_matrix <- NormalizeData( object = reference_matrix, normalization.method = NORM_METHOD, scale.factor = NORM_SCALE_FACTOR, verbose = FALSE)
+# reference_matrix <- as.matrix(reference_matrix)
+
+# print("The matrix of reference is now normalized the same way as our matrix to analyze: Here the first 10 rows and 10 columns :")
+# print(reference_matrix[1:5, 1:25])
 
 #-------------------------------------
-# Write the the metadata as csv   
+# Load our matrix to annotate
 #-------------------------------------
-dir.create(file.path(OUTPUTDIR, STEP3, REFERENCE_NAME))
+dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
+
+matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv")))
+matrix <- as.matrix(matrix)
+rownames(matrix) <- matrix[,"V1"]
+matrix <- matrix[, -1]
 
-fwrite(x = reference_metadata, file = file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_METADATA, ".csv")))
+print("Our matrix to annotate is loaded, see the dimension below :")
+print(dim(matrix))
 
-rm("reference_metadata")
+# -------------------------------------
+# Wtrite the two new matrix
+# -------------------------------------
+class(matrix) <- "numeric"
+write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
+print("Our matrix is written")
+
+rm("matrix")
 gc()
 
-print("Metadata is written")
+# class(reference_matrix) <- "numeric"
+# write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_MATRIX,".csv")))
+# print("Our matrix of reference is written")
+
+# rm("reference_matrix")
+# gc()
 
 #-------------------------------------
-# Normalization of the matrix of
-# reference
+# create the output file
 #-------------------------------------
-### Seurat change the names of some genes at this step
-cell_before_norm <- as.data.table(colnames(reference_matrix))
-write.csv(cell_before_norm, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, "cells_before_norm.csv"))
-
-reference_matrix <- NormalizeData( object = reference_matrix, normalization.method = NORM_METHOD, scale.factor = NORM_SCALE_FACTOR, verbose = FALSE)
+output_file<-file(TEXT_OUTPUT)
+writeLines(c("Files preparation for the Allen reference for SIMS finished (CSV format)"), output_file)
+close(output_file)
 
-cell_after_norm <- as.data.table(colnames(reference_matrix))
-write.csv(cell_after_norm, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, "cells_after_norm.csv"))
 
-print("The matrix of reference is now normalized the same way as our matrix to analyze: Here the first 10 rows and 10 columns :")
-print(reference_matrix[1:10, 1:10])
 
-# #-------------------------------------
-# # Load our matrix to annotate
-# #-------------------------------------
-# matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv")))
-# matrix <- as.matrix(matrix)
-# rownames(matrix) <- matrix[,"V1"]
 
-# print("Our matrix to annotate is loaded, see the dimension below :")
-# print(dim(matrix))
 
 # #-------------------------------------
 # # Look at genes who are common in both
@@ -162,27 +188,4 @@ print(reference_matrix[1:10, 1:10])
 # print(dim(reference_matrix))
 # print(colnames(reference_matrix)[1:35])
 # print(dim(matrix))
-# print(colnames(matrix)[1:35])
-
-#-------------------------------------
-# Wtrite the two new matrix
-#-------------------------------------
-# class(matrix) <- "numeric"
-# write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
-# print("Our matrix is written")
-
-# rm("matrix")
-# gc()
-
-write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_MATRIX,".csv")))
-print("Our matrix of reference is written")
-
-# rm("reference_matrix")
-# gc()
-
-#-------------------------------------
-# create the output file
-#-------------------------------------
-output_file<-file(TEXT_OUTPUT)
-writeLines(c("Files preparation for the Allen reference for SIMS finished (CSV format)"), output_file)
-close(output_file)
+# print(colnames(matrix)[1:35])
\ No newline at end of file
diff --git a/03_Script/arlotta.R b/03_Script/arlotta.R
index d747eba..eed037e 100755
--- a/03_Script/arlotta.R
+++ b/03_Script/arlotta.R
@@ -171,39 +171,40 @@ print("The final size of the metadata : ")
 print(dim(metadata))
 
 reference_matrix <- do.call(rbind, list_reference_matrix)
+
 print("The size of the matrix of reference : ")
 print(dim(reference_matrix))
 print("Here the first 10 rows and 10 columns :")
 print(reference_matrix[1:10, 1:10])
 
-# # #-------------------------------------
-# # # Load our matrix to annotate
-# # #-------------------------------------
-# # matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_count_matrix.csv"))) # The normalized matrix is used for this reference.
-# # matrix <- as.matrix(matrix)
-# # rownames(matrix) <- matrix[,"V1"]
-
-# # #-------------------------------------
-# # # Look at genes who are common in both
-# # # matrix and subset genes who are not
-# # #-------------------------------------
-# # gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix))
-# # reference_matrix <- reference_matrix[, gene_intersection]
-# # matrix <- matrix[, gene_intersection]
+#-------------------------------------
+# Load our matrix to annotate
+#-------------------------------------
+matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv"))) # The normalized matrix is used for this reference.
+matrix <- as.matrix(matrix)
+rownames(matrix) <- matrix[,"V1"]
+matrix <- matrix[, -1]
 
-# #-------------------------------------
-# # Wtrite the two new matrix +
-# # create directory
-# #-------------------------------------
-dir.create(file.path(OUTPUTDIR, STEP3, REFERENCE_NAME))
+print("Our matrix to annotate is loaded, here the first rows : ")
+print(matrix[1:10, 1:10])
 
+#-------------------------------------
+# Wtrite the two new matrix +
+# create directory
+#-------------------------------------
+dir.create(file.path(OUTPUTDIR, STEP3, REFERENCE_NAME))
 class(reference_matrix) <- "numeric"
-write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_MATRIX,".csv")))
+write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_MATRIX,".csv")), row.names = TRUE)
+print("The reference matrix is written")
 
-# # class(matrix) <- "numeric"
-# # write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
 
-fwrite(x = metadata, file = file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_METADATA,".csv")))
+dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
+class(matrix) <- "numeric"
+write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
+print("The matrix is written")
+
+fwrite(x = metadata, file = file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_METADATA,".csv")), row.names = TRUE)
+print("The metadaa is written")
 
 #-------------------------------------
 # create the output file
@@ -212,3 +213,16 @@ output_file<-file(TEXT_OUTPUT)
 writeLines(c("Files preparation for the Arlotta reference for SIMS finished (CSV format)"), output_file)
 close(output_file)
 
+
+
+
+
+
+
+# #-------------------------------------
+# # Look at genes who are common in both
+# # matrix and subset genes who are not
+# #-------------------------------------
+# gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix))
+# reference_matrix <- reference_matrix[, gene_intersection]
+# matrix <- matrix[, gene_intersection]
diff --git a/03_Script/remi.R b/03_Script/remi.R
new file mode 100644
index 0000000..559be00
--- /dev/null
+++ b/03_Script/remi.R
@@ -0,0 +1,103 @@
+#..............................................................
+# Preparation of new metadatas using Remi work for the Allen
+# and Di Bella references
+#..............................................................
+
+#-------------------------------------
+# Library
+#-------------------------------------
+library(Seurat)
+library(data.table)
+library(ggplot2)
+
+#-------------------------------------
+# Path to files / folders
+#-------------------------------------
+DIRECTORY = getwd()
+OUTPUTDIR = file.path((DIRECTORY), "05_Output")
+REF = file.path((DIRECTORY), "01_Reference")
+
+STEP2 = "02_seurat/"
+STEP3 = "03_sims/"
+STEP_REMI = "03_remi/"
+
+dir.create(file.path(OUTPUTDIR, STEP_REMI))
+
+#-------------------------------------
+# Fichier celltype manipulation
+#-------------------------------------
+
+# Open the RDS and write the structure as a .txt file :
+remi_celltypes <- readRDS(file.path(REF, "remi/GECortex_PostMsub_celltypefinal.rds" )) # ~ 16 GB
+sink(file.path(OUTPUTDIR, STEP_REMI, "remi_celltypefinal_structure.txt" ))
+str(remi_celltypes)
+sink()
+
+# Write the metadata as a csv file :
+remi_celltypes_metadata <- remi_celltypes@meta.data
+write.csv(remi_celltypes_metadata, file.path(OUTPUTDIR, STEP_REMI, "remi_celltypefinal_metadata.csv"), row.names = TRUE)
+
+# write metadata for both Arlotta and Allen reference :
+motif_arlotta <- "J.Di Bella et al., Nature 2021"
+remi_arlotta_metadata <- subset(remi_celltypes_metadata, grepl(motif_arlotta, remi_celltypes_metadata[["study_label"]]))
+write.csv(remi_arlotta_metadata, file.path(OUTPUTDIR, STEP_REMI, "remi_arlotta_metadata.csv"), row.names = TRUE)
+
+motif_allen <- "Yao et al., Cell 2021"
+remi_allen_metadata <- subset(remi_celltypes_metadata, grepl(motif_allen, remi_celltypes_metadata[["study_label"]]))
+remi_allen_metadata <- remi_allen_metadata[remi_allen_metadata$platform_label %in% "10X", ] # I only use this matrix in my analysis
+write.csv(remi_allen_metadata, file.path(OUTPUTDIR, STEP_REMI, "remi_allen_metadata.csv"), row.names = TRUE)
+
+#-------------------------------------
+# ARLOTTA 
+#-------------------------------------
+# Open the metadata
+arlotta_metadata <- fread(file = file.path(REF, "arlotta/metaData_scDevSC.txt"))
+
+# Remove every cells that are not Non Neuronal
+strings_to_remove <- c("Interneurons", "Cajal Retzius cells", "UL CPN", "Layer 4", "DL CPN", "DL_CPN_1", "DL_CPN_2", "SCPN", "NP", "CThPN", "Layer 6b", "Doublet", "group", "Low quality cells", "Red blood cells")
+for (pattern in strings_to_remove) {
+  rows_to_remove <- grep(pattern, arlotta_metadata$New_cellType)
+  arlotta_metadata <- arlotta_metadata[-rows_to_remove, ]
+}
+
+# Keep only some column in both files for a futur merge
+arlotta_metadata <- subset(arlotta_metadata, select = c("NAME", "New_cellType", "orig_ident")) # Cells, subclass (for NN), and the age of the mouse
+remi_arlotta_metadata <- subset(remi_arlotta_metadata, select = c("sample_name", "celltype_label", "biosample_id"))# Cells, celltype (for GABAergic and Glutamatergic), and the age of the mouse
+
+# Give the two data table the same column name for the merge
+setnames(arlotta_metadata, "NAME", "sample_name")
+setnames(arlotta_metadata, "New_cellType", "celltype_label")
+setnames(arlotta_metadata, "orig_ident", "biosample_id")
+
+# Merge the two files
+final_arlotta <- rbind(remi_arlotta_metadata, arlotta_metadata)
+write.csv(final_arlotta, file.path(OUTPUTDIR, STEP_REMI, "final_arlotta.csv"), row.names = FALSE)
+
+#-------------------------------------
+# ALLEN
+#-------------------------------------
+# Open the metadata
+allen_metadata <- fread(file = file.path(REF, "allen/metadata.csv"))
+
+# Keep only cells that are Non Neuronal
+strings_to_keep <- c("Astro", "Endo", "Micro-PVM", "Oligo", "SMC-Peri", "VLMC")
+allen_metadata <- allen_metadata[allen_metadata$subclass_label %in% strings_to_keep]
+
+# Keep only some column in both files for a futur merge
+allen_metadata <- subset(allen_metadata, select = c("sample_name", "subclass_label"))# Cells, subclass (for NN)
+remi_allen_metadata <- subset(remi_allen_metadata, select = c("exp_component_name", "celltype_label"))
+
+# Give the two data table the same column name for the merge
+setnames(remi_allen_metadata, "exp_component_name", "sample_name")
+setnames(allen_metadata, "subclass_label", "celltype_label")
+
+# Merge the two files
+final_allen <- rbind(remi_allen_metadata, allen_metadata)
+write.csv(final_allen, file.path(OUTPUTDIR, STEP_REMI, "final_allen.csv"), row.names = FALSE)
+
+
+
+
+
+
+
diff --git a/03_Script/sims_prediction.py b/03_Script/sims_prediction.py
new file mode 100644
index 0000000..29c7f9f
--- /dev/null
+++ b/03_Script/sims_prediction.py
@@ -0,0 +1,53 @@
+#..............................................................
+# SIMS : Scalable, Interpretable Machine Learning for 
+# Single-Cell : A machine learning tool for scRNA-Seq label
+# transfer in neuroscience
+#..............................................................
+
+#-------------------------------------
+# Import
+#-------------------------------------
+from scsims import SIMS
+from pytorch_lightning.loggers import WandbLogger
+from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
+import os
+import sys
+import anndata as an
+import wandb
+
+#-------------------------------------
+# Path to files / folders
+#-------------------------------------
+DIRECTORY = os.getcwd()
+OUTPUTDIR = os.path.join(DIRECTORY, "05_Output")
+
+SAMPLE_ID = sys.argv[1]
+MATRIX = sys.argv[2]
+EPOCH = sys.argv[3]
+REFERENCE_NAME = sys.argv[4]
+
+STEP3 = "03_sims/"
+
+#-----------------------------------------------------------------------------------------------
+#                                To predict labels on our data 
+#-----------------------------------------------------------------------------------------------
+
+#-------------------------------------
+# Load the model
+#-------------------------------------
+model = os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, EPOCH)
+
+#-------------------------------------
+# Prediction
+#-------------------------------------
+sims = SIMS(weights_path = model)
+cell_prediction = sims.predict(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".h5ad"))
+
+#-------------------------------------
+# Write prediction in a csv file
+#-------------------------------------
+matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".h5ad"))
+
+cell_prediction.insert(0,"Cells", matrix.obs_names)
+
+cell_prediction.to_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, SAMPLE_ID + "_" + MATRIX + "_prediction.csv"), index = False)
\ No newline at end of file
diff --git a/03_Script/sims.py b/03_Script/sims_training.py
similarity index 62%
rename from 03_Script/sims.py
rename to 03_Script/sims_training.py
index 51d54c1..abd3e3c 100644
--- a/03_Script/sims.py
+++ b/03_Script/sims_training.py
@@ -22,7 +22,7 @@
 REF = os.path.join(DIRECTORY, "01_Reference")
 OUTPUTDIR = os.path.join(DIRECTORY, "05_Output")
 
-SAMPLE_ID = sys.argv[1]
+REFERENCE_NAME = sys.argv[1]
 REFERENCE_MATRIX = sys.argv[2]
 PROJECT_NAME = sys.argv[3]
 CLASS_LABEL = sys.argv[4]
@@ -43,14 +43,17 @@
 # Logger : Allows you to visualize 
 # data training.
 # -------------------------------------
+### To use wandb
 wandb.login(key = KEY)
+logger = WandbLogger(project = PROJECT_NAME, name = PROJECT_NAME, save_dir = os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME), version = "")
 
-logger = WandbLogger(project = PROJECT_NAME, name = PROJECT_NAME, save_dir = os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID), version = "")
+### To turn off wandb
+# logger = WandbLogger(offline=True)
 
 #-------------------------------------
 # Load the anndata file
 #-------------------------------------
-reference_matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, REFERENCE_MATRIX + "_join_metadata.h5ad"))
+reference_matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, REFERENCE_MATRIX + "_join_metadata.h5ad"))
 
 #-------------------------------------
 # Custom training jobs
@@ -67,39 +70,15 @@
             monitor = MONITOR,
             patience = int(PATIENCE),
             verbose = 1,
-            # mode = 'max'
         ),
         LearningRateMonitor(logging_interval = "epoch"),
         ModelCheckpoint(
-            dirpath = os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID),
-            # filename = "checkpoint_" + PROJECT_NAME + ".ckpt"
+            dirpath = os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME),
+            every_n_epochs = 1,
+            save_top_k = -1
         ),
     ],
     max_epochs = int(MAX_EPOCH)
 )
 
 sims.train()
-
-#-----------------------------------------------------------------------------------------------
-#                                To predict labels on our data 
-#-----------------------------------------------------------------------------------------------
-
-# #-------------------------------------
-# # Load the model
-# #-------------------------------------
-# model = os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, "epoch=129-step=88010.ckpt")
-
-# #-------------------------------------
-# # Prediction
-# #-------------------------------------
-# sims = SIMS(weights_path = model)
-# cell_prediction = sims.predict(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".h5ad"))
-
-# #-------------------------------------
-# # Write prediction in a csv file
-# #-------------------------------------
-# matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".h5ad"))
-
-# cell_prediction.insert(0,"Cells", matrix.obs_names)
-
-# cell_prediction.to_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + "_prediction.csv"), index = False)
\ No newline at end of file
diff --git a/03_Script/to_anndata_file.py b/03_Script/to_anndata_file.py
index e277d61..1c40063 100644
--- a/03_Script/to_anndata_file.py
+++ b/03_Script/to_anndata_file.py
@@ -22,47 +22,48 @@
 OUTPUT_NAME_MATRIX = sys.argv[3]
 OUTPUT_NAME_REF_METADATA = sys.argv[4]
 CELLS_COLUMN = sys.argv[5]
+REFERENCE_NAME = sys.argv[6]
 
 STEP3 = "03_sims/"
 
-#-------------------------------------
-# Load reference matrix
-#-------------------------------------
+# #-------------------------------------
+# # Load reference matrix
+# #-------------------------------------
 
-reference_matrix = an.read_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_MATRIX + ".csv"))
+# reference_matrix = an.read_csv(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, OUTPUT_NAME_REF_MATRIX + ".csv"))
 
-#-------------------------------------
-# Write matrix as anndata
-#-------------------------------------
-reference_matrix.write_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_MATRIX + ".h5ad"))
+# #-------------------------------------
+# # Write matrix as anndata
+# #-------------------------------------
+# reference_matrix.write_h5ad(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, OUTPUT_NAME_REF_MATRIX + ".h5ad"))
 
-anndata_ref_matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_MATRIX + ".h5ad"))
-print("REFERENCE MATRIX")
-print("-------------------------------------------------------------------")
-print(anndata_ref_matrix)
-print("-------------------------------------------------------------------")
-print(anndata_ref_matrix.var_names)
-print("-------------------------------------------------------------------")
-print(anndata_ref_matrix.obs_names)
-print("-------------------------------------------------------------------")
+# anndata_ref_matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, OUTPUT_NAME_REF_MATRIX + ".h5ad"))
+# print("REFERENCE MATRIX")
+# print("-------------------------------------------------------------------")
+# print(anndata_ref_matrix)
+# print("-------------------------------------------------------------------")
+# print(anndata_ref_matrix.var_names)
+# print("-------------------------------------------------------------------")
+# print(anndata_ref_matrix.obs_names)
+# print("-------------------------------------------------------------------")
 
-#-------------------------------------
-# Write matrix as anndata with
-# metadata inside
-#-------------------------------------
-reference_metadata = pd.read_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_METADATA + ".csv"), index_col = CELLS_COLUMN )
-anndata_ref_matrix.obs = pd.concat([anndata_ref_matrix.obs, reference_metadata], axis=1, join='inner')
+# #-------------------------------------
+# # Write matrix as anndata with
+# # metadata inside
+# #-------------------------------------
+# reference_metadata = pd.read_csv(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, OUTPUT_NAME_REF_METADATA + ".csv"), index_col = CELLS_COLUMN )
+# anndata_ref_matrix.obs = pd.concat([anndata_ref_matrix.obs, reference_metadata], axis=1, join='inner')
 
-anndata_ref_matrix.write_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_MATRIX + "_join_metadata.h5ad"))
+# anndata_ref_matrix.write_h5ad(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, OUTPUT_NAME_REF_MATRIX + "_join_metadata.h5ad"))
 
-print("REFERENCE MATRIX AVEC METADATA")
-print("-------------------------------------------------------------------")
-print(anndata_ref_matrix)
-print("-------------------------------------------------------------------")
-print(anndata_ref_matrix.var_names)
-print("-------------------------------------------------------------------")
-print(anndata_ref_matrix.obs_names)
-print("-------------------------------------------------------------------")
+# print("REFERENCE MATRIX AVEC METADATA")
+# print("-------------------------------------------------------------------")
+# print(anndata_ref_matrix)
+# print("-------------------------------------------------------------------")
+# print(anndata_ref_matrix.var_names)
+# print("-------------------------------------------------------------------")
+# print(anndata_ref_matrix.obs_names)
+# print("-------------------------------------------------------------------")
 
 #-------------------------------------
 # Load the matrix to annotate
diff --git a/03_Script/umapCellAssignation.R b/03_Script/umapCellAssignation.R
index 7139e4b..79896c2 100644
--- a/03_Script/umapCellAssignation.R
+++ b/03_Script/umapCellAssignation.R
@@ -1,14 +1,17 @@
-#..............................................................
-# 
-#..............................................................
-
 #-------------------------------------
-# Library
+# This script add the cell labels in a seurat object
+# Unknown cell labels are add by the use of threshold 
+# (difference btw first pred and second pred) 
+# UMAP are produced :
+# - For each labels
+# - Global UMAP with sims annotation without unknown threshold
+# - Global UMAP with sims annotation with unknown threshold
 #-------------------------------------
+
 library(Seurat)
-library(ggplot2)
 library(data.table)
-library(dplyr)
+library(cowplot)
+library(ggplot2)
 
 #-------------------------------------
 # Path to files / folders
@@ -16,6 +19,8 @@ library(dplyr)
 DIRECTORY = getwd()
 TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]]
 OUTPUTDIR = file.path((DIRECTORY), "05_Output")
+RDS_ASSIGNATION = snakemake@params[["rds_assignation"]]
+PRED_FILTERED = snakemake@params[["pred_filtered"]]
 
 SAMPLE_ID = snakemake@params[["sample_id"]]
 
@@ -24,56 +29,92 @@ MATRIX = snakemake@params[["matrix"]]
 STEP2 = "02_seurat/"
 STEP3 = "03_sims/"
 
-#-------------------------------------
-# Load  the perdiction file and the 
-# seurat file
-#-------------------------------------
+# Load data
 data <- readRDS(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID,"_filtered_seurat_object.rds")))
-pred <- fread(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(MATRIX,"_prediction.csv")))
+pred <- fread(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(SAMPLE_ID, "_", MATRIX, "_prediction.csv")))
 
-#-------------------------------------
-# Add the first prediction in the 
-# seurat object
-#-------------------------------------
+# Assigning predictions and probabilities to data
 data$labels <- pred$first_pred
 Idents(data) <- data$labels
+diff_prob <- pred$first_prob - pred$second_prob
+data$diff_prob <- diff_prob
 
-#-------------------------------------
-# Create a list for each label
-#-------------------------------------
-liste <- unique(pred$first_pred)
+# Threshold to say if a cell is unknown or not
+threshold <- snakemake@params[["threshold"]]
+cells_threshold <- colnames(data)[data$diff_prob < threshold]
 
-#-------------------------------------
-# Create a list to stock each UMAP
-#-------------------------------------
-plots_list <- list()
+Idents(object = data, cells = cells_threshold) <- "unknown"
 
-#-------------------------------------
-# To create a UMAP for each label
-#-------------------------------------
-for (i in liste) {
-  cells <- WhichCells(data, idents = c(i))
-  umap <- DimPlot(data, reduction = "umap", group.by = "labels", cells.highlight= list(cells), cols.highlight = c("lightblue"), cols= "grey") + ggtitle(i)
-  plots_list[[i]] <- umap
+# List of unique identifiers
+list <- unique(Idents(data))
+# list <- c(list,"unknown")
+
+# Function for creating graphics
+createUMAPPlot <- function(data, id) {
+  cells <- WhichCells(data, idents = id)
+  umap <- DimPlot(data, reduction = "umap", group.by = "labels", 
+                  cells.highlight = list(cells), cols.highlight = c("brown2"), 
+                  cols = "grey") + ggtitle(id)
+  return(umap)
 }
 
+# Create a list of graphics
+plots_list <- lapply(list, createUMAPPlot, data = data)
+
+# Set number of graphics per page
+num_graphiques_par_page <- 1
+
+# Divide graphics into pages
+pages <- split(plots_list, ceiling(seq_along(plots_list) / num_graphiques_par_page))
+
 #-------------------------------------
 # Save each figure on a pdf
 #-------------------------------------
-num_graphiques_par_page <- 1
-pages <- split(plots_list, ceiling(seq_along(plots_list) / num_graphiques_par_page))
-pdf(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, "umap_per_labels.csv"))
-for (i in seq_along(pages)) {
-    for(plot in pages[[i]]){
-        print(plot)
-    }
+pdf(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(SAMPLE_ID, "_umap_per_labels.pdf")))
+for (page in pages) {
+  current_page <- plot_grid(plotlist = page, ncol = 1)
+  print(current_page)
 }
 dev.off()
 
+# Create PDF with global UMAP graph (with sims assignation)
+title <- paste("UMAP", SAMPLE_ID, sep=' ')
+pdf(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(SAMPLE_ID, "_umap_sims.pdf")), width=12, height=12)
+UMAP <- DimPlot(data, group.by = "labels", label.size = 6,repel = TRUE) + theme(legend.position = "bottom") + ggtitle(title)
+print(UMAP)
+dev.off()
+
+# Create PDF with UMAP graph of cells exceeding threshold (without NA cells)
+cells_threshold <- subset(data, subset = diff_prob >= threshold)
+title_threshold <- paste("UMAP filtered", SAMPLE_ID, sep=' ')
+
+pdf(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(SAMPLE_ID, "_umap_sims_threshold.pdf")), width=12, height=12)
+UMAP_threshold <- DimPlot(cells_threshold, group.by = "labels", repel = TRUE) + theme(legend.position = "bottom") + ggtitle(title_threshold)
+print(UMAP_threshold)
+dev.off()
+
 #-------------------------------------
-# Save the globa figure on a pdf
+# Update the label in the metadata with
+# unknown cells and save the RDS
 #-------------------------------------
-pdf(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, "umap.csv"))
-UMAP <- DimPlot(data, group.by = "labels", repel = TRUE) + theme(legend.position = "right")
-print(UMAP)
-dev.off()
\ No newline at end of file
+
+data$labels <- Idents(data)
+saveRDS(data, file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(SAMPLE_ID, "_", RDS_ASSIGNATION)))
+
+#-------------------------------------
+# create the predicted matrix with the
+# output file for the snakemake rule
+#-------------------------------------
+
+tab_pred_filtered <- paste(Cells(cells_threshold),cells_threshold$labels,cells_threshold$diff_prob)
+write.table(file=file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(SAMPLE_ID, "_", PRED_FILTERED)), tab_pred_filtered ,sep=",", quote=F, row.names=F, col.names=F)
+
+#-------------------------------------
+# create the output file for the snakemake rule
+#-------------------------------------
+
+TEXT_OUTPUT <- snakemake@output[["umapAssignation_output"]]
+
+output_file<-file(TEXT_OUTPUT)
+writeLines(c("Rules UMAP cell assignation finished"), output_file)
+close(output_file)
\ No newline at end of file
diff --git a/04_Workflow/SIMS.smk b/04_Workflow/SIMS.smk
index 7decb73..14a90d0 100644
--- a/04_Workflow/SIMS.smk
+++ b/04_Workflow/SIMS.smk
@@ -4,16 +4,16 @@
 # single-cell RNA-seq data
 #----------------------------------------
 
-rule sims:
+rule sims_training:
     input:
         anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"),
 
     output:
-        sims_output = expand(OUTPUTDIR + "03_sims/output_sims.txt"),
+        sims_training_output = expand(OUTPUTDIR + "03_sims/output_sims_training.txt"),
 
     params:
         sims_rule = config["rules"]["sims_rule"],
-        sample_id = config["reference_sims"]["sample_id"],
+        reference_name = config["reference_sims"]["reference_name"],
         reference_matrix = config["reference_sims"]["output_name_ref_matrix"],
         project_name = config["sims"]["project_name"],
         class_label = config["sims"]["class_label"],
@@ -31,32 +31,42 @@ rule sims:
         """
         if [ {params.sims_rule} = "FALSE" ]
         then
-            touch {output.sims_output}
+            touch {output.sims_training_output}
         else
-            pip install --use-pep517 git+https://github.com/braingeneers/SIMS.git
-            python 03_Script/sims.py {params.sample_id} {params.reference_matrix} {params.project_name} {params.class_label} \
+            pip install --use-pep517 git+https://github.com/braingeneers/SIMS.git@c3cc547e9223e979fdc70f9af2ae932b729da88e
+            python 03_Script/sims_training.py {params.reference_name} {params.reference_matrix} {params.project_name} {params.class_label} \
             {params.num_workers} {params.monitor} {params.patience} {params.max_epoch} {params.matrix} {params.key}
-            touch {output.sims_output}
+            touch {output.sims_training_output}
         fi
         """
 
+rule sims_prediction:
+    input:
+        sims_training_output = expand(OUTPUTDIR + "03_sims/output_sims_training.txt"),
 
+    output:
+        sims_prediction_output = expand(OUTPUTDIR + "03_sims/output_sims_prediction.txt"),
+        # sims_prediction_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_data_matrix_prediction.csv", sample_id = SAMPLE_ID), caption = REPORT + "label.rst", category = "03 sims"),
+        # sims_prediction_unknown_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_data_matrix_prediction_filtered.csv", sample_id = SAMPLE_ID), caption = REPORT + "label.rst", category = "03 sims"),
 
-    # Si ne marche pas de ma façon : A REFAIRE COMME CA !!!
-    # shell:
-    #     """
-    #     if [ {params.sims.rule} = "FALSE" ]
-    #     then
-    #         touch {output.sims.output}
-    #     else
-    #         export PATH="/home/lchabot/.local/bin:$PATH"
-    #         export PATH="/scratch/lchabot/BIOINFO_PROJECT/workflow_scrnaseq_P5-30/.local/bin:$PATH"
-    #         pip install --upgrade pip setuptools wheel
-    #         pip install protobuf==3.20.*
-    #         pip install --use-pep517 git+https://github.com/braingeneers/SIMS.git
-    #         python 03_Script/sims.py {params.reference_0} {params.metadata_0} {params.project_name} {params.class_label} {params.max_epoch}
-    #         touch {output.sims.output}
-    #     fi
-    #     """
-
-    # export Path => Chemin ou est localiser l'instalation python et pip
\ No newline at end of file
+    params:
+        sims_rule = config["rules"]["sims_rule"],
+        sample_id = config["reference_sims"]["sample_id"],
+        matrix = config["reference_sims"]["output_name_matrix"],
+        epoch = config["sims"]["epoch"],
+        reference_name = config["reference_sims"]["reference_name"],
+
+    message:
+        "Run SIMS for cell annotation"
+
+    shell:
+        """
+        if [ {params.sims_rule} = "FALSE" ]
+        then
+            touch {output.sims_prediction_output}
+        else
+            pip install --use-pep517 git+https://github.com/braingeneers/SIMS.git@c3cc547e9223e979fdc70f9af2ae932b729da88e
+            python 03_Script/sims_prediction.py {params.sample_id} {params.matrix} {params.epoch} {params.reference_name}
+            touch {output.sims_prediction_output}
+        fi
+        """
\ No newline at end of file
diff --git a/04_Workflow/knnor.smk b/04_Workflow/knnor.smk
index 0620545..dfdd0f7 100644
--- a/04_Workflow/knnor.smk
+++ b/04_Workflow/knnor.smk
@@ -32,7 +32,7 @@ rule knnor:
             touch {output.knnor_output}
         else
             
-            python 03_Script/imbalenced_datasets.py {params.sample_id} {params.matrix} {params.metadata} {params.class_label} "{params.max_oversampling}" {params.cells_column}
+            python 03_Script/imbalanced_datasets.py {params.sample_id} {params.matrix} {params.metadata} {params.class_label} "{params.max_oversampling}" {params.cells_column}
             touch {output.knnor_output}
         fi
         """
\ No newline at end of file
diff --git a/04_Workflow/prepare_data_sims.smk b/04_Workflow/prepare_data_sims.smk
index 051dc73..862b379 100644
--- a/04_Workflow/prepare_data_sims.smk
+++ b/04_Workflow/prepare_data_sims.smk
@@ -13,6 +13,7 @@ rule data_for_sims:
         # sims_rule = config["rules"]["sims_rule"],
         ### General
         r_script = config["reference_sims"]["r_script"],
+        reference_name = config["reference_sims"]["reference_name"],
         sample_id = config["reference_sims"]["sample_id"],
         output_name_ref_metadata = config["reference_sims"]["output_name_ref_metadata"],
         output_name_ref_matrix = config["reference_sims"]["output_name_ref_matrix"],
@@ -20,10 +21,7 @@ rule data_for_sims:
         norm_method = config["seurat"]["norm_method"],
         norm_scale_factor = config["seurat"]["norm_scale_factor"],
         ### Reference Arlotta
-        # arlotta_metadata = config["reference_sims"]["arlotta_metadata"],
-        # arlotta_matrix = config["reference_sims"]["arlotta_matrix"],
-        # arlotta_cells = config["reference_sims"]["arlotta_cells"],
-        # arlotta_features = config["reference_sims"]["arlotta_features"],
+        arlotta_metadata = config["reference_sims"]["arlotta_metadata"],
         developmental_time = config["reference_sims"]["developmental_time"],
         ### Reference Allen mouse whole cortex
         allen_metadata = config["reference_sims"]["allen_metadata"],
@@ -57,6 +55,7 @@ rule anndata_for_sims:
         output_name_matrix = config["reference_sims"]["output_name_matrix"],
         output_name_ref_metadata = config["reference_sims"]["output_name_ref_metadata"],
         cells_column = config["reference_sims"]["cells_column"],
+        reference_name = config["reference_sims"]["reference_name"],
 
     conda:
         CONTAINER + "preparation_sims.yaml"
@@ -70,7 +69,7 @@ rule anndata_for_sims:
         then
             touch {output.anndata_for_sims_output}
         else
-            python 03_Script/to_anndata_file.py {params.sample_id} {params.output_name_ref_matrix} {params.output_name_matrix} {params.output_name_ref_metadata} {params.cells_column}
+            python 03_Script/to_anndata_file.py {params.sample_id} {params.output_name_ref_matrix} {params.output_name_matrix} {params.output_name_ref_metadata} {params.cells_column} {params.reference_name}
             touch {output.anndata_for_sims_output}
         fi
         """
\ No newline at end of file
diff --git a/04_Workflow/remi.smk b/04_Workflow/remi.smk
new file mode 100644
index 0000000..5710b35
--- /dev/null
+++ b/04_Workflow/remi.smk
@@ -0,0 +1,18 @@
+rule remi:
+    input:
+        ref_cellranger_output = expand(OUTPUTDIR + "01_cellranger/ref_cellranger_output.txt"),
+
+    output: 
+        remi_output = expand(OUTPUTDIR + "remi_output.txt"),
+
+    conda:
+        CONTAINER + "seurat.yaml"
+
+    message:
+        "TEST REMI"
+
+    shell:
+        """
+        Rscript 03_Script/remi.R
+        touch {output.remi_output}
+        """
\ No newline at end of file
diff --git a/04_Workflow/umapCellAssignation.smk b/04_Workflow/umapCellAssignation.smk
index 11975ce..e233e31 100644
--- a/04_Workflow/umapCellAssignation.smk
+++ b/04_Workflow/umapCellAssignation.smk
@@ -1,19 +1,25 @@
 rule umapCellAssignation:
     input:
-        sims_output = expand(OUTPUTDIR + "03_sims/output_sims.txt"),
+        sims_prediction_output = expand(OUTPUTDIR + "03_sims/output_sims_prediction.txt"),
 
     output:
         umapAssignation_output = expand(OUTPUTDIR + "03_sims/umapAssignation_output.txt"),
+        umap_sims_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims.pdf", sample_id = SAMPLE_ID), caption = REPORT + "umap_sims.rst", category = "03 sims"),
+        umap_sims_threshold_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims_threshold.pdf", sample_id = SAMPLE_ID), caption = REPORT + "umap_sims.rst", category = "03 sims"),
+        umap_per_labels_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_per_labels.pdf", sample_id = SAMPLE_ID), caption = REPORT + "umap_sims.rst", category = "03 sims"),
 
     params:
         sample_id = config["reference_sims"]["sample_id"],
         matrix = config["reference_sims"]["output_name_matrix"],
-    
-    conda :
+        threshold = config["sims"]["threshold"],
+        rds_assignation = config["sims"]["rds_assignation"],
+        pred_filtered = config["sims"]["pred_filtered"],
+
+    conda:
         CONTAINER + "seurat.yaml"
 
     message:
         "Cells assignation on UMAP"
 
     script:
-        SCRIPTDIR + "umapCellAssignation.R"
+        SCRIPTDIR + "umapCellAssignation.R"
\ No newline at end of file
diff --git a/Snakefile b/Snakefile
index 737300a..e082ef0 100644
--- a/Snakefile
+++ b/Snakefile
@@ -59,74 +59,43 @@ rule all:
 		# fastqc_output = expand(OUTPUTDIR + "00_clean/fastqc_output.txt"),
 		### multiqc ###
 		# multiqc_output = expand(OUTPUTDIR + "00_clean/multiqc_output.txt"),
+		# raw_multiqc_html = expand(OUTPUTDIR + "00_clean/raw_multiqc.html"),
 		### ReferenceEnhancer to generate a scRNA-seq optimized transcriptomic reference ###
 		### To improve with manual curration of the file overlapping_gene_list
 		# reference_enhancer_output = expand(OUTPUTDIR + "01_cellranger/reference_enhancer_output.txt"),
-		### reference for cellranger ###
+		# ### reference for cellranger ###
 		# ref_cellranger_output = expand(OUTPUTDIR + "01_cellranger/ref_cellranger_output.txt"),
 		### Cell Multiplexing with cellranger multi ###
 		# cellranger_output = expand(OUTPUTDIR + "01_cellranger/cellranger_output.txt"),
-		## If you need to aggregate your data
+		# cellranger_html = expand(OUTPUTDIR + "01_cellranger/{sample_id}/{sample_id}_web_summary.html", sample_id = SAMPLE_ID),
+		### If you need to aggregate your data
 		# aggrcsv = ROOTDIR + "/aggregation.csv",
 		# out_aggregate = expand(OUTPUTDIR + "01_cellranger/{agrr}/outs/aggregate_web_summary.html", agrr=AGRR),
-		## If demuxiplexing is used
+		### If demuxiplexing is used
 		# bcf = OUTPUTDIR + "01_cellranger/Mix_MM_lines/outs/demuxlet_Mix_MM_lines.bcf",
 		# demuxlet = OUTPUTDIR + "01_cellranger/Mix_MM_lines/outs/demuxlet_Mix_MM_lines.best",
 		# tabdemuxlet = OUTPUTDIR + "01_cellranger/Mix_MM_lines/outs/demuxlet_Mix_MM_lines.tsv",
-		# Seurat
+		### Seurat
 		# seurat_report = expand(OUTPUTDIR + "02_seurat/{sample_id}/{sample_id}_seurat_report.html", sample_id = SAMPLE_ID),
+		### Remi data
+		# remi_output = expand(OUTPUTDIR + "remi_output.txt"),
 		### Prepare data for SIMS
 		# data_for_sims_output = expand(OUTPUTDIR + "03_sims/data_for_sims_output.txt"),
-		anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"),
+		# anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"),
 		### KNNOR
 		# knnor_output = expand(OUTPUTDIR +"03_sims/knnor_output.txt"),
 		# anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"),
 		### SIMS
-		# sims_output = expand(OUTPUTDIR + "03_sims/output_sims.txt"),
-		# umapAssignation_output = expand(OUTPUTDIR + "03_sims/umapAssignation_output.txt"),
-		################### REMI
-		# remi_output = expand(OUTPUTDIR + "remi_output.txt"),
+		# sims_training_output = expand(OUTPUTDIR + "03_sims/output_sims_training.txt"),
+		# sims_prediction_output = expand(OUTPUTDIR + "03_sims/output_sims_prediction.txt"),
+		# sims_prediction_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_data_matrix_prediction.csv", sample_id = SAMPLE_ID),
+		# sims_prediction_unknown_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_data_matrix_prediction_filtered.csv", sample_id = SAMPLE_ID),
+		### Representing cellular assignation on UMAP
+		umapAssignation_output = expand(OUTPUTDIR + "03_sims/umapAssignation_output.txt"),
+		# umap_sims_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims.pdf", sample_id = SAMPLE_ID),
+		# umap_sims_threshold_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims_threshold.pdf", sample_id = SAMPLE_ID),
+		# umap_per_labels_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_per_labels.pdf", sample_id = SAMPLE_ID),
 
-
-
-		## Differential expression analyses
-		# violinplot = expand(OUTPUTDIR + "03_diffexp/violin_plot/{features}_violin_plot.pdf", features=FEATURES),
-		# umapfeature = expand(OUTPUTDIR + "03_diffexp/umap_plot/{features}_umapfeature_plot.pdf", features=FEATURES),
-		# tsnefeature = expand(OUTPUTDIR + "03_diffexp/tsne_plot/{features}_tsnefeature_plot.pdf", features=FEATURES),
-		# ridgefeature = expand(OUTPUTDIR + "03_diffexp/ridge_plot/{features}_ridgefeature_plot.pdf", features=FEATURES),
-		# heatmapfeature = OUTPUTDIR + "03_diffexp/heatmap/heatmapfeature.pdf",
-		# diffexp_report = OUTPUTDIR + "03_diffexp/" + NPROJ + "_diffexp_report.html",
-		# defile_allcells = expand(OUTPUTDIR + "03_diffexp/differential_expression_features/{wt}_vs_AllCellMarker_DE.csv", wt=WT),
-		# defile = expand(OUTPUTDIR + "03_diffexp/differential_expression_features/{wt}_vs_{cellmarker}_DE.csv", wt=WT, cellmarker=CELLMARKER),
-		# volcano_allcluster = expand(OUTPUTDIR + "03_diffexp/volcanoplot/volcano_{wt}_vs_AllCellMarker.pdf", wt=WT),
-		# volcano = expand(OUTPUTDIR + "03_diffexp/volcanoplot/volcano_{wt}_vs_{cellmarker}.pdf", wt=WT, cellmarker=CELLMARKER),
-		# sign_up_allcell = expand(OUTPUTDIR + "03_diffexp/de_significant/{wt}_vs_AllCellMarker_signif-up-regulated.txt", wt=WT),
-		# sign_up = expand(OUTPUTDIR + "03_diffexp/de_significant/{wt}_vs_{cellmarker}_signif-up-regulated.txt", wt=WT, cellmarker=CELLMARKER),
-		# sign_down_allcell = expand(OUTPUTDIR + "03_diffexp/de_significant/{wt}_vs_AllCellMarker_signif-down-regulated.txt", wt=WT),
-		# sign_down = expand(OUTPUTDIR + "03_diffexp/de_significant/{wt}_vs_{cellmarker}_signif-down-regulated.txt", wt=WT, cellmarker=CELLMARKER),
-		### Differential expression analyses inside each cluster after subset (by gene expression)
-		# diffexp_subset_report = OUTPUTDIR + "04_diffexp_subset/" + NPROJ + "_diffexp_subset_report.html",
-		# defile_subset = expand(OUTPUTDIR + "04_diffexp_subset/differential_expression/{cluster}_subcluster_DE.csv", cluster=CLUSTER),
-		# volcano_subset = expand(OUTPUTDIR + "04_diffexp_subset/volcanoplot/volcano_{cluster}.pdf", cluster=CLUSTER),
-		# sign_up_subset = expand(OUTPUTDIR + "04_diffexp_subset/de_significant/{cluster}_signif-up-regulated.txt", cluster=CLUSTER),
-		# sign_down_subset = expand(OUTPUTDIR + "04_diffexp_subset/de_significant/{cluster}_signif-down-regulated.txt", cluster=CLUSTER),
-		# violinplot_subset = expand(OUTPUTDIR + "04_diffexp_subset/violin_plot/{features}_violin_plot.pdf", features=FEATURES),
-		### Compress result files and figures ###
-		# data_matrix_tar = OUTPUTDIR + "02_seurat/data_matrix.tar.gz",
-		# violinplot_tar = OUTPUTDIR + "03_diffexp/violin_plot.tar.gz",
-		# umapfeature_tar = OUTPUTDIR + "03_diffexp/umapfeature_plot.tar.gz",
-		# tsnefeature_tar = OUTPUTDIR + "03_diffexp/tsnefeature_plot.tar.gz",
-		# ridgefeature_tar = OUTPUTDIR + "03_diffexp/ridgefeature_plot.tar.gz",
-		# heatmapfeature_tar = OUTPUTDIR + "03_diffexp/heatmapfeature.tar.gz",
-		# defile_tar = OUTPUTDIR + "03_diffexp/differencial_expression_tests.tar.gz",
-		# volcano_tar = OUTPUTDIR + "03_diffexp/volcano_plot.tar.gz",
-		# sign_up_down_tar = OUTPUTDIR + "03_diffexp/up_down_regulated_genes_list.tar.gz",
-		# defile_subset_tar = OUTPUTDIR + "04_diffexp_subset/differencial_expression_tests.tar.gz",
-		# volcano_subset_tar = OUTPUTDIR + "04_diffexp_subset/volcano_plot.tar.gz",
-		# sign_up_subset_tar = OUTPUTDIR + "04_diffexp_subset/up_down_regulated_genes_list.tar.gz",
-		# violinplot_subset_tar = OUTPUTDIR + "04_diffexp_subset/violin_plot.tar.gz",
-		# clean = OUTPUTDIR + "03_diffexp/clean.txt",
-  
 # ----------------------------------------------
 # Load rules 
 # ----------------------------------------------
@@ -135,28 +104,22 @@ run_demultiplex = config["run_demultiplex"]
 run_multiplex = config["run_multiplex"]
 
 if run_demultiplex:
-	include: ENVDIR + "clean_demultiplex.smk"
-	include: ENVDIR + "cellranger_demultiplex.smk"
-	include: ENVDIR + "seurat.smk"
-	include: ENVDIR + "prepare_data_sims.smk"
-	include: ENVDIR + "SIMS.smk"
-	include: ENVDIR + "knnor.smk"
-	include: ENVDIR + "remi.smk"
+	# include: ENVDIR + "clean_demultiplex.smk"
+	# include: ENVDIR + "cellranger_demultiplex.smk"
+	# include: ENVDIR + "seurat.smk"
+	# include: ENVDIR + "remi.smk"
+	# include: ENVDIR + "prepare_data_sims.smk"
+	# include: ENVDIR + "SIMS.smk"
+	# include: ENVDIR + "knnor.smk"
 	include: ENVDIR + "umapCellAssignation.smk"
 
 if run_multiplex:	
-	include: ENVDIR + "clean.smk"
-	include: ENVDIR + "cellranger.smk"
-	include: ENVDIR + "seurat.smk"
-	include: ENVDIR + "prepare_data_sims.smk"
-	include: ENVDIR + "SIMS.smk"
-	include: ENVDIR + "knnor.smk"
-	include: ENVDIR + "remi.smk"
+	# include: ENVDIR + "clean.smk"
+	# include: ENVDIR + "cellranger.smk"
+	# include: ENVDIR + "seurat.smk"
+	# include: ENVDIR + "remi.smk"
+	# include: ENVDIR + "prepare_data_sims.smk"
+	# include: ENVDIR + "SIMS.smk"
+	# include: ENVDIR + "knnor.smk"
 	include: ENVDIR + "umapCellAssignation.smk"
 
-
-# include: ENVDIR + "demuxlet.smk"
-# include: ENVDIR + "seurat.smk"
-# include: ENVDIR + "diffexp.smk"
-# include: ENVDIR + "diffexp_subset.smk"
-# include: ENVDIR + "report.smk"
diff --git a/config.yaml b/config.yaml
index 060cdcd..5f00fd9 100644
--- a/config.yaml
+++ b/config.yaml
@@ -9,11 +9,11 @@ sample: sample.tsv
 #-------------------------------------
 
 # If you run the workflow with slurm (sbatch run_slurm.sh)
-run_slurm: false
+run_slurm: true
 # If you data are demultiplexed = true
-run_demultiplex: false
+run_demultiplex: true
 # If your data are not demultiplexed = true
-run_multiplex: true
+run_multiplex: false
 
 rules:
     # Optimizing and assembling genome annotations for 3’ single-cell RNA-sequencing analysis
@@ -21,7 +21,9 @@ rules:
     # Build the reference with cellranger
     ref_cellranger_rule: "FALSE"
     # Run SIMS to anotate cells
-    sims_rule: "FALSE"
+    sims_rule: "TRUE"
+    # Run Knnor to help with imbalanced datasets
+    knnor_rule: "TRUE"
 
 run:
     # FASTQ file naming convention : [Sample name]_S1_L00[Lane Number]_[Read Type]_001.fastq.gz
@@ -45,12 +47,12 @@ fastq:
 
 # Genome and gff link to create the pre-built Cell Ranger reference
 reference:
-    link_ref_fasta: "https://ftp.ensembl.org/pub/release-98/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz"
-    ref_fasta: "Mus_musculus.GRCm38.dna.primary_assembly.fa"
-    link_ref_gtf: "https://storage.googleapis.com/generecovery/mouse_mm10_optimized_annotation_v2.gtf.gz"
+    link_ref_fasta: "https://ftp.ensembl.org/pub/release-109/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.primary_assembly.fa.gz"
+    ref_fasta: "Mus_musculus.GRCm39.dna.primary_assembly.fa"
+    link_ref_gtf: "https://ftp.ensembl.org/pub/release-109/gtf/mus_musculus/Mus_musculus.GRCm39.109.gtf.gz"
     ref_gtf: "Mus_musculus.GRCm39.109.gtf"
     # Name for the file of the cellranger reference
-    ref_cellranger: "refdata-cellranger-mm10"
+    ref_cellranger: "refdata-cellranger-mm39"
     # Optional reference version string to include with
     ref_version: "1.0.0"
 
@@ -62,7 +64,7 @@ reference:
 # Parameter for cellranger multi
 multi:
     # A unique run ID string: e.g. sample345 that is also the output folder name. Cannot be more than 64 characters.
-    id: "S004647_to_S004650"
+    id: "E14"
     # Path to config CSV file with input libraries and analysis parameters.
     config: "multi_config.csv"
 
@@ -94,7 +96,7 @@ seurat:
     # Number of cells above which use ggplot instead of interactive plotly
     plot_raster_nbcells_threshold : 10000
     # CellRanger count path for each sample for Read10X
-    cell_ranger_count_path: "/count/sample_filtered_feature_bc_matrix/"
+    cell_ranger_count_path: "/filtered_feature_bc_matrix/"
 
     ##### Filtering 
     # Switch from QC exploration mode and QC filtering
@@ -108,18 +110,18 @@ seurat:
     min_features: 200
     # Cells with number of UMIs outside the range will be excluded
     # Can be a list of UMIs but MUST be on the same order than sample.tsv id.
-    filter_umi_min: "0,0,0,0"
-    filter_umi_max: "55000,42000,42000,40000"
+    filter_umi_min: "0"
+    filter_umi_max: "40000"
     # Filter cells that have unique feature counts over/less n
     # Can be a list of feature counts but MUST be on the same order than sample.tsv id.
-    filter_feature_min: "1700,1700,1400,1400"
-    filter_feature_max: "7900,7500,7700,7200"
+    filter_feature_min: "0"
+    filter_feature_max: "7500"
     # Filter cells that have > n% mitochondrial counts
-    filter_percent_mt: "10,10,8,8"
+    filter_percent_mt: "17"
     # Filter cells that have < n% mitochondrial counts
-    filter_percent_mt_min: "0.2,0.2,0.2,0.2"
+    filter_percent_mt_min: "0"
     # Filter cells that have < n% ribosomal counts
-    filter_percent_rb: "0,0,0,0"
+    filter_percent_rb: "0"
     # Pattern to match features against => mitochondrial genes | for mouse : "^mt-" / for human : "^MT-"
     pattern_mt: "^mt-"
     # Pattern to match features against => ribosomal genes
@@ -166,72 +168,80 @@ seurat:
 # Parameters to build correct reference(s) for sims
 reference_sims:
     ##### General #####
-    #  Path of the script you want to execute (Corespond to the reference you want to use)
-    # For now : arlotta.R, allen_mouse_whole_cortex.R, mousegastrulation.R
-    r_script: "mousegastrulation.R"
-    #
-    output_name_ref_metadata:  "mousegastrulation_reference_metadata"
-    #
-    output_name_ref_matrix: "mousegastrulation_reference_matrix"
-    #
-    output_name_matrix: "mousegastrulation_matrix"
+    #  Name of the script you want to execute (Corespond to the reference you want to use)
+    # For now : arlotta.R, allen_mouse_whole_cortex.R
+    r_script: "allen_mouse_whole_cortex.R"
+    # Output folder name for the reference
+    reference_name: "Allen"
+    # Output file name for the reference metadata
+    output_name_ref_metadata: "metadata"
+    # Output file name for the reference matrix
+    output_name_ref_matrix: "reference_matrix"
+    # Output file name for the matrix to annotate
+    output_name_matrix: "data_matrix"
+    # Name of the column that contain the cells
+    # allen mouse whole cortex : "sample_name", arlotta : "NEW_NAME"
+    cells_column: "sample_name"
 
     ##### Our Matrix to annotate #####
-    # The name of the folder of the sample you want to annotate (Folder in the Seurat folder)
-    sample_id: "72h"
+    # The name of the folder of the sample you want to annotate (One of the output in the seurat folder)
+    sample_id: "P30_WT"
 
     ##### Paola Arlotta #####
     # Path for the arlotta metadata
     arlotta_metadata: "arlotta/metaData_scDevSC.txt"
-    #
-    arlotta_matrix: "arlotta/gene_sorted-matrix.mtx.gz"
-    #
-    arlotta_cells: "arlotta/barcodes.tsv"
-    #
-    arlotta_features: "arlotta/genes.tsv"
     # Pattern that correspond to the mouse age that we want to keep for the training of our data
-    # It's possible to select multiple ones if separate by a coma
-    # In this reference : E10, E11, E12, E13, E14, E15, E16, E17, E18, P1, P4
-    pattern_to_keep: "P4"
-
+    # It's possible to select multiple ones if separate by a coma, exemple: ["E10", "E11", "P4"]
+    # In this reference : E10, E11, E12, E13, E14, E15, E16, E17, E18_S1, E18_S3, P1, P1_S1, P4
+    developmental_time: ["E10", "E11", "E12", "E13", "E14", "E15", "E16", "E17", "E18_S1", "E18_S3", "P1", "P1_S1", "P4"]
+    
     ##### Allen_mouse_whole_cortex #####
-    #
-    allen_metadata: "allen/"
-    #
-    allen_matrix: "allen/"
-    #
-    allen_genes: "allen/"
-    #
-    allen_cells: "allen/"
-
-    ##### mousegastrulation #####
-    # Samples from the reference as described in EmbryoAtlasData object (for lenne project : c(1:20,23:37)
-    # mousegastrulation_samples: "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37"
-    mousegastrulation_samples: "1,5"
+    # Path for the arlotta metadata
+    allen_metadata: "allen/metadata.csv"
+    # Path for the allen matrix (matrix + genes + cells)
+    allen_matrix: "allen/expression_matrix.hdf5"
 
+# An oversampling technique for imalanced datasets
+knnor:
+    # Name of the column in your metadata you want to oversample
+    class_label: "subclass_label"
+    # This method will increase the minority labels up to the value of the majority label.
+    # If you want to reduce the maximum number of new labels created, you need to remove the majority labels.
+    # If multiple labels to delete, put a comma between them (with NO SPACE). (You can also keep them all by not writting)
+    max_oversampling : 
+        # "Apical progenitors,UL CPN,Migrating neurons,Interneurons,Intermediate progenitors,Layer 4,CThPN,DL CPN,SCPN,Astrocytes,Immature neurons,Oligodendrocytes,Cycling glial cells,Cajal Retzius cells,DL_CPN_1,NP,VLMC"
 sims:
     ##### General
     # Name of the column in your metadata on which you wish to train your model.
-    class_label: "New_cellType"
-    # 
-    # https://lightning.ai/docs/pytorch/stable/advanced/speed.html
-    num_workers: 12
-    #
-    max_epoch: 100
+    # Aallen mouse whole cortex : "subclass_label", arlotta : "New_cellType"
+    class_label: "celltype_label"
+    # Your personnal key to connect to your wandb account
+    key: "5f86a9267b08ff77c21a8872d763cb76048dfaca"
+    # Number of cores (Put the same in the run_slurm.sh if necessary)
+    num_workers: 18
+    # Maximum number of epoch to train our model (Will stop after, start counting from 0)
+    max_epoch: 500
 
     ##### Logger
-    # Name of the project (Will be the name of the folder where the model checkpoint will be saved)
-    project_name: "P5_WT_test"
+    # Name of the project (Will be the name of the run and checkpoint)
+    project_name: "Allen_norm_slurm"
 
     ##### Early Stopping
     # The validation metric you wish to monitor for early stop.
     monitor: "val_loss"
-    # The number of epochs during which the validation metric may not improve before early shutdown is triggered.
-    # 10% of the number of epoch to beggin with
+    # The number of epochs during which the validation metric (monitor) may not improve before early shutdown is triggered. (To prevent overtraining)
     patience: 10
 
-    ##### Model Checkpoint
-    #
+    # Epoch file from wandb to chosse for the cell assignation
+    epoch: "epoch=47-step=86784.ckpt"
+
+    ##### Cells annotation
+    # Threshold of the difference between the first and second score of the prediction to say if a cell is unknown or not
+    threshold: 0.1
+    # Seurat object with sims cell_assignation
+    rds_assignation: filtered_assigned_seurat_object.rds
+    # Matrix with cell predtion after filtration with the threshold
+    pred_filtered: data_matrix_prediction_filtered.csv
 
 # Differentially expressed features analyses
 diffexp:
@@ -256,6 +266,3 @@ diffexpsubset:
     genetoremove: ""
     # Cluster on the seurat object (choose cluster with high number of the marker gene of interest)
     cluster: "Normal_Melanocytes,M15,M27,A375,MM057,MM087,MM001,MM074"
-
-
-    
diff --git a/feature_config.csv b/feature_config.csv
new file mode 100644
index 0000000..afc355e
--- /dev/null
+++ b/feature_config.csv
@@ -0,0 +1,16 @@
+id,name,read,pattern,sequence,feature_type
+HTO-1,HTO-1,R2,^(BC),TTCCTGCCATTACTA,Antibody Capture
+HTO-2,HTO-2,R2,^(BC),CCGTACCTCATTGTT,Antibody Capture
+HTO-3,HTO-3,R2,^(BC),GGTAGATGTCCTCAG,Antibody Capture
+HTO-4,HTO-4,R2,^(BC),TGGTGTCATTCTTGA,Antibody Capture
+HTO-5,HTO-5,R2,^(BC),ATGATGAACAGCCAG,Antibody Capture
+HTO-6,HTO-6,R2,^(BC),CTCGAACGCTTATCG,Antibody Capture
+HTO-7,HTO-7,R2,^(BC),CTTATCACCGCTCAA,Antibody Capture
+HTO-8,HTO-8,R2,^(BC),TGACGCCGTTGTTGT,Antibody Capture
+HTO-9,HTO-9,R2,^(BC),GCCTAGTATGATCCA,Antibody Capture
+HTO-10,HTO-10,R2,^(BC),AGTCACAGTATTCCA,Antibody Capture
+HTO-11,HTO-11,R2,^(BC),TTGGCCTTTGTATCG,Antibody Capture
+HTO-12,HTO-12,R2,^(BC),AACGCCAGTATGAAC,Antibody Capture
+HTO-13,HTO-13,R2,^(BC),GCTTCCGTATATCTG,Antibody Capture
+HTO-14,HTO-14,R2,^(BC),AGCAACTCACTCTTT,Antibody Capture
+HTO-15,HTO-15,R2,^(BC),CACTAGAGCTTGGAT,Antibody Capture
\ No newline at end of file
diff --git a/multi_config.csv b/multi_config.csv
index ef14fd5..593db0a 100644
--- a/multi_config.csv
+++ b/multi_config.csv
@@ -1,15 +1,18 @@
 [gene-expression]
-reference,/scratch/tvannier/ibdm_lenne_scrnaseq_gastruloids/01_Reference/mouse_mm10_optimized_reference_v2
-expect-cells,10000
+reference, /scratch/lchabot/scRNAseq_analysis_workflow/01_Reference/
+expect-cells,30000
 include-introns,true
 
+[feature]
+reference, /scratch/lchabot/scRNAseq_analysis_workflow/feature_config.csv
+
 [libraries]
 fastq_id,fastqs,feature_types
-S004647_to_S004650_1,/scratch/tvannier/ibdm_lenne_scrnaseq_gastruloids/00_RawData/S004647_to_S004650_1,Gene Expression
-S004647_to_S004650_2,/scratch/tvannier/ibdm_lenne_scrnaseq_gastruloids/00_RawData/S004647_to_S004650_2,Multiplexing Capture
+NGc-3-CTX,/scratch/lchabot/scRNAseq_analysis_workflow/00_RawData/NGc_3_CTX,Gene Expression
+NGh-HTO-CTX,/scratch/lchabot/scRNAseq_analysis_workflow/00_RawData/NGh_HTO_CTX,Antibody Capture
 
 [samples]
-sample_id,cmo_ids,description
+sample_id
 72h,CMO301,72h
 80h,CMO302,80h
 86h,CMO303,86h
diff --git a/run_slurm.sh b/run_slurm.sh
index 910cc71..d93aadd 100644
--- a/run_slurm.sh
+++ b/run_slurm.sh
@@ -1,10 +1,10 @@
 #!/bin/sh
 
-#SBATCH -J allen_SIMS
-#SBATCH -p kepler
+#SBATCH -J P30_WT_umap
+#SBATCH -p skylake
 #SBATCH --ntasks-per-node 18
 #SBATCH -A b324
-#SBATCH -t 20:00:00
+#SBATCH -t 01:00:00
 #SBATCH -o ./%N.%x.out
 #SBATCH -e ./%N.%x.errs
 
@@ -24,7 +24,7 @@ pip install snakemake==6.3.0
 
 # For the worflow in general
 snakemake --unlock
-# snakemake --snakefile Snakefile --use-singularity --use-conda --conda-frontend conda --conda-not-block-search-path-envvars --singularity-args="-B /scratch/$SLURM_JOB_USER/scRNAseq_analysis_workflow/" --cores 18
+snakemake --snakefile Snakefile --use-singularity --use-conda --conda-frontend conda --conda-not-block-search-path-envvars --singularity-args="-B /scratch/$SLURM_JOB_USER/scRNAseq_analysis_workflow/" --cores 16
 
-### For sims
-snakemake --snakefile Snakefile --use-singularity --singularity-args="-B /scratch/$SLURM_JOB_USER/scRNAseq_analysis_workflow/" --cores 18
\ No newline at end of file
+# ### For sims
+# snakemake --snakefile Snakefile --use-singularity --singularity-args="-B /scratch/$SLURM_JOB_USER/scRNAseq_analysis_workflow/" --cores 18
\ No newline at end of file

From d3eb412b2641dd7f909f5ef514f7cce12dd53a11 Mon Sep 17 00:00:00 2001
From: LeaChabot <lea-chabot@orange.fr>
Date: Fri, 3 May 2024 15:51:28 +0200
Subject: [PATCH 12/15] htoDemux_seurat

---
 03_Script/01_prepare_data.R            |  14 +-
 03_Script/03_cell_heterogeneity_PCA.R  |  26 ++-
 03_Script/06_markers.R                 | 209 +++++++++++++++++++++++++
 03_Script/arlotta.R                    |  26 +--
 03_Script/seurat.Rmd                   | 184 +++++++++++++++++++++-
 03_Script/seurat_reports_compilation.R |   4 +
 03_Script/to_anndata_file.py           |   6 +-
 04_Workflow/seurat.smk                 |   2 +
 04_Workflow/umapCellAssignation.smk    |   6 +-
 Snakefile                              |  14 +-
 config.yaml                            |  42 ++---
 sample.tsv                             |   8 +-
 12 files changed, 483 insertions(+), 58 deletions(-)
 create mode 100644 03_Script/06_markers.R

diff --git a/03_Script/01_prepare_data.R b/03_Script/01_prepare_data.R
index facba7d..38100a8 100644
--- a/03_Script/01_prepare_data.R
+++ b/03_Script/01_prepare_data.R
@@ -25,21 +25,29 @@ source(file.path(dirname(DIRECTORY), "03_Script/00_general_deps.R"))
 in_data_dir = file.path( OUTPUTDIR, STEP1)
 samples <- dir(in_data_dir)
 
-
-
 MIN_CELLS <- as.numeric(MIN_CELLS)
 MIN_FEATURES <- as.numeric(MIN_FEATURES)
+HTO <- unlist(strsplit(HTO, ","))
 
 #........................................
 # Read data
 #........................................
 seurat_data <- Read10X(paste0(in_data_dir,SAMPLE_ID,CELL_RANGER_COUNT_PATH))
+
 # Need a condition if data are depultiplexed with cell ranger multi
 if( RUN_DEMULTIPLEX == TRUE){
         seurat_obj <- CreateSeuratObject(counts = seurat_data, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID)
-} else {
+} else if (as.logical(MULTIPLEX) == TRUE) {
+        seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) 
+        hto <- seurat_data$`Antibody Capture`
+        hto <- hto[c(HTO), ]
+        seurat_obj[["HTO"]] <- CreateAssayObject(counts = hto)
+        seurat_obj <- subset(x = seurat_obj, subset = nCount_HTO > 0)
+        seurat_obj <- NormalizeData(seurat_obj, assay = "HTO", normalization.method = "CLR")       
+} else{
         seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) 
 }
+
 seurat_obj$SampleID <- SAMPLE_ID
 
 # ..........................................................................................................
diff --git a/03_Script/03_cell_heterogeneity_PCA.R b/03_Script/03_cell_heterogeneity_PCA.R
index 223ae86..1dcd73c 100644
--- a/03_Script/03_cell_heterogeneity_PCA.R
+++ b/03_Script/03_cell_heterogeneity_PCA.R
@@ -3,12 +3,31 @@
 #  of cells using dimension reduction techniques
 # ########################################################
 
-## @knitr ldr
-
 # Scales and centers features in the dataset
 seurat_obj = ScaleData( object = seurat_obj,
                    verbose = .VERBOSE)
 
+# ..........................................................................................................
+## @knitr HTODemux
+# ..........................................................................................................
+
+# Demultiplexing data 
+seurat_obj <- HTODemux(seurat_obj, assay = "HTO", positive.quantile = 0.99)
+cat("<br><br>Removed cells after filtering Doublet et Negative:", sum(seurat_obj[["hash.ID"]] == "Negative" | seurat_obj[["hash.ID"]] == "Doublet"));
+cat("<br><br>Remaining cells after filtering:", sum(seurat_obj$hash.ID %in% HTO));
+cat("\n<br>\n");
+seurat_obj <- subset(seurat_obj, idents = c("Doublet","Negative"), invert = TRUE) ### Keep only the Singlet cells (Remove Negative and Doublet)
+
+seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = "mean.var.plot")
+seurat_obj <- ScaleData(seurat_obj, features = VariableFeatures(seurat_obj))
+seurat_obj <- RunPCA(seurat_obj)
+seurat_obj <- RunUMAP(seurat_obj, dims = 1:25)
+print(DimPlot(seurat_obj))
+
+# ..........................................................................................................
+## @knitr ldr
+# ..........................................................................................................
+
 # Perform linear dimensional reduction
 seurat_obj <- RunPCA(seurat_obj, features = VariableFeatures(object = seurat_obj))
 VizDimLoadings(seurat_obj, dims = 1:DIMS, reduction = "pca")
@@ -16,7 +35,10 @@ VizDimLoadings(seurat_obj, dims = 1:DIMS, reduction = "pca")
 # DimPlot(seurat_obj, reduction = "pca", group.by = "categorie")
 DimHeatmap(seurat_obj, dims = 1:DIMS, cells = 500, balanced = TRUE)
 
+# ..........................................................................................................
 ## @knitr elbowplot
+# ..........................................................................................................
+
 ElbowPlot(seurat_obj, ndims = DIMS)
 
 # ..........................................................................................................
diff --git a/03_Script/06_markers.R b/03_Script/06_markers.R
new file mode 100644
index 0000000..40fca2f
--- /dev/null
+++ b/03_Script/06_markers.R
@@ -0,0 +1,209 @@
+# ..........................................................................................................
+## @knitr GlutN
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Neurod2","Sv2b", "Nrn1", "Tbr1", "Satb2"))
+)
+
+# ..........................................................................................................
+## @knitr Cajal_Retzius_cells_markers
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Reln")) # Ndnf ?
+)
+
+# ..........................................................................................................
+## @knitr L2_3_IT
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Cux2","Stard8", "Lpl", "Prex1"))
+)
+
+# ..........................................................................................................
+## @knitr L4_IT_SSC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Rorb","Rspol", "Kcnab3"))
+)
+
+# ..........................................................................................................
+## @knitr L4_IT_PC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Rorb","S100b", "Ccdc3", "Scna7a"))
+)
+
+# ..........................................................................................................
+## @knitr L5_IT
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Hs3st2","Lvzl4", "Etv6"))
+)
+
+# ..........................................................................................................
+## @knitr L5_PT
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Fezf2","Oma1"))
+)
+
+# ..........................................................................................................
+## @knitr L5_6_NP
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Tshz2","Slc17a8", "Stard5", "Dkk2"))
+)
+
+# ..........................................................................................................
+## @knitr L6_IT
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Cryab","Plekha2", "1110032F04Rik"))
+)
+
+# ..........................................................................................................
+## @knitr L6_CT
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Foxp2","Jam3", "Sennc2"))
+)
+
+# ..........................................................................................................
+## @knitr L6b
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Cplx3","Ctgf", "Fbx17", "Clic5"))
+)
+
+# ..........................................................................................................
+## @knitr GABAN
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Gad1","Gad2", "Dlx1", "Dlx2"))
+)
+
+# ..........................................................................................................
+## @knitr Meis2
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Meis2","Pbx3"))
+)
+
+# ..........................................................................................................
+## @knitr Lamp5_A7C_CNC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Lamp5","Innt1", "Cox6a2", "Igfb2"))
+)
+
+# ..........................................................................................................
+## @knitr Lamp5_A7C_NGC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Lamp5","Innt1", "Egln3"))
+)
+
+# ..........................................................................................................
+## @knitr Sncg_CCK_BC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Sncg","Npas3", "Frem1"))
+)
+
+# ..........................................................................................................
+## @knitr Vip_BP_BTC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Vip","Stat3"))
+)
+
+# ..........................................................................................................
+## @knitr Sst_T_MC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Sst","Nyap1", "Nudt11", "Igfb2"))
+)
+
+# ..........................................................................................................
+## @knitr Sst_fan_MC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Ptpn9","Exoc2", "Junb"))
+)
+
+# ..........................................................................................................
+## @knitr Sst_NMC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Ptpn9","Exoc2"))
+)
+
+# ..........................................................................................................
+## @knitr Sst_IVC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Hpse","Paqr8", "Hs3st1"))
+)
+
+# ..........................................................................................................
+## @knitr Sst_FS_like
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Atpla2","Prkg2"))
+)
+
+# ..........................................................................................................
+## @knitr Sst_Chold_LPC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Zfhx3","Isl1", "Six3"))
+)
+
+# ..........................................................................................................
+## @knitr Pvalb_CHC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Sp9","Rprm", "Gnb2l1", "Id3"))
+)
+
+# ..........................................................................................................
+## @knitr Pvalb_FS_BC
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Pvalb","Tafa2", "Tac1"))
+)
+
+# ..........................................................................................................
+## @knitr astro_markers
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Apoe","Aldh1l1","Slc1a3"))
+)
+
+# ..........................................................................................................
+## @knitr oligo_markers
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Pdgfra","Olig1","Olig2"))
+)
+
+# ..........................................................................................................
+## @knitr microglia_markers
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Aif1","Tmem119"))
+)
+
+# ..........................................................................................................
+## @knitr Endothelial_cells_markers
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Cldn5"))
+)
+
+# ..........................................................................................................
+## @knitr Pericyte_markers
+# ..........................................................................................................
+print(
+  FeaturePlot(seurat_obj, features = c("Pdgfrb"))
+)
\ No newline at end of file
diff --git a/03_Script/arlotta.R b/03_Script/arlotta.R
index eed037e..f555b93 100755
--- a/03_Script/arlotta.R
+++ b/03_Script/arlotta.R
@@ -177,16 +177,16 @@ print(dim(reference_matrix))
 print("Here the first 10 rows and 10 columns :")
 print(reference_matrix[1:10, 1:10])
 
-#-------------------------------------
-# Load our matrix to annotate
-#-------------------------------------
-matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv"))) # The normalized matrix is used for this reference.
-matrix <- as.matrix(matrix)
-rownames(matrix) <- matrix[,"V1"]
-matrix <- matrix[, -1]
+# #-------------------------------------
+# # Load our matrix to annotate
+# #-------------------------------------
+# matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv"))) # The normalized matrix is used for this reference.
+# matrix <- as.matrix(matrix)
+# rownames(matrix) <- matrix[,"V1"]
+# matrix <- matrix[, -1]
 
-print("Our matrix to annotate is loaded, here the first rows : ")
-print(matrix[1:10, 1:10])
+# print("Our matrix to annotate is loaded, here the first rows : ")
+# print(matrix[1:10, 1:10])
 
 #-------------------------------------
 # Wtrite the two new matrix +
@@ -198,10 +198,10 @@ write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(O
 print("The reference matrix is written")
 
 
-dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
-class(matrix) <- "numeric"
-write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
-print("The matrix is written")
+# dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
+# class(matrix) <- "numeric"
+# write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
+# print("The matrix is written")
 
 fwrite(x = metadata, file = file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_METADATA,".csv")), row.names = TRUE)
 print("The metadaa is written")
diff --git a/03_Script/seurat.Rmd b/03_Script/seurat.Rmd
index aaa4f41..5989e15 100644
--- a/03_Script/seurat.Rmd
+++ b/03_Script/seurat.Rmd
@@ -49,11 +49,12 @@ read_chunk( path=file.path( SCRIPTDIR, "02_variables_genes.R"));
 read_chunk( path=file.path( SCRIPTDIR, "03_cell_heterogeneity_PCA.R"));
 read_chunk( path=file.path( SCRIPTDIR, "04_cell_heterogeneity_tSNE_UMAP.R"));
 read_chunk( path=file.path( SCRIPTDIR, "05_cell_heterogeneity_clustering.R"));
+read_chunk( path=file.path( SCRIPTDIR, "06_markers.R"));
 ```
 
 This report present the analyse of dataset generated with 10x Genomics Single Cell Protocol.
 
-# Primary analysis : QC and selecting cells for further analysis of the sample `r snakemake@params[["sample_id"]]`
+# Primary analysis : QC and selecting cells for further analysis of the sample `r SAMPLE_ID`
 
 ```{r loadData, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
 <<loadData>>
@@ -118,6 +119,23 @@ Top `r VARIABLE_FEATURES_SHOWTOP` most variable annotations:
 
 <br /><br />
 
+```{r, echo = FALSE}
+if (as.logical(MULTIPLEX) == TRUE) {
+  include_chunk <- TRUE
+  include_title <- TRUE
+} else {
+  include_chunk <- FALSE
+  include_title <- FALSE
+}
+```
+
+`r if(include_title) "## Demultiplexing"`
+```{r HTODemux, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+if (include_chunk) {<<HTODemux>>}
+```
+
+<br /><br />
+
 ## PCA results
 
 Visualizing both cells and features that define the PCA
@@ -228,11 +246,173 @@ rm("useReduction");
 useReduction='tsne'
 <<heterogeneity_dimReduc_with_clusters_tSNE>>
 rm("useReduction");
+```
 
-###
+### {-}
 
 <br /><br />
 
 ```{r save_RDS, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE, warning = FALSE}
 <<save_RDS>>
 ```
+
+
+##  GABAergic markers {.tabset .tabset-fade}
+Here you can vizualize GABAergic markers expression.
+
+### GABAN
+```{r GABAN, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<GABAN>>
+```
+
+### Meis2
+```{r Meis2, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Meis2>>
+```
+
+### Lamp A7C|CNC
+```{r Lamp5_A7C_CNC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Lamp5_A7C_CNC>>
+```
+
+### Lamp A7C|NGC
+```{r Lamp5_A7C_NGC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Lamp5_A7C_NGC>>
+```
+
+### Sngc CCK|BC
+```{r Sncg_CCK_BC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Sncg_CCK_BC>>
+```
+
+### Vip BP|BTC
+```{r Vip_BP_BTC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Vip_BP_BTC>>
+```
+
+### Sst T-MC
+```{r Sst_T_MC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Sst_T_MC>>
+```
+
+### Sst fan-MC
+```{r Sst_fan_MC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Sst_fan_MC>>
+```
+
+### Sst NMC
+```{r Sst_NMC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Sst_NMC>>
+```
+
+### Sst IVC
+```{r Sst_IVC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Sst_IVC>>
+```
+
+### Sst FS-like
+```{r Sst_FS_like, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Sst_FS_like>>
+```
+
+### Sst Chold|LPC
+```{r Sst_Chold_LPC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Sst_Chold_LPC>>
+```
+
+### Pvalb CHC
+```{r Pvalb_CHC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Pvalb_CHC>>
+```
+
+### Pvalb FS|BC
+```{r Pvalb_FS_BC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Pvalb_FS_BC>>
+```
+
+## {-}
+
+##  Glutamatergic markers {.tabset .tabset-fade}
+Here you can vizualize Glutamatergic markers expression.
+
+### Cajal retzius cells
+```{r Cajal_Retzius_cells_markers, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Cajal_Retzius_cells_markers>>
+```
+
+### L2/3 IT
+```{r L2_3_IT, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<L2_3_IT>>
+```
+
+### L4 IT SSC
+```{r L4_IT_SSC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<L4_IT_SSC>>
+```
+
+### L4 IT PC
+```{r L4_IT_PC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<L4_IT_PC>>
+```
+
+### L5 IT
+```{r L5_IT, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<L5_IT>>
+```
+
+### L5 PT
+```{r L5_PT, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<L5_PT>>
+```
+
+### L5/6 NP
+```{r L5_6_NP, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<L5_6_NP>>
+```
+
+### L6 IT
+```{r L6_IT, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<L6_IT>>
+```
+
+### L6 CT
+```{r L6_CT, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<L6_CT>>
+```
+
+### L6b
+```{r L6b, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<L6b>>
+```
+
+## {-}
+
+##  Non-Neuronal markers {.tabset .tabset-fade}
+Here you can vizualize GABAergic markers expression.
+
+### Astrocytes
+```{r astro_markers, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<astro_markers>>
+```
+
+### Oligodendrocytes
+```{r oligo_markers, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<oligo_markers>>
+```
+
+### Microglia
+```{r microglia_markers, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<microglia_markers>>
+```
+
+### Endothelial
+```{r Endothelial_cells_markers, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Endothelial_cells_markers>>
+```
+
+### Pericyte
+```{r Pericyte_markers, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<Pericyte_markers>>
+```
+
+## {-}
diff --git a/03_Script/seurat_reports_compilation.R b/03_Script/seurat_reports_compilation.R
index 0ab3548..e928ea5 100644
--- a/03_Script/seurat_reports_compilation.R
+++ b/03_Script/seurat_reports_compilation.R
@@ -16,6 +16,8 @@ SEURAT_OUTPUT <- snakemake@output[["seurat_output"]]
 SC_DATA_PATH_LIST <- snakemake@params[["sc_data"]]
 REPORT_LIST <- snakemake@output[["seurat_report"]]
 SAMPLE_ID_LIST <- snakemake@params[["sample_id"]]
+MULTIPLEX_LIST <- snakemake@params[["multiplex"]]
+HTO_LIST <- snakemake@params[["hto"]]
 PLOT_RASTER_NBCELLS_THRESHOLD <- snakemake@params[["plot_raster_nbcells_threshold"]]
 CELL_RANGER_COUNT_PATH <- snakemake@params[["cell_ranger_count_path"]]
 RUN_DEMULTIPLEX <- snakemake@params[["run_demultiplex"]]
@@ -74,6 +76,8 @@ while(i <= len)
     FILTER_PERCENT_MT <- FILTER_PERCENT_MT_LIST[i]
     FILTER_PERCENT_MT_MIN <- FILTER_PERCENT_MT_MIN_LIST[i]
     FILTER_PERCENT_RB <- FILTER_PERCENT_RB_LIST[i]
+    MULTIPLEX <- MULTIPLEX_LIST[i]
+    HTO <- HTO_LIST[i]
 
     rmarkdown::render( input = file.path(SCRIPTDIR, "seurat.Rmd"),
                     output_dir = file.path(OUTPUTDIR, STEP, SAMPLE_ID),
diff --git a/03_Script/to_anndata_file.py b/03_Script/to_anndata_file.py
index 1c40063..7ba168b 100644
--- a/03_Script/to_anndata_file.py
+++ b/03_Script/to_anndata_file.py
@@ -26,9 +26,9 @@
 
 STEP3 = "03_sims/"
 
-# #-------------------------------------
-# # Load reference matrix
-# #-------------------------------------
+#-------------------------------------
+# Load reference matrix
+#-------------------------------------
 
 # reference_matrix = an.read_csv(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, OUTPUT_NAME_REF_MATRIX + ".csv"))
 
diff --git a/04_Workflow/seurat.smk b/04_Workflow/seurat.smk
index 0c6ac8d..883a3ce 100644
--- a/04_Workflow/seurat.smk
+++ b/04_Workflow/seurat.smk
@@ -18,6 +18,8 @@ rule seurat:
         sc_data = expand(OUTPUTDIR + "01_cellranger/{sample_id}/count/sample_filtered_feature_bc_matrix/",sample_id = SAMPLE_ID),
         cell_ranger_count_path = config["seurat"]["cell_ranger_count_path"],
         sample_id = expand("{sample_id.id}", sample_id = sample_id.itertuples()),
+        multiplex = expand("{multiplex.multiplex}", multiplex = multiplex.itertuples()),
+        hto = expand("{hto.hto}", hto = hto.itertuples()),
         plot_raster_nbcells_threshold = config["seurat"]["plot_raster_nbcells_threshold"],
         qc_exploration_mode = config["seurat"]["qc_exploration_mode"],
         min_cells = config["seurat"]["min_cells"],
diff --git a/04_Workflow/umapCellAssignation.smk b/04_Workflow/umapCellAssignation.smk
index e233e31..5f7d02f 100644
--- a/04_Workflow/umapCellAssignation.smk
+++ b/04_Workflow/umapCellAssignation.smk
@@ -4,9 +4,9 @@ rule umapCellAssignation:
 
     output:
         umapAssignation_output = expand(OUTPUTDIR + "03_sims/umapAssignation_output.txt"),
-        umap_sims_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims.pdf", sample_id = SAMPLE_ID), caption = REPORT + "umap_sims.rst", category = "03 sims"),
-        umap_sims_threshold_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims_threshold.pdf", sample_id = SAMPLE_ID), caption = REPORT + "umap_sims.rst", category = "03 sims"),
-        umap_per_labels_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_per_labels.pdf", sample_id = SAMPLE_ID), caption = REPORT + "umap_sims.rst", category = "03 sims"),
+        # umap_sims_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims.pdf", sample_id = SAMPLE_ID), caption = REPORT + "umap_sims.rst", category = "03 sims"),
+        # umap_sims_threshold_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims_threshold.pdf", sample_id = SAMPLE_ID), caption = REPORT + "umap_sims.rst", category = "03 sims"),
+        # umap_per_labels_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_per_labels.pdf", sample_id = SAMPLE_ID), caption = REPORT + "umap_sims.rst", category = "03 sims"),
 
     params:
         sample_id = config["reference_sims"]["sample_id"],
diff --git a/Snakefile b/Snakefile
index e082ef0..387861e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -33,6 +33,8 @@ if run_slurm:
 rawsample = pd.read_table(config["sample"]).set_index(["rawsample"], drop=False)
 sample_id = pd.read_table(config["sample"]).set_index(["id"], drop=False) 
 sample_name = pd.read_table(config["sample"]).set_index(["name"], drop=False) 
+multiplex = pd.read_table(config["sample"]).set_index(["multiplex"], drop=False)
+hto = pd.read_table(config["sample"]).set_index(["hto"], drop=False)
 
 RAWSAMPLE = expand("{rawsample.rawsample}", rawsample = rawsample.itertuples()) # 2_S1_L001, 3_S1_L001 ...
 SAMPLE_ID = expand("{sample_id.id}", sample_id = sample_id.itertuples()) # P5_KO,P5_WT,P30_KO,P30_WT
@@ -76,7 +78,7 @@ rule all:
 		# demuxlet = OUTPUTDIR + "01_cellranger/Mix_MM_lines/outs/demuxlet_Mix_MM_lines.best",
 		# tabdemuxlet = OUTPUTDIR + "01_cellranger/Mix_MM_lines/outs/demuxlet_Mix_MM_lines.tsv",
 		### Seurat
-		# seurat_report = expand(OUTPUTDIR + "02_seurat/{sample_id}/{sample_id}_seurat_report.html", sample_id = SAMPLE_ID),
+		seurat_report = expand(OUTPUTDIR + "02_seurat/{sample_id}/{sample_id}_seurat_report.html", sample_id = SAMPLE_ID),
 		### Remi data
 		# remi_output = expand(OUTPUTDIR + "remi_output.txt"),
 		### Prepare data for SIMS
@@ -91,7 +93,7 @@ rule all:
 		# sims_prediction_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_data_matrix_prediction.csv", sample_id = SAMPLE_ID),
 		# sims_prediction_unknown_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_data_matrix_prediction_filtered.csv", sample_id = SAMPLE_ID),
 		### Representing cellular assignation on UMAP
-		umapAssignation_output = expand(OUTPUTDIR + "03_sims/umapAssignation_output.txt"),
+		# umapAssignation_output = expand(OUTPUTDIR + "03_sims/umapAssignation_output.txt"),
 		# umap_sims_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims.pdf", sample_id = SAMPLE_ID),
 		# umap_sims_threshold_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims_threshold.pdf", sample_id = SAMPLE_ID),
 		# umap_per_labels_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_per_labels.pdf", sample_id = SAMPLE_ID),
@@ -106,20 +108,20 @@ run_multiplex = config["run_multiplex"]
 if run_demultiplex:
 	# include: ENVDIR + "clean_demultiplex.smk"
 	# include: ENVDIR + "cellranger_demultiplex.smk"
-	# include: ENVDIR + "seurat.smk"
+	include: ENVDIR + "seurat.smk"
 	# include: ENVDIR + "remi.smk"
 	# include: ENVDIR + "prepare_data_sims.smk"
 	# include: ENVDIR + "SIMS.smk"
 	# include: ENVDIR + "knnor.smk"
-	include: ENVDIR + "umapCellAssignation.smk"
+	# include: ENVDIR + "umapCellAssignation.smk"
 
 if run_multiplex:	
 	# include: ENVDIR + "clean.smk"
 	# include: ENVDIR + "cellranger.smk"
-	# include: ENVDIR + "seurat.smk"
+	include: ENVDIR + "seurat.smk"
 	# include: ENVDIR + "remi.smk"
 	# include: ENVDIR + "prepare_data_sims.smk"
 	# include: ENVDIR + "SIMS.smk"
 	# include: ENVDIR + "knnor.smk"
-	include: ENVDIR + "umapCellAssignation.smk"
+	# include: ENVDIR + "umapCellAssignation.smk"
 
diff --git a/config.yaml b/config.yaml
index 5f00fd9..a7fe9a6 100644
--- a/config.yaml
+++ b/config.yaml
@@ -11,15 +11,15 @@ sample: sample.tsv
 # If you run the workflow with slurm (sbatch run_slurm.sh)
 run_slurm: true
 # If you data are demultiplexed = true
-run_demultiplex: true
+run_demultiplex: false
 # If your data are not demultiplexed = true
-run_multiplex: false
+run_multiplex: true
 
 rules:
     # Optimizing and assembling genome annotations for 3’ single-cell RNA-sequencing analysis
     reference_enhancer_rule: "FALSE"
     # Build the reference with cellranger
-    ref_cellranger_rule: "FALSE"
+    ref_cellranger_rule: "TRUE"
     # Run SIMS to anotate cells
     sims_rule: "TRUE"
     # Run Knnor to help with imbalanced datasets
@@ -110,18 +110,18 @@ seurat:
     min_features: 200
     # Cells with number of UMIs outside the range will be excluded
     # Can be a list of UMIs but MUST be on the same order than sample.tsv id.
-    filter_umi_min: "0"
-    filter_umi_max: "40000"
+    filter_umi_min: "0,0"
+    filter_umi_max: "8000,10000"
     # Filter cells that have unique feature counts over/less n
     # Can be a list of feature counts but MUST be on the same order than sample.tsv id.
-    filter_feature_min: "0"
-    filter_feature_max: "7500"
+    filter_feature_min: "0,0"
+    filter_feature_max: "3400,3400"
     # Filter cells that have > n% mitochondrial counts
-    filter_percent_mt: "17"
+    filter_percent_mt: "2,11"
     # Filter cells that have < n% mitochondrial counts
-    filter_percent_mt_min: "0"
+    filter_percent_mt_min: "0,0"
     # Filter cells that have < n% ribosomal counts
-    filter_percent_rb: "0"
+    filter_percent_rb: "0,0"
     # Pattern to match features against => mitochondrial genes | for mouse : "^mt-" / for human : "^MT-"
     pattern_mt: "^mt-"
     # Pattern to match features against => ribosomal genes
@@ -144,7 +144,7 @@ seurat:
     # Number of dimention to show for the vizualisation
     dims: 30
     # Default number of dimensions to use for PCA (see Seurat::RunPCA())
-    pca_npc: 22
+    pca_npc: 25
     # Number of dimensions to show in PCA-related plots
     pca_plots_nbdims : 3
     #  Number of'top' features to show when plotting PCA loadings
@@ -152,11 +152,11 @@ seurat:
 
     ##### Dimensionality reduction parameters (TSNE/UMAP)
     # Number of dimensions to use from PCA results
-    dimreduc_use_pca_nbdims : 22
+    dimreduc_use_pca_nbdims : 25
 
     ##### Cluster identification parameters
     # Number of dimensions to use from PCA results
-    findclusters_use_pca_nbdims : 22
+    findclusters_use_pca_nbdims : 30
     # Nearest-neighbor graph construction
     findneighbors_k : 30
     # Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities.
@@ -172,7 +172,7 @@ reference_sims:
     # For now : arlotta.R, allen_mouse_whole_cortex.R
     r_script: "allen_mouse_whole_cortex.R"
     # Output folder name for the reference
-    reference_name: "Allen"
+    reference_name: "02_Allen_for_P30_WT_norm"
     # Output file name for the reference metadata
     output_name_ref_metadata: "metadata"
     # Output file name for the reference matrix
@@ -181,7 +181,7 @@ reference_sims:
     output_name_matrix: "data_matrix"
     # Name of the column that contain the cells
     # allen mouse whole cortex : "sample_name", arlotta : "NEW_NAME"
-    cells_column: "sample_name"
+    cells_column: "NEW_NAME"
 
     ##### Our Matrix to annotate #####
     # The name of the folder of the sample you want to annotate (One of the output in the seurat folder)
@@ -193,7 +193,7 @@ reference_sims:
     # Pattern that correspond to the mouse age that we want to keep for the training of our data
     # It's possible to select multiple ones if separate by a coma, exemple: ["E10", "E11", "P4"]
     # In this reference : E10, E11, E12, E13, E14, E15, E16, E17, E18_S1, E18_S3, P1, P1_S1, P4
-    developmental_time: ["E10", "E11", "E12", "E13", "E14", "E15", "E16", "E17", "E18_S1", "E18_S3", "P1", "P1_S1", "P4"]
+    developmental_time: ["P4"]
     
     ##### Allen_mouse_whole_cortex #####
     # Path for the arlotta metadata
@@ -218,26 +218,26 @@ sims:
     # Your personnal key to connect to your wandb account
     key: "5f86a9267b08ff77c21a8872d763cb76048dfaca"
     # Number of cores (Put the same in the run_slurm.sh if necessary)
-    num_workers: 18
+    num_workers: 12
     # Maximum number of epoch to train our model (Will stop after, start counting from 0)
     max_epoch: 500
 
     ##### Logger
     # Name of the project (Will be the name of the run and checkpoint)
-    project_name: "Allen_norm_slurm"
+    project_name: "Allen_for_P30_WT_norm"
 
     ##### Early Stopping
     # The validation metric you wish to monitor for early stop.
     monitor: "val_loss"
     # The number of epochs during which the validation metric (monitor) may not improve before early shutdown is triggered. (To prevent overtraining)
-    patience: 10
+    patience: 20
 
     # Epoch file from wandb to chosse for the cell assignation
-    epoch: "epoch=47-step=86784.ckpt"
+    epoch: "epoch=40-step=74128.ckpt"
 
     ##### Cells annotation
     # Threshold of the difference between the first and second score of the prediction to say if a cell is unknown or not
-    threshold: 0.1
+    threshold: 0.0
     # Seurat object with sims cell_assignation
     rds_assignation: filtered_assigned_seurat_object.rds
     # Matrix with cell predtion after filtration with the threshold
diff --git a/sample.tsv b/sample.tsv
index 8064522..d68b8da 100644
--- a/sample.tsv
+++ b/sample.tsv
@@ -1,5 +1,3 @@
-name	id	rawsample	sample	lane	fastq1	fastq2	barcode
-2	72h	S004647_to_S004650	S004647	L001	/00_RawData/2_S1_L001_R1_001.fastq.gz	/00_RawData/2_S1_L001_R2_001.fastq.gz	/00_RawData/2_S1_L001_I1_001.fastq.gz
-3	80h	S004647_to_S004650	S004648	L001	/00_RawData/3_S1_L001_R1_001.fastq.gz	/00_RawData/3_S1_L001_R2_001.fastq.gz	/00_RawData/3_S1_L001_I1_001.fastq.gz
-6	86h	S004647_to_S004650	S004649	L001	/00_RawData/6_S1_L001_R1_001.fastq.gz	/00_RawData/6_S1_L001_R2_001.fastq.gz	/00_RawData/6_S1_L001_I1_001.fastq.gz
-7	96h	S004647_to_S004650	S004650	L001	/00_RawData/7_S1_L001_R1_001.fastq.gz	/00_RawData/7_S1_L001_R2_001.fastq.gz	/00_RawData/7_S1_L001_I1_001.fastq.gz
\ No newline at end of file
+name	id	rawsample	sample	lane	multiplex	hto
+	P30				TRUE	HTO-5,HTO-10
+	E14				TRUE	HTO-1,HTO-3
\ No newline at end of file

From 637db47c4047faa807cc518b3d38d1db7004a016 Mon Sep 17 00:00:00 2001
From: LeaChabot <lea-chabot@orange.fr>
Date: Tue, 14 May 2024 10:59:15 +0200
Subject: [PATCH 13/15] config

---
 03_Script/arlotta.R | 34 +++++++++++++++++-----------------
 config.yaml         | 24 ++++++++++++------------
 multi_config.csv    | 12 +++++-------
 sample.tsv          |  3 +--
 4 files changed, 35 insertions(+), 38 deletions(-)

diff --git a/03_Script/arlotta.R b/03_Script/arlotta.R
index f555b93..632c8d4 100755
--- a/03_Script/arlotta.R
+++ b/03_Script/arlotta.R
@@ -119,9 +119,9 @@ print("The diferent matrix for diferent developmental time have been loaded in a
 # (cell in rows) + Transform them
 # in matrix
 #-------------------------------------
-list_reference_matrix <- lapply(list_reference_matrix, function(x){
-    normalized_matrix <- NormalizeData( object = x, normalization.method = NORM_METHOD, scale.factor = NORM_SCALE_FACTOR, verbose = FALSE)
-})
+# list_reference_matrix <- lapply(list_reference_matrix, function(x){
+#     normalized_matrix <- NormalizeData( object = x, normalization.method = NORM_METHOD, scale.factor = NORM_SCALE_FACTOR, verbose = FALSE)
+# })
 
 list_reference_matrix <-  lapply(list_reference_matrix, function(x) as.matrix(x))
 list_reference_matrix <-  lapply(list_reference_matrix, function(x) t(x))
@@ -143,7 +143,7 @@ list_reference_matrix <- lapply(list_reference_matrix, clean_rownames)
 
 #-------------------------------------
 # Keep cells that match both the 
-# metadata an matrix of reference
+# metadata and matrix of reference
 #-------------------------------------
 metadata <- data.table()
 
@@ -177,16 +177,16 @@ print(dim(reference_matrix))
 print("Here the first 10 rows and 10 columns :")
 print(reference_matrix[1:10, 1:10])
 
-# #-------------------------------------
-# # Load our matrix to annotate
-# #-------------------------------------
-# matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv"))) # The normalized matrix is used for this reference.
-# matrix <- as.matrix(matrix)
-# rownames(matrix) <- matrix[,"V1"]
-# matrix <- matrix[, -1]
+#-------------------------------------
+# Load our matrix to annotate
+#-------------------------------------
+matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_count_matrix.csv"))) # The normalized matrix is used for this reference.
+matrix <- as.matrix(matrix)
+rownames(matrix) <- matrix[,"V1"]
+matrix <- matrix[, -1]
 
-# print("Our matrix to annotate is loaded, here the first rows : ")
-# print(matrix[1:10, 1:10])
+print("Our matrix to annotate is loaded, here the first rows : ")
+print(matrix[1:10, 1:10])
 
 #-------------------------------------
 # Wtrite the two new matrix +
@@ -198,10 +198,10 @@ write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(O
 print("The reference matrix is written")
 
 
-# dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
-# class(matrix) <- "numeric"
-# write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
-# print("The matrix is written")
+dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
+class(matrix) <- "numeric"
+write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
+print("The matrix is written")
 
 fwrite(x = metadata, file = file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_METADATA,".csv")), row.names = TRUE)
 print("The metadaa is written")
diff --git a/config.yaml b/config.yaml
index a7fe9a6..d565fea 100644
--- a/config.yaml
+++ b/config.yaml
@@ -110,18 +110,18 @@ seurat:
     min_features: 200
     # Cells with number of UMIs outside the range will be excluded
     # Can be a list of UMIs but MUST be on the same order than sample.tsv id.
-    filter_umi_min: "0,0"
-    filter_umi_max: "8000,10000"
+    filter_umi_min: "0"
+    filter_umi_max: "7500"
     # Filter cells that have unique feature counts over/less n
     # Can be a list of feature counts but MUST be on the same order than sample.tsv id.
-    filter_feature_min: "0,0"
-    filter_feature_max: "3400,3400"
+    filter_feature_min: "0"
+    filter_feature_max: "4000"
     # Filter cells that have > n% mitochondrial counts
-    filter_percent_mt: "2,11"
+    filter_percent_mt: "10"
     # Filter cells that have < n% mitochondrial counts
-    filter_percent_mt_min: "0,0"
+    filter_percent_mt_min: "0"
     # Filter cells that have < n% ribosomal counts
-    filter_percent_rb: "0,0"
+    filter_percent_rb: "0"
     # Pattern to match features against => mitochondrial genes | for mouse : "^mt-" / for human : "^MT-"
     pattern_mt: "^mt-"
     # Pattern to match features against => ribosomal genes
@@ -170,9 +170,9 @@ reference_sims:
     ##### General #####
     #  Name of the script you want to execute (Corespond to the reference you want to use)
     # For now : arlotta.R, allen_mouse_whole_cortex.R
-    r_script: "allen_mouse_whole_cortex.R"
+    r_script: "arlotta.R"
     # Output folder name for the reference
-    reference_name: "02_Allen_for_P30_WT_norm"
+    reference_name: "03_Arlotta_not_norm"
     # Output file name for the reference metadata
     output_name_ref_metadata: "metadata"
     # Output file name for the reference matrix
@@ -185,7 +185,7 @@ reference_sims:
 
     ##### Our Matrix to annotate #####
     # The name of the folder of the sample you want to annotate (One of the output in the seurat folder)
-    sample_id: "P30_WT"
+    sample_id: "P5_WT"
 
     ##### Paola Arlotta #####
     # Path for the arlotta metadata
@@ -193,7 +193,7 @@ reference_sims:
     # Pattern that correspond to the mouse age that we want to keep for the training of our data
     # It's possible to select multiple ones if separate by a coma, exemple: ["E10", "E11", "P4"]
     # In this reference : E10, E11, E12, E13, E14, E15, E16, E17, E18_S1, E18_S3, P1, P1_S1, P4
-    developmental_time: ["P4"]
+    developmental_time: ["E10", "E11", "E12", "E13", "E14", "E15", "E16", "E17", "E18_S1", "E18_S3", "P1", "P1_S1", "P4"]
     
     ##### Allen_mouse_whole_cortex #####
     # Path for the arlotta metadata
@@ -224,7 +224,7 @@ sims:
 
     ##### Logger
     # Name of the project (Will be the name of the run and checkpoint)
-    project_name: "Allen_for_P30_WT_norm"
+    project_name: "03_Arlotta_not_norm"
 
     ##### Early Stopping
     # The validation metric you wish to monitor for early stop.
diff --git a/multi_config.csv b/multi_config.csv
index 593db0a..a9ba8ac 100644
--- a/multi_config.csv
+++ b/multi_config.csv
@@ -1,10 +1,10 @@
 [gene-expression]
-reference, /scratch/lchabot/scRNAseq_analysis_workflow/01_Reference/
+reference,/scratch/lchabot/scRNAseq_analysis_workflow/01_Reference/refdata-cellranger-mm39/
 expect-cells,30000
 include-introns,true
 
 [feature]
-reference, /scratch/lchabot/scRNAseq_analysis_workflow/feature_config.csv
+reference,/scratch/lchabot/scRNAseq_analysis_workflow/feature_config.csv
 
 [libraries]
 fastq_id,fastqs,feature_types
@@ -12,8 +12,6 @@ NGc-3-CTX,/scratch/lchabot/scRNAseq_analysis_workflow/00_RawData/NGc_3_CTX,Gene
 NGh-HTO-CTX,/scratch/lchabot/scRNAseq_analysis_workflow/00_RawData/NGh_HTO_CTX,Antibody Capture
 
 [samples]
-sample_id
-72h,CMO301,72h
-80h,CMO302,80h
-86h,CMO303,86h
-96h,CMO304,96h
\ No newline at end of file
+sample_id,cmo_ids
+E14_WT,HTO1
+E14_KO,HTO3
\ No newline at end of file
diff --git a/sample.tsv b/sample.tsv
index d68b8da..8bf6467 100644
--- a/sample.tsv
+++ b/sample.tsv
@@ -1,3 +1,2 @@
 name	id	rawsample	sample	lane	multiplex	hto
-	P30				TRUE	HTO-5,HTO-10
-	E14				TRUE	HTO-1,HTO-3
\ No newline at end of file
+	P8_P16_WT				TRUE	HTO-11,HTO-15
\ No newline at end of file

From 24e12d720d9977d841506e69678a3bc14ecf565e Mon Sep 17 00:00:00 2001
From: "Lea CHABOT lea.chabot@univ-amu.fr" <lchabot@login01.cluster>
Date: Mon, 22 Jul 2024 11:45:40 +0200
Subject: [PATCH 14/15] label transfert + DEG

---
 02_Container/DE_analysis.yaml         |  11 +
 03_Script/03_cell_heterogeneity_PCA.R |  30 +-
 03_Script/DEG_analysis.R              | 140 +++++++
 03_Script/labelTransfert_seurat.R     | 544 ++++++++++++++++++++++++++
 03_Script/remi.R                      | 284 +++++++++++---
 03_Script/seurat.Rmd                  |   8 +-
 04_Workflow/remi.smk                  |   2 +-
 04_Workflow/seurat.smk                |  48 ++-
 Snakefile                             |   6 +-
 config.yaml                           |  34 +-
 10 files changed, 1011 insertions(+), 96 deletions(-)
 create mode 100644 02_Container/DE_analysis.yaml
 create mode 100644 03_Script/DEG_analysis.R
 create mode 100644 03_Script/labelTransfert_seurat.R

diff --git a/02_Container/DE_analysis.yaml b/02_Container/DE_analysis.yaml
new file mode 100644
index 0000000..135c46f
--- /dev/null
+++ b/02_Container/DE_analysis.yaml
@@ -0,0 +1,11 @@
+channels :
+    - r
+    - bioconda
+    - conda-forge
+    - defaults
+dependencies :
+    - r-ggplot2 =3.3.5
+    - r-data.table =1.14.2
+    - r-seurat =4.0.3
+    - r-dplyr =1.0.9
+    - r-ggrepel
diff --git a/03_Script/03_cell_heterogeneity_PCA.R b/03_Script/03_cell_heterogeneity_PCA.R
index 1dcd73c..f37de19 100644
--- a/03_Script/03_cell_heterogeneity_PCA.R
+++ b/03_Script/03_cell_heterogeneity_PCA.R
@@ -3,6 +3,10 @@
 #  of cells using dimension reduction techniques
 # ########################################################
 
+# ..........................................................................................................
+## @knitr scaleData
+# ..........................................................................................................
+
 # Scales and centers features in the dataset
 seurat_obj = ScaleData( object = seurat_obj,
                    verbose = .VERBOSE)
@@ -11,18 +15,20 @@ seurat_obj = ScaleData( object = seurat_obj,
 ## @knitr HTODemux
 # ..........................................................................................................
 
-# Demultiplexing data 
-seurat_obj <- HTODemux(seurat_obj, assay = "HTO", positive.quantile = 0.99)
-cat("<br><br>Removed cells after filtering Doublet et Negative:", sum(seurat_obj[["hash.ID"]] == "Negative" | seurat_obj[["hash.ID"]] == "Doublet"));
-cat("<br><br>Remaining cells after filtering:", sum(seurat_obj$hash.ID %in% HTO));
-cat("\n<br>\n");
-seurat_obj <- subset(seurat_obj, idents = c("Doublet","Negative"), invert = TRUE) ### Keep only the Singlet cells (Remove Negative and Doublet)
-
-seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = "mean.var.plot")
-seurat_obj <- ScaleData(seurat_obj, features = VariableFeatures(seurat_obj))
-seurat_obj <- RunPCA(seurat_obj)
-seurat_obj <- RunUMAP(seurat_obj, dims = 1:25)
-print(DimPlot(seurat_obj))
+if(as.logical(MULTIPLEX) == TRUE){
+  # Demultiplexing data 
+  seurat_obj <- HTODemux(seurat_obj, assay = "HTO", positive.quantile = 0.99)
+  cat("<br><br>Removed cells after filtering Doublet et Negative:", sum(seurat_obj[["hash.ID"]] == "Negative" | seurat_obj[["hash.ID"]] == "Doublet"));
+  cat("<br><br>Remaining cells after filtering:", sum(seurat_obj$hash.ID %in% HTO));
+  cat("\n<br>\n");
+  seurat_obj <- subset(seurat_obj, idents = c("Doublet","Negative"), invert = TRUE) ### Keep only the Singlet cells (Remove Negative and Doublet)
+
+  seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = "mean.var.plot")
+  seurat_obj <- ScaleData(seurat_obj, features = VariableFeatures(seurat_obj))
+  seurat_obj <- RunPCA(seurat_obj)
+  seurat_obj <- RunUMAP(seurat_obj, dims = 1:25)
+  print(DimPlot(seurat_obj))
+}
 
 # ..........................................................................................................
 ## @knitr ldr
diff --git a/03_Script/DEG_analysis.R b/03_Script/DEG_analysis.R
new file mode 100644
index 0000000..b573c70
--- /dev/null
+++ b/03_Script/DEG_analysis.R
@@ -0,0 +1,140 @@
+#========================================================================
+# DEG analysis
+#========================================================================
+
+#=====================================
+# Library
+#=====================================
+library(Seurat)
+library(ggplot2)
+library(dplyr)
+# library(ggrepel)
+library(data.table)
+
+#=====================================
+# Path to file and folder
+#=====================================
+DIRECTORY = getwd()
+OUTPUTDIR = file.path((DIRECTORY), "05_Output")
+
+TIME_POINT = snakemake@params[["time_point"]]
+
+STEP2 = "02_seurat/"
+STEP4 = "04_diffexp/"
+
+dir.create(file.path(OUTPUTDIR, STEP4))
+
+#========================================================================
+# Load the data
+#========================================================================
+WT <- readRDS(file = file.path(OUTPUTDIR, STEP2, paste0(TIME_POINT,"_WT/" ,TIME_POINT, "_WT_seurat_labelTransfert.rds"))) 
+KO <- readRDS(file = file.path(OUTPUTDIR, STEP2, paste0(TIME_POINT,"_KO/" ,TIME_POINT, "_KO_seurat_labelTransfert.rds"))) 
+
+# Merge the two condition together
+merge_data <- merge(WT, y = KO, add.cell.ids = c("WT", "KO"))
+
+### DE analysis
+Idents(merge_data) <- "SCT"
+
+#=====================================
+# Per condition
+#=====================================
+merge_data <- PrepSCTFindMarkers(merge_data)
+de_results_per_condition <- FindMarkers(merge_data, ident.1 = "WT", ident.2 = "KO", group.by = "condition", logfc.threshold = -Inf)
+
+de_results_per_condition$diffexpressed <- "Not significant"
+de_results_per_condition$diffexpressed[de_results_per_condition$avg_log2FC > 0.6 & de_results_per_condition$p_val_adj < 0.05] <- "Upregulated"
+de_results_per_condition$diffexpressed[de_results_per_condition$avg_log2FC < -0.6 & de_results_per_condition$p_val_adj < 0.05] <- "Downregulated"
+table(de_results_per_condition$diffexpressed)
+
+de_results_per_condition$delabel <- NA
+de_results_per_condition$delabel[de_results_per_condition$diffexpressed != "Not significant"] <- rownames(de_results_per_condition)[de_results_per_condition$diffexpressed != "Not significant"]
+
+# When pvalue = 0 for plotting
+de_results_per_condition$plot_p_val_adj <- ifelse(de_results_per_condition$p_val_adj == 0, 1e-300, de_results_per_condition$p_val_adj)
+
+# plot adding up all layers we have seen so far
+plot <- ggplot(data=de_results_per_condition, aes(x=avg_log2FC, y=-log10(plot_p_val_adj), col=diffexpressed, label=delabel)) +
+  geom_point() + 
+  theme_minimal() +
+  geom_text_repel() +
+  scale_color_manual(values=c("blue", "black", "red")) +
+  geom_vline(xintercept=c(-0.6, 0.6), col="red") +
+  geom_hline(yintercept=-log10(0.05), col="red")
+
+# Save the file in one folder
+ggsave(path = file.path(OUTPUTDIR, STEP4, TIME_POINT), filename = paste0("/", TIME_POINT, "_volcanoplot_per_condition.pdf"), plot = plot, device = "pdf")
+fwrite(x = de_results_per_condition, file = file.path(OUTPUTDIR, STEP4, TIME_POINT, paste0("/", TIME_POINT, "_diffexp_per_condition.csv")), row.names = TRUE) 
+
+#=====================================
+# Per cluster
+#=====================================
+# Function to have minimum 3 cells per label
+filter_cell_types <- function(seurat_obj, min_cells = 3) {
+  celltype_counts <- table(seurat_obj$supertype_label)
+  valid_celltypes <- names(celltype_counts[celltype_counts >= min_cells])
+  seurat_obj_filtered <- subset(seurat_obj, subset = supertype_label %in% valid_celltypes)
+  return(seurat_obj_filtered)
+}
+
+WT <- filter_cell_types(WT)
+KO <- filter_cell_types(KO)
+
+### Keep only the cells type that are in both condition
+common_cell_types <- intersect(unique(KO$supertype_label), unique(WT$supertype_label))
+
+KO <- subset(KO, subset = supertype_label %in% common_cell_types)
+WT <- subset(WT, subset = supertype_label %in% common_cell_types)
+
+merge_data <- merge(WT, y = KO, add.cell.ids = c("WT", "KO"))
+
+de_list_per_cluster <- list()
+
+for (cell_type in common_cell_types) {
+  print(cell_type)
+  cell_type_subset <- subset(merge_data, subset = supertype_label == cell_type)
+  print(table(cell_type_subset$orig.ident))
+  cell_type_subset <- PrepSCTFindMarkers(cell_type_subset)
+  de_results <- FindMarkers(cell_type_subset, ident.1 = "WT", ident.2 = "KO", group.by = "condition", logfc.threshold = -Inf)
+  de_list_per_cluster[[cell_type]] <- de_results
+}
+
+# Assuming de_list_per_cluster is your list containing data frames for each cell type
+for (cell_type in names(de_list_per_cluster)) {
+  de_results_per_cluster <- de_list_per_cluster[[cell_type]]
+  
+  # Classify differentially expressed genes
+  de_results_per_cluster$diffexpressed <- "Not significant"
+  de_results_per_cluster$diffexpressed[de_results_per_cluster$avg_log2FC > 0.6 & de_results_per_cluster$p_val_adj < 0.05] <- "Upregulated"
+  de_results_per_cluster$diffexpressed[de_results_per_cluster$avg_log2FC < -0.6 & de_results_per_cluster$p_val_adj < 0.05] <-  "Downregulated"
+  
+  # Display the table of diffexpressed values
+  print(table(de_results_per_cluster$diffexpressed))
+  
+  # Label differentially expressed genes
+  de_results_per_cluster$delabel <- NA
+  de_results_per_cluster$delabel[de_results_per_cluster$diffexpressed != "Not significant"] <- rownames(de_results_per_cluster)[de_results_per_cluster$diffexpressed != "Not significant"]
+  
+  de_results_per_cluster$plot_p_val_adj <- ifelse(de_results_per_cluster$p_val_adj == 0, 1e-300, de_results_per_cluster$p_val_adj)
+  
+  # Store the modified data frame back in the list
+  de_list_per_cluster[[cell_type]] <- de_results_per_cluster
+  
+  plot <- ggplot(data=de_results_per_cluster, aes(x=avg_log2FC, y=-log10(plot_p_val_adj), col=diffexpressed, label=delabel)) +
+    geom_point() + 
+    theme_minimal() +
+    # geom_text_repel() +
+    scale_color_manual(values=c("blue", "black", "red")) +
+    geom_vline(xintercept=c(-0.6, 0.6), col="red") +
+    geom_hline(yintercept=-log10(0.05), col="red") +
+    labs(title=paste("Volcano Plot for", cell_type))
+  
+  # Save the plot
+  ggsave(path = file.path(OUTPUTDIR, STEP4, TIME_POINT), filename = paste0("/", TIME_POINT, "_volcanoplot_", cell_type, ".pdf"), plot = plot, device = "pdf")
+  fwrite(x = de_results_per_cluster, file = file.path(OUTPUTDIR, STEP4, TIME_POINT, paste0("/", TIME_POINT, "_diffexp_", cell_type, ".csv")), row.names = TRUE) 
+}
+
+TEXT_OUTPUT <- snakemake@output[["seurat_DEG_output"]]
+output_file<-file(TEXT_OUTPUT)
+writeLines(c("Rules seurat DE finished"), output_file)
+close(output_file)
\ No newline at end of file
diff --git a/03_Script/labelTransfert_seurat.R b/03_Script/labelTransfert_seurat.R
new file mode 100644
index 0000000..13c0285
--- /dev/null
+++ b/03_Script/labelTransfert_seurat.R
@@ -0,0 +1,544 @@
+#========================================================================
+# Label transfert seurat
+#========================================================================
+
+#=====================================
+# Library
+#=====================================
+library(Seurat)
+library(ggplot2)
+library(dplyr)
+library(data.table)
+
+#=====================================
+# Path to file and folder
+#=====================================
+DIRECTORY = getwd()
+OUTPUTDIR = file.path((DIRECTORY), "05_Output")
+REF = file.path((DIRECTORY), "01_Reference")
+
+SAMPLE_ID = snakemake@params[["sample_id"]]
+
+STEP2 = "02_seurat/"
+
+#========================================================================
+# Load the data
+#========================================================================
+
+### Reference dataset Allen 
+ref_data <- readRDS(file = file.path(REF, "/remi/refA21_all.rds"))
+premier_modele <- ref_data@assays[["SCT"]]@SCTModel.list[[1]]
+ref_data@assays[["SCT"]]@SCTModel.list <- list(premier_modele)
+
+# ### Reference dataset arlotta
+# ref_data <- readRDS(file = file.path(REF, "/modify_arlotta_seurat_object.rds"))
+# ref_data <- PercentageFeatureSet(ref_data, pattern = "^mt-", col.name = "percent.mt")
+# ref_data <- SCTransform(ref_data, vars.to.regress = "percent.mt", assay = "RNA", new.assay.name = "SCT", verbose = FALSE)
+
+### Querry dataset
+querry_data <- readRDS(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_filtered_seurat_object.rds")))### To change here
+querry_data <- SCTransform(querry_data, vars.to.regress = "percent.mt", assay = "RNA", new.assay.name = "SCT", verbose = FALSE)
+
+# Change the default assay to SCT
+DefaultAssay(ref_data) <- "SCT"
+DefaultAssay(querry_data) <- "SCT"
+
+# Keep genes that are in common between querry and ref datasets
+ref_genes <- as.data.frame(rownames(ref_data))
+querry_genes <- as.data.frame(rownames(querry_data))
+all_genes <- intersect(ref_genes$`rownames(ref_data)`, querry_genes$`rownames(querry_data)`)
+
+#========================================================================
+# Create functions
+#========================================================================
+
+# For findTransfertAnchors function
+calculate_k_score <- function(num_cells, proportion = 0.1, min_value = 1) {
+  k_score <- max(min_value, floor(num_cells * proportion))
+  return(k_score)
+}
+
+# For transferData function
+calculate_k_weight <- function(sample_size, proportion = 0.2, min_value = 1) {
+  k_weight <- round(sample_size * proportion)
+  return(max(min_value, k_weight))
+}
+
+#========================================================================
+# Step 1 : annotate with broad class
+#========================================================================
+
+# Create a new column in metadata
+querry_data$class_label <- NA
+
+K_SCORE <- 30 # Seurat option
+
+if(ncol(querry_data) < K_SCORE){
+
+  # Calculate a new k_score
+  k_score_reference <- calculate_k_score(num_cells = ncol(ref_data))
+  k_score_query <- calculate_k_score(num_cells = ncol(querry_data))
+  k_score <- min(k_score_reference, k_score_query)
+
+  anchors <- FindTransferAnchors(reference = ref_data,
+                               query = querry_data,
+                               k.score = k_score,
+                               normalization.method = "SCT",
+                               features = all_genes)
+} else{
+  anchors <- FindTransferAnchors(reference = ref_data,
+                               query = querry_data,
+                               normalization.method = "SCT",
+                               features = all_genes)
+}
+
+# Number of anchors
+num_anchors <- nrow(anchors@anchors)
+
+if(ncol(querry_data) < num_anchors){
+
+  # Calculate the k weight
+  k_weight <- calculate_k_weight(sample_size = num_anchors)
+
+  # Prediction with transfertData
+  prediction <- TransferData(anchorset = anchors,
+                           refdata = ref_data$class_label,
+                           k.weight = k_weight
+                           )
+} else{
+  prediction <- TransferData(anchorset = anchors,
+                           refdata = ref_data$class_label)
+}
+
+querry_data <- AddMetaData(querry_data, metadata = prediction)
+querry_data$class_label[Cells(querry_data)] <- querry_data$predicted.id
+
+# Plot UMAP
+
+pdf(file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_seurat_labelTransfert.pdf")))
+DimPlot(querry_data, group.by = "class_label", label.size = 6) + theme(legend.position = "bottom",legend.text=element_text(size=4))
+
+# Plot UMAP per labels
+labels <- unique(querry_data@meta.data$`class_label`)
+
+for (label in labels) {
+  plot <- DimPlot(querry_data, group.by = "class_label", cells.highlight = WhichCells(querry_data, expression = class_label == label)) + ggtitle(label)
+  print(plot)
+}
+
+### UMAP plot with the prediction
+FeaturePlot(querry_data, features = "prediction.score.max", reduction = "umap")
+
+
+
+#========================================================================
+# Step 2 : apply a threshold
+#========================================================================
+
+ggplot(querry_data@meta.data, aes(x = prediction.score.max)) + 
+  geom_histogram(binwidth = 0.05) + 
+  theme_minimal() + 
+  xlab("Prediction Score Max") + 
+  ylab("Frequency") + 
+  ggtitle("Distribution of Prediction Scores (Broad Class)")
+
+dev.off()
+
+# Calculate mean and standard deviation
+mean_score <- mean(querry_data@meta.data$prediction.score.max)
+print(mean_score)
+
+sd_score <- sd(querry_data@meta.data$prediction.score.max)
+print(sd_score)
+
+# Define threshold
+threshold <- mean_score - sd_score
+print(threshold)
+
+# Filter cells
+querry_data <- subset(querry_data, subset = prediction.score.max >= threshold)
+gc()
+
+pdf(file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_seurat_labelTransfert.pdf")))
+
+DimPlot(querry_data, group.by = "class_label", label.size = 6) + theme(legend.position = "bottom",legend.text=element_text(size=4))
+
+labels <- unique(querry_data@meta.data$`class_label`)
+
+for (label in labels) {
+  plot <- DimPlot(querry_data, group.by = "class_label", cells.highlight = WhichCells(querry_data, expression = class_label == label)) + ggtitle(label)
+  print(plot)
+}
+
+dev.off()
+
+#========================================================================
+# Step 3 : annotate with supertype
+#========================================================================
+
+querry_data$supertype_label <- NA
+
+for (class in unique(querry_data$class_label)) {
+  print("The supertype class :")
+  print(class)
+  
+  # Set idents=
+  Idents(querry_data) <- "class_label"
+  Idents(ref_data) <- "class_label"
+  
+  # Per class
+  subset_querry_data <- subset(querry_data, idents = class)
+  subset_ref_data <- subset(ref_data, idents = class)
+  
+  # Check subset sizes
+  print(paste("Number of cells in query subset:", ncol(subset_querry_data)))
+  print(paste("Number of cells in reference subset:", ncol(subset_ref_data)))
+
+  # Keep the same genes
+  ref_genes <- as.data.frame(rownames(subset_ref_data))
+  querry_genes <- as.data.frame(rownames(subset_querry_data))
+  all_genes <- intersect(ref_genes$`rownames(subset_ref_data)`, querry_genes$`rownames(subset_querry_data)`)
+  
+  if(ncol(subset_querry_data) < K_SCORE){
+
+  # Calculate a new k_score
+  k_score_reference <- calculate_k_score(num_cells = ncol(subset_ref_data))
+  k_score_query <- calculate_k_score(num_cells = ncol(subset_querry_data))
+  k_score <- min(k_score_reference, k_score_query)
+
+  anchors <- FindTransferAnchors(reference = subset_ref_data,
+                               query = subset_querry_data,
+                               k.score = k_score,
+                               normalization.method = "SCT",
+                               features = all_genes)
+} else{
+  anchors <- FindTransferAnchors(reference = subset_ref_data,
+                               query = subset_querry_data,
+                               normalization.method = "SCT",
+                               features = all_genes)
+}
+
+# Number of anchors
+num_anchors <- nrow(anchors@anchors)
+
+if(ncol(subset_querry_data) < num_anchors){
+
+  # Calculate the k weight
+  k_weight <- calculate_k_weight(sample_size = num_anchors)
+
+  # Prediction with transfertData
+  prediction <- TransferData(anchorset = anchors,
+                           refdata = subset_ref_data$supertype_label,
+                           k.weight = k_weight
+                           )
+} else{
+  prediction <- TransferData(anchorset = anchors,
+                           refdata = subset_ref_data$supertype_label)
+}
+  
+  # Add to the metadata
+  subset_querry_data <- AddMetaData(subset_querry_data, metadata = prediction)
+  querry_data$supertype_label[Cells(subset_querry_data)] <- subset_querry_data$predicted.id
+}
+pdf(file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_seurat_labelTransfert.pdf")))
+
+
+### Plot UMAP
+Idents(querry_data) <- querry_data$supertype_label
+DimPlot(querry_data, group.by = "supertype_label", label.size = 6) + theme(legend.position = "bottom",legend.text=element_text(size=4))
+
+### Plot UMAP per labels
+labels <- unique(querry_data@meta.data$`supertype_label`)
+
+for (label in labels) {
+  plot <- DimPlot(querry_data, group.by = "supertype_label", cells.highlight = WhichCells(querry_data, expression = supertype_label == label)) + ggtitle(label)
+  print(plot)
+}
+
+### UMAP plot with the prediction
+FeaturePlot(querry_data, features = "prediction.score.max", reduction = "umap")
+
+dev.off()
+
+saveRDS(querry_data, file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID,paste0(SAMPLE_ID, "_seurat_labelTransfert.rds")))
+
+
+TEXT_OUTPUT <- snakemake@output[["seurat_labelTransfert_output"]]
+
+output_file<-file(TEXT_OUTPUT)
+writeLines(c("Rules seurat Label Transfert finished"), output_file)
+close(output_file)
+
+
+
+################################################################################################ LABEL TRANSFERT FROM REMI'S ANNOTATION : WT TO KO
+
+# #========================================================================
+# # Load the data
+# #========================================================================
+
+# ### Ouvrir la matrice de référence pour le NN
+# remi_nn <- readRDS("/scratch/lchabot/inmed_deChevigny_scrnaseq/scRNAseq_analysis_workflow/01_Reference/remi/GEcortex_subRef_cluster_class.list.rds")
+# remi_nn <- remi_nn[["P30.scRNA-seq.10X"]]
+# gc()
+
+# ### Matrice à annoter 
+# # querry_data <- readRDS("/scratch/lchabot/inmed_deChevigny_scrnaseq/scRNAseq_analysis_workflow/05_Output/02_seurat/P5_KO/P5_KO_filtered_seurat_object.rds")
+# querry_data <- readRDS("/scratch/lchabot/inmed_deChevigny_scrnaseq/scRNAseq_analysis_workflow/05_Output/02_seurat/P30_KO_integrated/P30_KO_integrated_filtered_seurat_object.rds")
+
+# querry_data <- SCTransform(querry_data, vars.to.regress = "percent.mt", assay = "RNA", new.assay.name = "SCT", verbose = FALSE)
+# gc()
+
+# ### Set the default assay to SCT for booth seurat object
+# DefaultAssay(remi_nn) <- "SCT"
+# DefaultAssay(querry_data) <- "SCT"
+
+# # Keep genes that are in common between querry and ref datasets
+# ref_genes <- as.data.frame(rownames(remi_nn))
+# querry_genes <- as.data.frame(rownames(querry_data))
+# all_genes <- intersect(ref_genes$`rownames(remi_nn)`, querry_genes$`rownames(querry_data)`)
+
+# #========================================================================
+# # Create functions
+# #========================================================================
+
+# # For findTransfertAnchors function
+# calculate_k_score <- function(num_cells, proportion = 0.1, min_value = 1) {
+#   k_score <- max(min_value, floor(num_cells * proportion))
+#   return(k_score)
+# }
+
+# # For transferData function
+# calculate_k_weight <- function(sample_size, proportion = 0.2, min_value = 1) {
+#   k_weight <- round(sample_size * proportion)
+#   return(max(min_value, k_weight))
+# }
+
+# #========================================================================
+# # Step 1 : annotate broad class 
+# #========================================================================
+
+# # Create a new column in metadata
+# querry_data$class_label <- NA
+
+# K_SCORE <- 30 # Seurat option
+
+# if(ncol(querry_data) < K_SCORE){
+
+#   # Calculate a new k_score
+#   k_score_reference <- calculate_k_score(num_cells = ncol(remi_nn))
+#   k_score_query <- calculate_k_score(num_cells = ncol(querry_data))
+#   k_score <- min(k_score_reference, k_score_query)
+
+#   anchors <- FindTransferAnchors(reference = remi_nn,
+#                                query = querry_data,
+#                                k.score = k_score,
+#                                normalization.method = "SCT",
+#                                features = all_genes)
+# } else{
+#   anchors <- FindTransferAnchors(reference = remi_nn,
+#                                query = querry_data,
+#                                normalization.method = "SCT",
+#                                features = all_genes)
+# }
+
+# # Number of anchors
+# num_anchors <- nrow(anchors@anchors)
+
+# if(ncol(querry_data) < num_anchors){
+
+#   # Calculate the k weight
+#   k_weight <- calculate_k_weight(sample_size = num_anchors)
+
+#   # Prediction with transfertData
+#   prediction <- TransferData(anchorset = anchors,
+#                            refdata = remi_nn$class_label,
+#                            k.weight = k_weight
+#                            )
+# } else{
+#   prediction <- TransferData(anchorset = anchors,
+#                            refdata = remi_nn$class_label)
+# }
+
+# querry_data <- AddMetaData(querry_data, metadata = prediction)
+# querry_data$class_label[Cells(querry_data)] <- querry_data$predicted.id
+
+# # Plot UMAP
+# DimPlot(querry_data, group.by = "class_label", label.size = 6) + theme(legend.position = "bottom",legend.text=element_text(size=4))
+
+# # Plot UMAP per labels
+# labels <- unique(querry_data@meta.data$`class_label`)
+
+# for (label in labels) {
+#   plot <- DimPlot(querry_data, group.by = "class_label", cells.highlight = WhichCells(querry_data, expression = class_label == label)) + ggtitle(label)
+#   print(plot)
+# }
+
+# ### UMAP plot with the prediction
+# FeaturePlot(querry_data, features = "prediction.score.max", reduction = "umap")
+
+
+# #=============================================
+# # Apply a threshold
+# #=============================================
+
+# ### Remove cells that are not in the rest of the studies
+# querry_data <- subset(querry_data, subset = class_label != "Non-Neuronal")
+# querry_data <- subset(querry_data, subset = class_label != "Immature/Migrating")
+# querry_data <- subset(querry_data, subset = class_label != "Undetermined")
+# querry_data <- subset(querry_data, subset = class_label != "CR")
+
+# DimPlot(querry_data, group.by = "class_label")
+
+# ggplot(querry_data@meta.data, aes(x = prediction.score.max)) + 
+#   geom_histogram(binwidth = 0.05) + 
+#   theme_minimal() + 
+#   xlab("Prediction Score Max") + 
+#   ylab("Frequency") + 
+#   ggtitle("Distribution of Prediction Scores (Broad Class)")
+
+# # Calculate mean and standard deviation
+# mean_score <- mean(querry_data@meta.data$prediction.score.max)
+# print(mean_score)
+
+# sd_score <- sd(querry_data@meta.data$prediction.score.max)
+# print(sd_score)
+
+# # Define threshold
+# threshold <- mean_score - sd_score
+# print(threshold)
+
+# # Filter cells
+# querry_data <- subset(querry_data, subset = prediction.score.max >= threshold)
+# gc()
+
+# DimPlot(querry_data, group.by = "class_label", label.size = 6) + theme(legend.position = "bottom",legend.text=element_text(size=4))
+
+# labels <- unique(querry_data@meta.data$`class_label`)
+
+# for (label in labels) {
+#   plot <- DimPlot(querry_data, group.by = "class_label", cells.highlight = WhichCells(querry_data, expression = class_label == label)) + ggtitle(label)
+#   print(plot)
+# }
+
+# #========================================================================
+# # Step 2 : annotate with subclass
+# #========================================================================
+
+# ref_data <- readRDS(file = file.path(OUTPUTDIR, STEP_REMI, "P30_WT_remi.rds"))
+# premier_modele <- ref_data@assays[["SCT"]]@SCTModel.list[[1]]
+# ref_data@assays[["SCT"]]@SCTModel.list <- list(premier_modele)
+
+# ### Set the default assay to SCT for booth seurat object
+# DefaultAssay(ref_data) <- "SCT"
+# DefaultAssay(querry_data) <- "SCT"
+
+# querry_data$supertype_label <- NA
+
+# for (class in unique(querry_data$class_label)) {
+#   print("The supertype class :")
+#   print(class)
+  
+#   # Set idents
+#   Idents(querry_data) <- "class_label"
+#   Idents(ref_data) <- "class_label"
+  
+#   # Per class
+#   subset_querry_data <- subset(querry_data, idents = class)
+#   subset_ref_data <- subset(ref_data, idents = class)
+  
+#   # Check subset sizes
+#   print(paste("Number of cells in query subset:", ncol(subset_querry_data)))
+#   print(paste("Number of cells in reference subset:", ncol(subset_ref_data)))
+
+#   # Keep the same genes
+#   ref_genes <- as.data.frame(rownames(subset_ref_data))
+#   querry_genes <- as.data.frame(rownames(subset_querry_data))
+#   all_genes <- intersect(ref_genes$`rownames(subset_ref_data)`, querry_genes$`rownames(subset_querry_data)`)
+  
+#   if(ncol(subset_querry_data) < K_SCORE){
+
+#   # Calculate a new k_score
+#   k_score_reference <- calculate_k_score(num_cells = ncol(subset_ref_data))
+#   k_score_query <- calculate_k_score(num_cells = ncol(subset_querry_data))
+#   k_score <- min(k_score_reference, k_score_query)
+
+#   anchors <- FindTransferAnchors(reference = subset_ref_data,
+#                                query = subset_querry_data,
+#                                k.score = k_score,
+#                                normalization.method = "SCT",
+#                                features = all_genes)
+# } else{
+#   anchors <- FindTransferAnchors(reference = subset_ref_data,
+#                                query = subset_querry_data,
+#                                normalization.method = "SCT",
+#                                features = all_genes)
+# }
+
+# # Number of anchors
+# num_anchors <- nrow(anchors@anchors)
+
+# if(ncol(subset_querry_data) < num_anchors){
+
+#   # Calculate the k weight
+#   k_weight <- calculate_k_weight(sample_size = num_anchors)
+
+#   # Prediction with transfertData
+#   prediction <- TransferData(anchorset = anchors,
+#                            refdata = subset_ref_data$supertype_label,
+#                            k.weight = k_weight
+#                            )
+# } else{
+#   prediction <- TransferData(anchorset = anchors,
+#                            refdata = subset_ref_data$supertype_label)
+# }
+  
+#   # Add to the metadata
+#   subset_querry_data <- AddMetaData(subset_querry_data, metadata = prediction)
+#   querry_data$supertype_label[Cells(subset_querry_data)] <- subset_querry_data$predicted.id
+# }
+
+# ### Plot UMAP
+# Idents(querry_data) <- querry_data$supertype_label
+# DimPlot(querry_data, group.by = "supertype_label", label.size = 6) + theme(legend.position = "bottom",legend.text=element_text(size=4))
+
+# ### Plot UMAP per labels
+# labels <- unique(querry_data@meta.data$`supertype_label`)
+
+# for (label in labels) {
+#   plot <- DimPlot(querry_data, group.by = "supertype_label", cells.highlight = WhichCells(querry_data, expression = supertype_label == label)) + ggtitle(label)
+#   print(plot)
+# }
+
+# ### UMAP plot with the prediction
+# FeaturePlot(querry_data, features = "prediction.score.max", reduction = "umap")
+
+# saveRDS(querry_data, file = "Rplots.rds")
+
+################################################################################################ CREATE THE ARLOTTA REFERENCE SEURAT OBJECT (USED IN PREVIOUS STEPS)
+
+# # Read the gene expression matrix (genes as columns, cells as rows)
+# expression_matrix <- read.csv("/scratch/lchabot/inmed_deChevigny_scrnaseq/scRNAseq_analysis_workflow/01_Reference/reference_matrix.csv", row.names = 1)
+
+# expression_matrix <- t(expression_matrix)
+
+# # Read the metadata (cells as rows)
+# metadata <- read.csv("/scratch/lchabot/inmed_deChevigny_scrnaseq/scRNAseq_analysis_workflow/01_Reference/metadata.csv", row.names = 1)
+
+# cell_id_column <- "NEW_NAME"
+# rownames(metadata) <- metadata[[cell_id_column]]
+# metadata <- metadata[, !names(metadata) %in% cell_id_column]
+# metadata <- metadata[rownames(metadata) %in% colnames(expression_matrix), ]
+# expression_matrix <- expression_matrix[, colnames(expression_matrix) %in% rownames(metadata)]
+
+# # Create the Seurat object
+# seurat_object <- CreateSeuratObject(counts = expression_matrix)
+
+# # Add metadata to the Seurat object
+# seurat_object <- AddMetaData(seurat_object, metadata = metadata)
+
+# # Check the Seurat object
+# print("Metadata in Seurat object:")
+# print(head(seurat_object@meta.data))
+
+# saveRDS(seurat_object, "/scratch/lchabot/inmed_deChevigny_scrnaseq/scRNAseq_analysis_workflow/01_Reference/modify_arlotta_seurat_object.rds")
\ No newline at end of file
diff --git a/03_Script/remi.R b/03_Script/remi.R
index 559be00..ada6bcd 100644
--- a/03_Script/remi.R
+++ b/03_Script/remi.R
@@ -1,14 +1,14 @@
-#..............................................................
-# Preparation of new metadatas using Remi work for the Allen
-# and Di Bella references
-#..............................................................
+#=======================================================================================
+# Create the arlotta reference with Remi's work
+#=======================================================================================
 
 #-------------------------------------
 # Library
 #-------------------------------------
 library(Seurat)
-library(data.table)
 library(ggplot2)
+library(dplyr)
+library(data.table)
 
 #-------------------------------------
 # Path to files / folders
@@ -17,87 +17,245 @@ DIRECTORY = getwd()
 OUTPUTDIR = file.path((DIRECTORY), "05_Output")
 REF = file.path((DIRECTORY), "01_Reference")
 
-STEP2 = "02_seurat/"
-STEP3 = "03_sims/"
 STEP_REMI = "03_remi/"
 
-dir.create(file.path(OUTPUTDIR, STEP_REMI))
+dir.create(file.path(OUTPUTDIR, STEP_REMI)) ### A voir !!!!!
 
-#-------------------------------------
-# Fichier celltype manipulation
-#-------------------------------------
+#====================================
+# Open the arlotta metadata :
+# https://singlecell.broadinstitute.org/single_cell/study/SCP1290/molecular-logic-of-cellular-diversification-in-the-mammalian-cerebral-cortex
+#====================================
+arlotta_metadata <- fread(file = file.path(REF, "arlotta/metaData_scDevSC.txt"))
+
+### Remove some low quality cells an keep only NN, Migrating/Immature and Dorsal Pallium Progenitor :
+patterns_to_delete <- c("Interneurons", "Cajal Retzius cells", "UL CPN", "Layer 4", "DL CPN", "DL_CPN_1", "DL_CPN_2", "SCPN", "NP", "CThPN", "Layer 6b", "Doublet", "group", "Low quality cells", "Red blood cells") 
+rows_to_delete <- apply(sapply(patterns_to_delete, grepl, arlotta_metadata$New_cellType), 1, any)
+arlotta_metadata <- subset(arlotta_metadata, !rows_to_delete)
+
+### Keep only some column
+arlotta_metadata <- subset(arlotta_metadata, select = c("NAME", "New_cellType", "orig_ident")) # Cells, subclass (for NN), and the age of the mouse
+
+### Add the class that corespond to each New Celltype => Rémi's thesis
+pattern_class <- c("Apical progenitors" = "Dorsal Pallium Progenitor",
+                   "Astrocytes" = "Non Neuronal",
+                   "Cycling glial cells" = "Non Neuronal",
+                   "Endothelial cells" = "Non Neuronal",
+                   "Ependymocytes" = "Non Neuronal",
+                   "Immature neurons" = "Immature/Migrating",
+                   "Intermediate progenitors" = "Dorsal Pallium Progenitor",
+                   "Microglia" = "Non Neuronal",
+                   "Migrating neurons" = "Immature/Migrating",
+                   "Oligodendrocytes" = "Non Neuronal",
+                   "Pericytes" = "Non Neuronal",
+                   "VLMC" = "Non Neuronal")
+
+assign_class <- function(cell_type, patterns) {
+  match <- patterns[grep(cell_type, names(patterns))]
+  if (length(match) > 0) {
+    return(match)
+  } else {
+    return(NA)
+  }
+}
 
-# Open the RDS and write the structure as a .txt file :
-remi_celltypes <- readRDS(file.path(REF, "remi/GECortex_PostMsub_celltypefinal.rds" )) # ~ 16 GB
-sink(file.path(OUTPUTDIR, STEP_REMI, "remi_celltypefinal_structure.txt" ))
-str(remi_celltypes)
-sink()
+arlotta_metadata$class_label <- sapply(arlotta_metadata$New_cellType, assign_class, pattern_class)
+arlotta_metadata$supertype_label <- arlotta_metadata$New_cellType
+arlotta_metadata$celltype_label <- arlotta_metadata$New_cellType
 
-# Write the metadata as a csv file :
-remi_celltypes_metadata <- remi_celltypes@meta.data
-write.csv(remi_celltypes_metadata, file.path(OUTPUTDIR, STEP_REMI, "remi_celltypefinal_metadata.csv"), row.names = TRUE)
+### Rename column for a futur merge
+setnames(arlotta_metadata, "NAME", "sample_name")
+setnames(arlotta_metadata, "New_cellType", "subclass_label")
+setnames(arlotta_metadata, "orig_ident", "biosample_id")
+
+#====================================
+# Open Remi's metadata : reanotation
+# of the arlotta reference
+#====================================
+rds_remi <- readRDS(file.path(REF, "remi/GECortex_PostMsub_celltypefinal.rds" ))
+rds_metadata_remi <- rds_remi@meta.data
 
-# write metadata for both Arlotta and Allen reference :
+rm("rds_remi")
+gc()
+
+### Looking for the arlotta annotation in the whole metadata
 motif_arlotta <- "J.Di Bella et al., Nature 2021"
-remi_arlotta_metadata <- subset(remi_celltypes_metadata, grepl(motif_arlotta, remi_celltypes_metadata[["study_label"]]))
-write.csv(remi_arlotta_metadata, file.path(OUTPUTDIR, STEP_REMI, "remi_arlotta_metadata.csv"), row.names = TRUE)
+remi_arlotta_metadata <- subset(rds_metadata_remi, grepl(motif_arlotta, rds_metadata_remi[["study_label"]]))
 
-motif_allen <- "Yao et al., Cell 2021"
-remi_allen_metadata <- subset(remi_celltypes_metadata, grepl(motif_allen, remi_celltypes_metadata[["study_label"]]))
-remi_allen_metadata <- remi_allen_metadata[remi_allen_metadata$platform_label %in% "10X", ] # I only use this matrix in my analysis
-write.csv(remi_allen_metadata, file.path(OUTPUTDIR, STEP_REMI, "remi_allen_metadata.csv"), row.names = TRUE)
+### Only keep some column
+remi_arlotta_metadata <- subset(remi_arlotta_metadata, select = c("sample_name", "class_label", "biosample_id", "supertype_label", "subclass_label", "celltype_label"))
 
-#-------------------------------------
-# ARLOTTA 
-#-------------------------------------
-# Open the metadata
-arlotta_metadata <- fread(file = file.path(REF, "arlotta/metaData_scDevSC.txt"))
+### Merge the two metadata 
+modify_arlotta_metadata <- rbind(remi_arlotta_metadata, arlotta_metadata)
+
+### Create a new column for cell name, to only keep A,T,C,C (to match the matrix) + write it as csv
+modify_arlotta_metadata <- as.data.table(modify_arlotta_metadata)
+modify_arlotta_metadata <- modify_arlotta_metadata[, new_sample_name := gsub("[^ATCG]", "", sample_name)]
+write.csv(modify_arlotta_metadata, file = file.path(OUTPUTDIR, STEP_REMI, "modify_arlotta_metadata.csv"), row.names = FALSE)
+
+#====================================
+# Match our arlotta matrices to 
+# the new metadata
+# https://singlecell.broadinstitute.org/single_cell/study/SCP1290/molecular-logic-of-cellular-diversification-in-the-mammalian-cerebral-cortex
+#====================================
+
+### Path of every h5 matrices in a list
+list_files <- list(
+E10 <- file.path(REF, "arlotta/GSM5277843_E10_v1_filtered_feature_bc_matrix.h5"),
+E11 <- file.path(REF, "arlotta/GSM4635072_E11_5_filtered_gene_bc_matrices_h5.h5"),
+E12 <- file.path(REF, "arlotta/GSM4635073_E12_5_filtered_gene_bc_matrices_h5.h5"),
+E13 <- file.path(REF, "arlotta/GSM4635074_E13_5_filtered_gene_bc_matrices_h5.h5"),
+E14 <- file.path(REF, "arlotta/GSM4635075_E14_5_filtered_gene_bc_matrices_h5.h5"),
+E15 <- file.path(REF, "arlotta/GSM4635076_E15_5_S1_filtered_gene_bc_matrices_h5.h5"),
+E16 <- file.path(REF, "arlotta/GSM4635077_E16_filtered_gene_bc_matrices_h5.h5"),
+E17 <- file.path(REF, "arlotta/GSM5277844_E17_5_filtered_feature_bc_matrix.h5"),
+E18_S1 <- file.path(REF, "arlotta/GSM4635078_E18_5_S1_filtered_gene_bc_matrices_h5.h5"),
+E18_S3 <- file.path(REF, "arlotta/GSM4635079_E18_S3_filtered_gene_bc_matrices_h5.h5"),
+P1 <- file.path(REF, "arlotta/GSM4635081_P1_S2_filtered_gene_bc_matrices_h5.h5"),
+P1_S1 <- file.path(REF, "arlotta/GSM4635080_P1_S1_filtered_gene_bc_matrices_h5.h5"),
+P4 <- file.path(REF, "arlotta/GSM5277845_P4_filtered_feature_bc_matrix.h5")
+)
+
+list_files <- setNames(list_files, c("E10", "E11", "E12", "E13", "E14", "E15", "E16", "E17", "E18_S1", "E18_S3", "P1", "P1_S1", "P4"))
 
-# Remove every cells that are not Non Neuronal
-strings_to_remove <- c("Interneurons", "Cajal Retzius cells", "UL CPN", "Layer 4", "DL CPN", "DL_CPN_1", "DL_CPN_2", "SCPN", "NP", "CThPN", "Layer 6b", "Doublet", "group", "Low quality cells", "Red blood cells")
-for (pattern in strings_to_remove) {
-  rows_to_remove <- grep(pattern, arlotta_metadata$New_cellType)
-  arlotta_metadata <- arlotta_metadata[-rows_to_remove, ]
+list_reference_matrix <- list()
+
+### Read every h5 fie
+for (x in names(list_files)){
+    filepath <- list_files[[x]]
+    list_reference_matrix[[x]] <- Read10X_h5(filepath)
 }
 
-# Keep only some column in both files for a futur merge
-arlotta_metadata <- subset(arlotta_metadata, select = c("NAME", "New_cellType", "orig_ident")) # Cells, subclass (for NN), and the age of the mouse
-remi_arlotta_metadata <- subset(remi_arlotta_metadata, select = c("sample_name", "celltype_label", "biosample_id"))# Cells, celltype (for GABAergic and Glutamatergic), and the age of the mouse
+### Make sure the cellnames are written in the same way as the metadata
+clean_colnames <- function(x){
+  current_colnames <- colnames(x)
+  modified_colnames <- gsub("[^ATCG]", "", current_colnames)
+  colnames(x) <- modified_colnames
+  return(x)
+}
+list_reference_matrix <- lapply(list_reference_matrix, clean_colnames)
 
-# Give the two data table the same column name for the merge
-setnames(arlotta_metadata, "NAME", "sample_name")
-setnames(arlotta_metadata, "New_cellType", "celltype_label")
-setnames(arlotta_metadata, "orig_ident", "biosample_id")
+print("The diferent matrix for diferent developmental time have been loaded in a list")
 
-# Merge the two files
-final_arlotta <- rbind(remi_arlotta_metadata, arlotta_metadata)
-write.csv(final_arlotta, file.path(OUTPUTDIR, STEP_REMI, "final_arlotta.csv"), row.names = FALSE)
+### Keep cells that match both the metadata and the matrix
+metadata <- data.table()
 
-#-------------------------------------
-# ALLEN
-#-------------------------------------
-# Open the metadata
-allen_metadata <- fread(file = file.path(REF, "allen/metadata.csv"))
+for (x in names(list_reference_matrix)) {
+  print("The developmental time :")
+  print(x)
+  if (x %in% modify_arlotta_metadata$biosample_id) {
+    rows_metadata <- modify_arlotta_metadata[modify_arlotta_metadata$biosample_id == x] # Look in the metadata and keep only the rows that corespond to the matrix (Keep P4 rows for the P4 matrix for exemple)
+    cell_metadata <- rows_metadata$new_sample_name # Cells name in the metadata (With just A,T,G,C)
+    cell_reference_matrix <- colnames(list_reference_matrix[[x]]) # Cells in the matrix
+    matching_cells <- intersect(cell_metadata, cell_reference_matrix) # Look at cells that match in metadata and matrix
+    
+    # Create a temporary metadata for each matrix with only the cells that match and remove the others
+    temporary_metadata <- subset(rows_metadata, new_sample_name %in% matching_cells)
+    temporary_metadata$new_sample_name <- paste(x, temporary_metadata$new_sample_name, sep="_") # Add the developmental age in front of cell name
+
+    # Append the new metadata for each matrix to the final one
+    metadata <- rbind(metadata, temporary_metadata) 
+    
+    # Now keep only the matching cells in the matrix and remove the others
+    matching_indices <- which(cell_reference_matrix %in% matching_cells)
+    list_reference_matrix[[x]] <- list_reference_matrix[[x]][, matching_indices, drop=FALSE]
+    colnames(list_reference_matrix[[x]]) <- paste(x, colnames(list_reference_matrix[[x]]), sep = "_") # Add the developmental age in front of cell name
+  }
+}
+
+print("The final size of the metadata : ")
+print(dim(metadata))
+
+reference_matrix <- do.call(cbind, list_reference_matrix)
+print("The size of the matrix of reference : ")
+print(dim(reference_matrix))
+
+#====================================
+# Create a Seurat Object + save it
+#====================================
+rownames(metadata) <- metadata$new_sampl e_name
+seurat_object <- CreateSeuratObject(counts = reference_matrix, meta.data = metadata)
+
+saveRDS(seurat_object, file = file.path(OUTPUTDIR, STEP_REMI, "modify_arlotta_seurat_object.rds"))
+
+
+################################################################################################ CREATE FILES TO HAVE ALL CLASS FOR SIMS ANNOTATION
+
+# #=====================================
+# # Fichier celltype manipulation
+# #=====================================
+# # Open the RDS and write the structure as a .txt file :
+# remi_celltypes <- readRDS(file.path(REF, "remi/GECortex_PostMsub_celltypefinal.rds" )) # ~ 16 GB
+# sink(file.path(OUTPUTDIR, STEP_REMI, "remi_celltypefinal_structure.txt" ))
+# str(remi_celltypes)
+# sink()
+
+# # Write the metadata as a csv file :
+# remi_celltypes_metadata <- remi_celltypes@meta.data
+# write.csv(remi_celltypes_metadata, file.path(OUTPUTDIR, STEP_REMI, "remi_celltypefinal_metadata.csv"), row.names = TRUE)
+
+# # write metadata for both Arlotta and Allen reference :
+# motif_arlotta <- "J.Di Bella et al., Nature 2021"
+# remi_arlotta_metadata <- subset(remi_celltypes_metadata, grepl(motif_arlotta, remi_celltypes_metadata[["study_label"]]))
+# write.csv(remi_arlotta_metadata, file.path(OUTPUTDIR, STEP_REMI, "remi_arlotta_metadata.csv"), row.names = TRUE)
+
+# # motif_allen <- "Yao et al., Cell 2021"
+# remi_allen_metadata <- subset(remi_celltypes_metadata, grepl(motif_allen, remi_celltypes_metadata[["study_label"]]))
+# remi_allen_metadata <- remi_allen_metadata[remi_allen_metadata$platform_label %in% "10X", ] # I only use this matrix in my analysis
+# write.csv(remi_allen_metadata, file.path(OUTPUTDIR, STEP_REMI, "remi_allen_metadata.csv"), row.names = TRUE)
+
+# #=====================================
+# # ARLOTTA 
+# #=====================================
+# # Open the metadata
+# arlotta_metadata <- fread(file = file.path(REF, "arlotta/metaData_scDevSC.txt"))
+
+# # Remove every cells that are not Non Neuronal
+# strings_to_remove <- c("Interneurons", "Cajal Retzius cells", "UL CPN", "Layer 4", "DL CPN", "DL_CPN_1", "DL_CPN_2", "SCPN", "NP", "CThPN", "Layer 6b", "Doublet", "group", "Low quality cells", "Red blood cells")
+# for (pattern in strings_to_remove) {
+#   rows_to_remove <- grep(pattern, arlotta_metadata$New_cellType)
+#   arlotta_metadata <- arlotta_metadata[-rows_to_remove, ]
+# }
 
-# Keep only cells that are Non Neuronal
-strings_to_keep <- c("Astro", "Endo", "Micro-PVM", "Oligo", "SMC-Peri", "VLMC")
-allen_metadata <- allen_metadata[allen_metadata$subclass_label %in% strings_to_keep]
+# # Keep only some column in both files for a futur merge
+# arlotta_metadata <- subset(arlotta_metadata, select = c("NAME", "New_cellType", "orig_ident")) # Cells, subclass (for NN), and the age of the mouse
+# remi_arlotta_metadata <- subset(remi_arlotta_metadata, select = c("sample_name", "supertype_label", "biosample_id"))# Cells, celltype (for GABAergic and Glutamatergic), and the age of the mouse
 
-# Keep only some column in both files for a futur merge
-allen_metadata <- subset(allen_metadata, select = c("sample_name", "subclass_label"))# Cells, subclass (for NN)
-remi_allen_metadata <- subset(remi_allen_metadata, select = c("exp_component_name", "celltype_label"))
+# # Give the two data table the same column name for the merge
+# setnames(arlotta_metadata, "NAME", "sample_name")
+# setnames(arlotta_metadata, "New_cellType", "supertype_label")
+# setnames(arlotta_metadata, "orig_ident", "biosample_id")
 
-# Give the two data table the same column name for the merge
-setnames(remi_allen_metadata, "exp_component_name", "sample_name")
-setnames(allen_metadata, "subclass_label", "celltype_label")
+# # Merge the two files
+# final_arlotta <- rbind(remi_arlotta_metadata, arlotta_metadata)
+# write.csv(final_arlotta, file.path(OUTPUTDIR, STEP_REMI, "supertype_final_arlotta.csv"), row.names = FALSE)
 
-# Merge the two files
-final_allen <- rbind(remi_allen_metadata, allen_metadata)
-write.csv(final_allen, file.path(OUTPUTDIR, STEP_REMI, "final_allen.csv"), row.names = FALSE)
+# #=====================================
+# # ALLEN
+# #=====================================
+# # Open the metadata
+# allen_metadata <- fread(file = file.path(REF, "allen/metadata.csv"))
 
+# # Keep only cells that are Non Neuronal
+# strings_to_keep <- c("Astro", "Endo", "Micro-PVM", "Oligo", "SMC-Peri", "VLMC")
+# allen_metadata <- allen_metadata[allen_metadata$subclass_label %in% strings_to_keep]
 
+# # Keep only some column in both files for a futur merge
+# allen_metadata <- subset(allen_metadata, select = c("sample_name", "subclass_label"))# Cells, subclass (for NN)
+# remi_allen_metadata <- subset(remi_allen_metadata, select = c("exp_component_name", "celltype_label"))
 
+# # Give the two data table the same column name for the merge
+# setnames(remi_allen_metadata, "exp_component_name", "sample_name")
+# setnames(allen_metadata, "subclass_label", "celltype_label")
 
+# # Merge the two files
+# final_allen <- rbind(remi_allen_metadata, allen_metadata)
+# write.csv(final_allen, file.path(OUTPUTDIR, STEP_REMI, "final_allen.csv"), row.names = FALSE)
 
+################################################################################################ CREATE FILES TO HAVE REMI'S ANNOTATION 
 
+# remi <- readRDS(file.path(REF, "remi/GECortex_PostMsub_celltypefinal.rds" )) # ~ 16 GB
+# cell_names <- grep("^P30_wt_", colnames(remi), value = TRUE)
+# remi <- subset(remi, cells = cell_names) 
+# saveRDS(remi, file = file.path(OUTPUTDIR, STEP_REMI, "P30_WT_remi.rds"))
 
+################################################################################################ LABEL TRANSFERT FROM THE REFERENCE : ARLOTTA AND ALLEN 
\ No newline at end of file
diff --git a/03_Script/seurat.Rmd b/03_Script/seurat.Rmd
index 5989e15..dbee8fd 100644
--- a/03_Script/seurat.Rmd
+++ b/03_Script/seurat.Rmd
@@ -1,8 +1,8 @@
 ---
-title: "Single-cell RNAseq analysis of gastruloids"
+title: "Single-cell RNAseq analysis of the somatosensory cortex"
 subtitle: "Seurat | QC and Clustering"
 author:
-- Thomas Vannier
+- Léa Chabot
 date: "`r format(Sys.time(), '%d %B %Y')`"
 output:
     html_document:
@@ -117,6 +117,10 @@ Top `r VARIABLE_FEATURES_SHOWTOP` most variable annotations:
 <<findVariableGenes_summaryTable>>
 ```
 
+```{r scaleData, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<scaleData>>
+```
+
 <br /><br />
 
 ```{r, echo = FALSE}
diff --git a/04_Workflow/remi.smk b/04_Workflow/remi.smk
index 5710b35..b18fd1f 100644
--- a/04_Workflow/remi.smk
+++ b/04_Workflow/remi.smk
@@ -6,7 +6,7 @@ rule remi:
         remi_output = expand(OUTPUTDIR + "remi_output.txt"),
 
     conda:
-        CONTAINER + "seurat.yaml"
+        CONTAINER + "preparation_sims.yaml"
 
     message:
         "TEST REMI"
diff --git a/04_Workflow/seurat.smk b/04_Workflow/seurat.smk
index 883a3ce..db8919e 100644
--- a/04_Workflow/seurat.smk
+++ b/04_Workflow/seurat.smk
@@ -52,4 +52,50 @@ rule seurat:
         "Run Seurat for the clustering"
 
     script:
-        SCRIPTDIR + "seurat_reports_compilation.R"
\ No newline at end of file
+        SCRIPTDIR + "seurat_reports_compilation.R"
+
+#----------------------------------------
+# Label Transfert with Seurat
+#----------------------------------------
+
+rule seurat_labelTransfert:
+    input:
+        seurat_output = expand(OUTPUTDIR + "02_seurat/seurat_output.txt"),
+
+    output:
+        seurat_labelTransfert_output = expand(OUTPUTDIR + "02_seurat/seurat_labelTransfert_output.txt"),
+
+    conda:
+        CONTAINER + "seurat.yaml"
+
+    params:
+        sample_id = expand("{sample_id.id}", sample_id = sample_id.itertuples()),
+
+    message:
+        "Run Seurat for the transfert data"
+
+    script:
+        SCRIPTDIR + "labelTransfert_seurat.R"
+
+#----------------------------------------
+# DEG analysis with Seurat
+#----------------------------------------
+
+rule seurat_DEG:
+    input:
+        seurat_labelTransfert_output = expand(OUTPUTDIR + "02_seurat/seurat_labelTransfert_output.txt"),
+
+    output:
+        seurat_DEG_output = expand(OUTPUTDIR + "04_diffexp/seurat_DEG_output.txt"),
+
+    conda:
+        CONTAINER + "seurat.yaml"
+
+    params:
+        time_point = config["diffexp"]["time_point"],
+
+    message:
+        "Run Seurat for the DE analysis"
+
+    script:
+        SCRIPTDIR + "DEG_analysis.R"
diff --git a/Snakefile b/Snakefile
index 387861e..c9c2c6d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -78,9 +78,13 @@ rule all:
 		# demuxlet = OUTPUTDIR + "01_cellranger/Mix_MM_lines/outs/demuxlet_Mix_MM_lines.best",
 		# tabdemuxlet = OUTPUTDIR + "01_cellranger/Mix_MM_lines/outs/demuxlet_Mix_MM_lines.tsv",
 		### Seurat
-		seurat_report = expand(OUTPUTDIR + "02_seurat/{sample_id}/{sample_id}_seurat_report.html", sample_id = SAMPLE_ID),
+		# seurat_report = expand(OUTPUTDIR + "02_seurat/{sample_id}/{sample_id}_seurat_report.html", sample_id = SAMPLE_ID),
 		### Remi data
 		# remi_output = expand(OUTPUTDIR + "remi_output.txt"),
+		### Label Transfert with seurat
+		# seurat_labelTransfert_output = expand(OUTPUTDIR + "02_seurat/seurat_labelTransfert_output.txt"),
+		### DEG analysis with seurat
+		seurat_DEG_output = expand(OUTPUTDIR + "04_diffexp/seurat_DEG_output.txt"),
 		### Prepare data for SIMS
 		# data_for_sims_output = expand(OUTPUTDIR + "03_sims/data_for_sims_output.txt"),
 		# anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"),
diff --git a/config.yaml b/config.yaml
index d565fea..914eac4 100644
--- a/config.yaml
+++ b/config.yaml
@@ -11,9 +11,9 @@ sample: sample.tsv
 # If you run the workflow with slurm (sbatch run_slurm.sh)
 run_slurm: true
 # If you data are demultiplexed = true
-run_demultiplex: false
+run_demultiplex: true
 # If your data are not demultiplexed = true
-run_multiplex: true
+run_multiplex: false
 
 rules:
     # Optimizing and assembling genome annotations for 3’ single-cell RNA-sequencing analysis
@@ -110,12 +110,12 @@ seurat:
     min_features: 200
     # Cells with number of UMIs outside the range will be excluded
     # Can be a list of UMIs but MUST be on the same order than sample.tsv id.
-    filter_umi_min: "0"
-    filter_umi_max: "7500"
+    filter_umi_min: "500"
+    filter_umi_max: ""
     # Filter cells that have unique feature counts over/less n
     # Can be a list of feature counts but MUST be on the same order than sample.tsv id.
-    filter_feature_min: "0"
-    filter_feature_max: "4000"
+    filter_feature_min: "500"
+    filter_feature_max: ""
     # Filter cells that have > n% mitochondrial counts
     filter_percent_mt: "10"
     # Filter cells that have < n% mitochondrial counts
@@ -142,9 +142,9 @@ seurat:
 
     ##### PCA parameters
     # Number of dimention to show for the vizualisation
-    dims: 30
+    dims: 50
     # Default number of dimensions to use for PCA (see Seurat::RunPCA())
-    pca_npc: 25
+    pca_npc: 
     # Number of dimensions to show in PCA-related plots
     pca_plots_nbdims : 3
     #  Number of'top' features to show when plotting PCA loadings
@@ -152,11 +152,11 @@ seurat:
 
     ##### Dimensionality reduction parameters (TSNE/UMAP)
     # Number of dimensions to use from PCA results
-    dimreduc_use_pca_nbdims : 25
+    dimreduc_use_pca_nbdims :
 
     ##### Cluster identification parameters
     # Number of dimensions to use from PCA results
-    findclusters_use_pca_nbdims : 30
+    findclusters_use_pca_nbdims :
     # Nearest-neighbor graph construction
     findneighbors_k : 30
     # Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities.
@@ -172,7 +172,7 @@ reference_sims:
     # For now : arlotta.R, allen_mouse_whole_cortex.R
     r_script: "arlotta.R"
     # Output folder name for the reference
-    reference_name: "03_Arlotta_not_norm"
+    reference_name: "02_Arlotta_not_norm"
     # Output file name for the reference metadata
     output_name_ref_metadata: "metadata"
     # Output file name for the reference matrix
@@ -181,11 +181,11 @@ reference_sims:
     output_name_matrix: "data_matrix"
     # Name of the column that contain the cells
     # allen mouse whole cortex : "sample_name", arlotta : "NEW_NAME"
-    cells_column: "NEW_NAME"
+    cells_column: "sample_name"
 
     ##### Our Matrix to annotate #####
     # The name of the folder of the sample you want to annotate (One of the output in the seurat folder)
-    sample_id: "P5_WT"
+    sample_id: "E14sn"
 
     ##### Paola Arlotta #####
     # Path for the arlotta metadata
@@ -224,7 +224,7 @@ sims:
 
     ##### Logger
     # Name of the project (Will be the name of the run and checkpoint)
-    project_name: "03_Arlotta_not_norm"
+    project_name: "02_Arlotta_not_norm"
 
     ##### Early Stopping
     # The validation metric you wish to monitor for early stop.
@@ -233,11 +233,11 @@ sims:
     patience: 20
 
     # Epoch file from wandb to chosse for the cell assignation
-    epoch: "epoch=40-step=74128.ckpt"
+    epoch: "epoch=58-step=80004.ckpt"
 
     ##### Cells annotation
     # Threshold of the difference between the first and second score of the prediction to say if a cell is unknown or not
-    threshold: 0.0
+    threshold: 0.1
     # Seurat object with sims cell_assignation
     rds_assignation: filtered_assigned_seurat_object.rds
     # Matrix with cell predtion after filtration with the threshold
@@ -245,6 +245,8 @@ sims:
 
 # Differentially expressed features analyses
 diffexp:
+    # The name of the time point analysis (without WT or KO, ex : E14)
+    time_point : "E14"
     # Features of interest
     features: "SOX10,SOX9,RIPOR2,MITF,DCT,MYC,FLT1,TNFRSF11B,XIRP2,ST3GAL1"
     # Cells of interest to find differentially expressed features against "Normal" or Wild Type cells

From 250e78ba0f843f7a9bfeea3a15ab0577c2554fde Mon Sep 17 00:00:00 2001
From: LeaChabot <lea-chabot@orange.fr>
Date: Fri, 18 Oct 2024 10:36:21 +0200
Subject: [PATCH 15/15] Seurat workflow + label transfert

---
 02_Container/labelTransfer.yaml              |  14 +
 03_Script/01_prepare_data.R                  |  61 +-
 03_Script/02_variables_genes.R               |   6 +-
 03_Script/03_cell_heterogeneity_PCA.R        | 154 +---
 03_Script/04_cell_heterogeneity_tSNE_UMAP.R  |  45 +-
 03_Script/05_cell_heterogeneity_clustering.R |  56 +-
 03_Script/allen_mouse_whole_cortex.R         | 126 ++-
 03_Script/arlotta.R                          |   2 +-
 03_Script/labelTransfert_seurat.R            | 802 +++++++++----------
 03_Script/remi.R                             |  10 +-
 03_Script/seurat.Rmd                         | 246 +-----
 04_Workflow/seurat.smk                       |  40 +-
 06_Schemas/config.schema.yaml                | 328 ++++----
 Snakefile                                    |   6 +-
 config.yaml                                  |  62 +-
 15 files changed, 849 insertions(+), 1109 deletions(-)
 create mode 100644 02_Container/labelTransfer.yaml

diff --git a/02_Container/labelTransfer.yaml b/02_Container/labelTransfer.yaml
new file mode 100644
index 0000000..2c97461
--- /dev/null
+++ b/02_Container/labelTransfer.yaml
@@ -0,0 +1,14 @@
+channels :
+    - r
+    - bioconda
+    - conda-forge
+    - defaults
+dependencies :
+    - r-ggplot2 =3.5.1
+    - r-seurat =5.1.0
+    - r-dplyr =1.1.4
+    - r-sctransform=0.4.1  # Include sctransform
+    - r-devtools  # Required for installing packages from GitHub
+    - bioconductor-glmgampoi
+
+ 
\ No newline at end of file
diff --git a/03_Script/01_prepare_data.R b/03_Script/01_prepare_data.R
index 38100a8..8a72bd5 100644
--- a/03_Script/01_prepare_data.R
+++ b/03_Script/01_prepare_data.R
@@ -23,31 +23,41 @@ STEP2 = "02_seurat/"
 source(file.path(dirname(DIRECTORY), "03_Script/00_general_deps.R"))
 
 in_data_dir = file.path( OUTPUTDIR, STEP1)
+in_data_dir_demux = file.path( OUTPUTDIR, STEP2)
 samples <- dir(in_data_dir)
 
 MIN_CELLS <- as.numeric(MIN_CELLS)
 MIN_FEATURES <- as.numeric(MIN_FEATURES)
-HTO <- unlist(strsplit(HTO, ","))
+# HTO <- unlist(strsplit(HTO, ","))
+
+options(Seurat.object.assay.version = 'v3')
 
 #........................................
 # Read data
 #........................................
-seurat_data <- Read10X(paste0(in_data_dir,SAMPLE_ID,CELL_RANGER_COUNT_PATH))
-
-# Need a condition if data are depultiplexed with cell ranger multi
 if( RUN_DEMULTIPLEX == TRUE){
+        seurat_data <- Read10X(paste0(in_data_dir,SAMPLE_ID,CELL_RANGER_COUNT_PATH))
         seurat_obj <- CreateSeuratObject(counts = seurat_data, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID)
-} else if (as.logical(MULTIPLEX) == TRUE) {
-        seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) 
-        hto <- seurat_data$`Antibody Capture`
-        hto <- hto[c(HTO), ]
-        seurat_obj[["HTO"]] <- CreateAssayObject(counts = hto)
-        seurat_obj <- subset(x = seurat_obj, subset = nCount_HTO > 0)
-        seurat_obj <- NormalizeData(seurat_obj, assay = "HTO", normalization.method = "CLR")       
-} else{
-        seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) 
+} else if (RUN_DEMULTIPLEX == FALSE) {
+        seurat_obj <- readRDS(paste0(in_data_dir_demux,SAMPLE_ID, "/demultiplex_", SAMPLE_ID, ".rds"))     
 }
 
+
+
+# # Need a condition if data are depultiplexed with cell ranger multi
+# if( RUN_DEMULTIPLEX == TRUE){
+#         seurat_obj <- CreateSeuratObject(counts = seurat_data, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID)
+# } else if (as.logical(MULTIPLEX) == TRUE) {
+#         seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) 
+#         hto <- seurat_data$`Antibody Capture`
+#         hto <- hto[c(HTO), ]
+#         seurat_obj[["HTO"]] <- CreateAssayObject(counts = hto)
+#         seurat_obj <- subset(x = seurat_obj, subset = nCount_HTO > 0)
+#         seurat_obj <- NormalizeData(seurat_obj, assay = "HTO", normalization.method = "CLR")       
+# } else{
+#         seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) 
+# }
+
 seurat_obj$SampleID <- SAMPLE_ID
 
 # ..........................................................................................................
@@ -379,18 +389,20 @@ ggplot( seurat_obj[[]][order( seurat_obj[["outlier"]]),], # Plot FALSE first and
   labs( x = "# Genes", y = "# UMIs") +
   theme( legend.position = "none")
 
-# Mitochondrial vs ribosomal distributions
-if(exists( "mito.drop") && exists( "ribo.drop"))
+if(exists( "mito.drop"))
 {
   ggplot( seurat_obj[[]][order( seurat_obj[["outlier"]]),], # Plot FALSE first and TRUE after
-          aes( x = percent.rb, 
+          aes( x = nCount_RNA, 
                y = percent.mt, 
                color = outlier)) + 
     geom_point( size = 0.5) +
-    geom_vline( xintercept = FILTER_PERCENT_RB, linetype = 2, color = "blue", alpha = 0.5) +
+    geom_vline( xintercept = c( FILTER_FEATURE_MIN, FILTER_FEATURE_MAX), 
+              linetype = 2, 
+              color = c( "blue", "red")[!sapply( list( FILTER_FEATURE_MIN, FILTER_FEATURE_MAX), is.null)], 
+              alpha = 0.5) +
     geom_hline( yintercept = FILTER_PERCENT_MT, linetype = 2, color = "red", alpha = 0.5) +
     scale_color_manual( values = c( "TRUE" = "#FF0000AA", "FALSE" = "#44444444")) + 
-    labs( x = "% Ribosomal genes", y = "% Mitochondrial genes") +
+    labs( x = "# UMIs", y = "% Mitochondrial genes") +
     theme( legend.position = "none")
 }
 
@@ -413,15 +425,8 @@ if( QC_EXPLORATION_MODE == FALSE){
 #              sep="\t");
 
 # ..........................................................................................................
-## @knitr normalizeData
+## @knitr SCTransform
 # ..........................................................................................................
 
-# Normalize the count data present in a given assay
-seurat_obj = NormalizeData( object = seurat_obj,
-                       normalization.method = NORM_METHOD,
-                       scale.factor = NORM_SCALE_FACTOR,
-                       verbose = .VERBOSE)
-
-# # Scales and centers features in the dataset
-# seurat_obj = ScaleData( object = seurat_obj,
-#                    verbose = .VERBOSE)
+# # Normalize the count data present in a given assay
+seurat_obj = SCTransform(seurat_obj, vars.to.regress = "percent.mt", verbose = FALSE)
diff --git a/03_Script/02_variables_genes.R b/03_Script/02_variables_genes.R
index 704cfae..44f54bb 100644
--- a/03_Script/02_variables_genes.R
+++ b/03_Script/02_variables_genes.R
@@ -35,7 +35,11 @@ suppressMessages( suppressWarnings( LabelPoints( plot = VariableFeaturePlot( seu
 ## @knitr findVariableGenes_summaryTable
 # ..........................................................................................................
 
-# Extract variable genes info as data.frame
+# Extract variable genes info as data.frame    feature_select_method: "vst"
+    # Number of features to select as top variable features; only used when select_method is set to 'dispersion' or 'vst'
+    variable_features: 2000
+    # For table in report
+    variable_features_showtop : 200
 variableAnnotationsDT = head( HVFInfo( object = seurat_obj, assay = "RNA", selection.method = FEATURE_SELECT_METHOD)[VariableFeatures( seurat_obj),], VARIABLE_FEATURES_SHOWTOP);
 variableAnnotationsDT = cbind("Gene" = rownames(variableAnnotationsDT), variableAnnotationsDT);
 
diff --git a/03_Script/03_cell_heterogeneity_PCA.R b/03_Script/03_cell_heterogeneity_PCA.R
index f37de19..e3286f3 100644
--- a/03_Script/03_cell_heterogeneity_PCA.R
+++ b/03_Script/03_cell_heterogeneity_PCA.R
@@ -3,39 +3,31 @@
 #  of cells using dimension reduction techniques
 # ########################################################
 
-# ..........................................................................................................
-## @knitr scaleData
-# ..........................................................................................................
-
-# Scales and centers features in the dataset
-seurat_obj = ScaleData( object = seurat_obj,
-                   verbose = .VERBOSE)
-
 # ..........................................................................................................
 ## @knitr HTODemux
 # ..........................................................................................................
 
-if(as.logical(MULTIPLEX) == TRUE){
-  # Demultiplexing data 
-  seurat_obj <- HTODemux(seurat_obj, assay = "HTO", positive.quantile = 0.99)
-  cat("<br><br>Removed cells after filtering Doublet et Negative:", sum(seurat_obj[["hash.ID"]] == "Negative" | seurat_obj[["hash.ID"]] == "Doublet"));
-  cat("<br><br>Remaining cells after filtering:", sum(seurat_obj$hash.ID %in% HTO));
-  cat("\n<br>\n");
-  seurat_obj <- subset(seurat_obj, idents = c("Doublet","Negative"), invert = TRUE) ### Keep only the Singlet cells (Remove Negative and Doublet)
+# if(as.logical(MULTIPLEX) == TRUE){
+#   # Demultiplexing data 
+#   seurat_obj <- HTODemux(seurat_obj, assay = "HTO", positive.quantile = 0.99)
+#   cat("<br><br>Removed cells after filtering Doublet et Negative:", sum(seurat_obj[["hash.ID"]] == "Negative" | seurat_obj[["hash.ID"]] == "Doublet"));
+#   cat("<br><br>Remaining cells after filtering:", sum(seurat_obj$hash.ID %in% HTO));
+#   cat("\n<br>\n");
+#   seurat_obj <- subset(seurat_obj, idents = c("Doublet","Negative"), invert = TRUE) ### Keep only the Singlet cells (Remove Negative and Doublet)
 
-  seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = "mean.var.plot")
-  seurat_obj <- ScaleData(seurat_obj, features = VariableFeatures(seurat_obj))
-  seurat_obj <- RunPCA(seurat_obj)
-  seurat_obj <- RunUMAP(seurat_obj, dims = 1:25)
-  print(DimPlot(seurat_obj))
-}
+#   seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = "mean.var.plot")
+#   seurat_obj <- ScaleData(seurat_obj, features = VariableFeatures(seurat_obj))
+#   seurat_obj <- RunPCA(seurat_obj)
+#   seurat_obj <- RunUMAP(seurat_obj, dims = 1:25)
+#   print(DimPlot(seurat_obj))
+# }
 
 # ..........................................................................................................
 ## @knitr ldr
 # ..........................................................................................................
 
 # Perform linear dimensional reduction
-seurat_obj <- RunPCA(seurat_obj, features = VariableFeatures(object = seurat_obj))
+seurat_obj <- RunPCA(seurat_obj, assay = "SCT", verbose = FALSE)
 VizDimLoadings(seurat_obj, dims = 1:DIMS, reduction = "pca")
 # DimPlot(seurat_obj, reduction = "pca", group.by = "orig.ident")
 # DimPlot(seurat_obj, reduction = "pca", group.by = "categorie")
@@ -47,106 +39,30 @@ DimHeatmap(seurat_obj, dims = 1:DIMS, cells = 500, balanced = TRUE)
 
 ElbowPlot(seurat_obj, ndims = DIMS)
 
+# https://hbctraining.github.io/scRNA-seq/lessons/elbow_plot_metric.html
+pct <- seurat_obj[["pca"]]@stdev / sum(seurat_obj[["pca"]]@stdev) * 100
+cumu <- cumsum(pct)
+co1 <- which(cumu > 90 & pct < 5)[1]
+co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
+pcs <- min(co1, co2)
+# Create a dataframe with values
+plot_df <- data.frame(pct = pct, 
+           cumu = cumu, 
+           rank = 1:length(pct))
+
+# Elbow plot to visualize 
+  ggplot(plot_df, aes(cumu, pct, label = rank, color = rank > pcs)) + 
+  geom_text() + 
+  geom_vline(xintercept = 90, color = "grey") + 
+  geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
+  theme_bw()
+  
 # ..........................................................................................................
 ## @knitr heterogeneity_pca
 # ..........................................................................................................
 
 # Compute PCA on selected variable genes
-nbPC = PCA_NPC
-if(PCA_NPC > length(Cells(seurat_obj)))
-{
-  warning( paste0( "Number of cells in object (", length(Cells(seurat_obj)), ") smaller than requested number of PCs (", PCA_NPC,"), setting lower PC number..." ))
-  nbPC = length(Cells(seurat_obj))
-}           
 seurat_obj <- RunPCA( object = seurat_obj,
-                 features = VariableFeatures( seurat_obj),
-                 npcs     = nbPC,
-                 verbose  = .VERBOSE);
-
-# Plot PCA, highlighting seurat clusters for combination of dimensions
-invisible( apply( combn( 1:PCA_PLOTS_NBDIMS, 2), 2, function(dims)
-{
-  print( DimPlot( object = seurat_obj, reduction = "pca", dims = dims, group.by = "orig.ident") +
-           theme( legend.position = "none"));                              
-}));
-
-# ..........................................................................................................
-## @knitr heterogeneity_pca_umisCounts
-# ..........................................................................................................
-
-# Same PCA plots but highlight UMI counts
-invisible( apply( combn( 1:PCA_PLOTS_NBDIMS, 2), 2, function(dims)
-{
-  print( FeaturePlot( seurat_obj, feature = "nCount_RNA", reduction = "pca", dims = dims) +
-           theme( legend.position = "none",
-                  plot.margin = margin(0, 0, 0, 0, "cm"),
-                  plot.title = element_blank()));
-}));
-
-# ..........................................................................................................
-## @knitr heterogeneity_pca_genesCounts
-# ..........................................................................................................
-
-# Same PCA plots but highlight feature counts
-invisible( apply( combn( 1:PCA_PLOTS_NBDIMS, 2), 2, function(dims)
-{
-  print( FeaturePlot( seurat_obj, feature = "nFeature_RNA", reduction = "pca", dims = dims) +
-           theme( legend.position = "none",
-                  plot.margin = margin(0, 0, 0, 0, "cm"),
-                  plot.title = element_blank()));
-}));
-
-# ..........................................................................................................
-## @knitr heterogeneity_pca_correlations
-# ..........................................................................................................
-
-# Isolate the PCA location of cells in first dimensions, together with UMI and genes counts for correlation analysis (makes use of cbind recycling to repeat values for each stacked PC)
-relationToPC = suppressWarnings( cbind( stack( as.data.frame( Embeddings( seurat_obj, reduction = "pca")[ rownames( seurat_obj[[]]), paste0( "PC_", 1:PCA_PLOTS_NBDIMS) ])),
-                                        seurat_obj[[ c( "nCount_RNA", "nFeature_RNA") ]],
-                                        Cluster = Idents( seurat_obj)));
-
-# Plot relationship of UMIs and genes counts with PCs (combine plots using '/' from 'patchwork' lib)
-print( (ggplot( data = relationToPC, aes( x = values, y = nCount_RNA)) +
-          facet_wrap( ~ind) +
-          stat_binhex( bins = 60) +
-          #geom_point( aes(col = Cluster), alpha = 0.5) +
-          geom_smooth( method = 'lm') +
-          stat_cor( method = "spearman") +
-          ylab( "# UMIs") +
-          theme( axis.title.x = element_blank(),
-                 axis.text.x = element_blank(),
-                 axis.ticks.x = element_blank(),
-                 plot.margin = unit( c( 1, 1, -0.5, 0.5), "lines")))
-       /
-         (ggplot( data = relationToPC, aes( x = values, y = nFeature_RNA)) +
-            facet_wrap( ~ind) +
-            stat_binhex( bins = 60) +
-            #geom_point( aes(col = Cluster), alpha = 0.5) +
-            geom_smooth( method = 'lm') +
-            stat_cor( method = "spearman") +
-            xlab( "PC values") +
-            ylab( "# Genes")));
-
-# ..........................................................................................................
-## @knitr heterogeneity_pca_loadings
-# ..........................................................................................................
-
-# Plot PCA loadings
-invisible( apply( combn( 1:PCA_PLOTS_NBDIMS, 2), 2, function(dims)
-{
-  namesPC=paste0( "PC_", dims);
-  # Get the loading values for concerned PCs
-  loadingsMatrix = Loadings( seurat_obj, reduction = "pca")[ , namesPC ];
-  # Sort features by average absolute value and convert as DF with features names as column
-  loadingsMatrix = head( loadingsMatrix[ order( apply( loadingsMatrix, 1, function(x){ mean( abs( x)) }), decreasing = TRUE), ], PCA_PLOTS_NBFEATURES);
-  loadingsDF = data.frame( loadingsMatrix, features = rownames( loadingsMatrix));
-  
-  # Define symmetric and consistent axes for group of plots
-  axesLimit = max( abs( loadingsMatrix));
-  
-  # Plot arrows and features name
-  print( ggplot( data = loadingsDF, aes_( x = as.name( namesPC[1]), y = as.name( namesPC[2]))) +
-           coord_cartesian( xlim = c( -axesLimit, axesLimit), ylim = c( -axesLimit, axesLimit)) +
-           geom_text_repel( aes( label = features), max.iter = 10000) + #, max.overlaps = Inf
-           geom_segment( x = 0 , y = 0, aes_( xend = as.name(namesPC[1]), yend = as.name(namesPC[2])), col = "#00000044", arrow = arrow( length = unit( 2, "mm"))));
-}));
+                 npcs     = pcs,
+                 verbose  = .VERBOSE,
+                 assay = "SCT");
diff --git a/03_Script/04_cell_heterogeneity_tSNE_UMAP.R b/03_Script/04_cell_heterogeneity_tSNE_UMAP.R
index 20d8212..8ffc74b 100644
--- a/03_Script/04_cell_heterogeneity_tSNE_UMAP.R
+++ b/03_Script/04_cell_heterogeneity_tSNE_UMAP.R
@@ -7,21 +7,14 @@
 ## @knitr heterogeneity_dimReduc
 # ..........................................................................................................
 
-nbPC_dimreduc = DIMREDUC_USE_PCA_NBDIMS
-if(DIMREDUC_USE_PCA_NBDIMS > nbPC)
-{
-  warning( paste0( "Number of computed PCs  (", nbPC, ") smaller than requested PCs for 'dimreduc' (", DIMREDUC_USE_PCA_NBDIMS,"), setting lower PC number (", nbPC, ")..." ))
-  nbPC_dimreduc = nbPC
-}
-
 #........................................
 # Dimensional reduction (tSNE / UMAP)
 #........................................
 # Runs the Uniform Manifold Approximation and Projection (UMAP) dimensional reduction technique
-seurat_obj = RunUMAP( seurat_obj, dims = 1:nbPC_dimreduc);
+seurat_obj = RunUMAP( seurat_obj, dims = 1:pcs);
 
 # Run t-SNE dimensionality reduction on selected features
-seurat_obj = RunTSNE( seurat_obj, dims = 1:nbPC_dimreduc);
+seurat_obj = RunTSNE( seurat_obj, dims = 1:pcs);
 
 # Save resulting coordinates for all cells as 'tsv' files
 # write.table( Embeddings(seurat_obj, reduction = "umap"), 
@@ -42,23 +35,6 @@ seurat_obj = RunTSNE( seurat_obj, dims = 1:nbPC_dimreduc);
 ## @knitr dimreduc_ggplot_covariables
 # ..........................................................................................................
 
-#........................................
-# UMAP
-#........................................
-# # Plot the cells sample of origin in UMAP
-# DimPlot( seurat_obj, reduction = ifelse(exists("useReduction"), useReduction, "umap"),
-#          group.by = "orig.ident") + 
-#   ggtitle( "Map of cells by sample of origin")
-
-
-# # if QC EXPLORATION MODE is active, plot if the cells are outliers or not
-# if( QC_EXPLORATION_MODE == TRUE){
-#   print( DimPlot( seurat_obj, reduction = ifelse(exists("useReduction"), useReduction, "umap"),
-#                   group.by = "outlier") + 
-#                   ggtitle( "Map of cells by \nQC filtering status")
-#   )
-# }
-
 # Plot the cells percentage of ribosomal genes in UMAP
 FeaturePlot( seurat_obj, reduction = ifelse(exists("useReduction"), useReduction, "umap"), 
              features = "percent.rb") +
@@ -119,23 +95,6 @@ if( QC_EXPLORATION_MODE == TRUE){
 ## @knitr dimreduc_ggplot_covariables_tSNE
 # ..........................................................................................................
 
-#........................................
-# TSNE
-#........................................
-# # Plot the cells sample of origin in tSNE
-# DimPlot( seurat_obj, reduction = ifelse(exists("useReduction"), useReduction, "tsne"),
-#          group.by = "orig.ident") + 
-#   ggtitle( "Map of cells by sample of origin")
-
-
-# # if QC EXPLORATION MODE is active, plot if the cells are outliers or not
-# if( QC_EXPLORATION_MODE == TRUE){
-#   print( DimPlot( seurat_obj, reduction = ifelse(exists("useReduction"), useReduction, "tsne"),
-#                   group.by = "outlier") + 
-#                   ggtitle( "Map of cells by \nQC filtering status")
-#   )
-# }
-
 # Plot the cells percentage of ribosomal genes in tSNE
 FeaturePlot( seurat_obj, reduction = ifelse(exists("useReduction"), useReduction, "tsne"), 
              features = "percent.rb") +
diff --git a/03_Script/05_cell_heterogeneity_clustering.R b/03_Script/05_cell_heterogeneity_clustering.R
index 82ea05d..d58bb37 100644
--- a/03_Script/05_cell_heterogeneity_clustering.R
+++ b/03_Script/05_cell_heterogeneity_clustering.R
@@ -12,19 +12,11 @@ source(file.path(dirname(DIRECTORY), "03_Script/00_general_deps.R"))
 #........................................
 # Identify clusters
 #........................................
-# Identify clusters of cells by graph approach
-nbPC_findclusters = FINDCLUSTERS_USE_PCA_NBDIMS
-if(FINDCLUSTERS_USE_PCA_NBDIMS > nbPC)
-{
-  warning( paste0( "Number of computed PCs  (", nbPC, ") smaller than requested PCs for 'findclusters' (", FINDCLUSTERS_USE_PCA_NBDIMS,"), setting lower PC number (", nbPC, ")..." ))
-  nbPC_findclusters = nbPC
-}
 
 # Computes the k.param nearest neighbors for a given dataset
 seurat_obj <- FindNeighbors(object    = seurat_obj,
-                            k.param   = FINDNEIGHBORS_K, 
                             reduction = "pca",
-                            dims      = 1:nbPC_findclusters,
+                            dims      = 1:pcs,
                             verbose   = .VERBOSE);
 
 # Identify clusters of cells by a shared nearest neighbor (SNN) modularity optimization based clustering algorithm
@@ -294,26 +286,26 @@ if( exists( "CLUSTER_GROUP_LIST")){
 # Save each seurat object
 saveRDS(seurat_obj, file = file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID,"_filtered_seurat_object.rds")))
 
-# save a tsv file for each sample
-write.table( seurat_obj[[]], 
-             file= file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_filtered_seurat_object.tsv")), 
-             quote = FALSE, 
-             row.names = TRUE, 
-             col.names = NA, # Add a blank column name for row names (CSV convention)
-             sep="\t");
-
-# Save the count matrix as csv file 
-count_matrix <- GetAssayData(object = seurat_obj[["RNA"]], slot = "counts")
-count_matrix <- as.matrix(count_matrix)
-count_matrix <- t(count_matrix)
-count_matrix <- as.data.frame(count_matrix)
-
-fwrite(x = count_matrix, file = file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_count_matrix.csv")), row.names = TRUE)
-
-# Save the normalize matrix as csv file
-normalized_matrix <- GetAssayData(object = seurat_obj[["RNA"]], slot = "data")
-normalized_matrix <- as.matrix(normalized_matrix)
-normalized_matrix <- t(normalized_matrix)
-normalized_matrix <- as.data.frame(normalized_matrix)
-
-fwrite(x = normalized_matrix , file = file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv")), row.names = TRUE)
+# # save a tsv file for each sample
+# write.table( seurat_obj[[]], 
+#              file= file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_filtered_seurat_object.tsv")), 
+#              quote = FALSE, 
+#              row.names = TRUE, 
+#              col.names = NA, # Add a blank column name for row names (CSV convention)
+#              sep="\t");
+
+# # Save the count matrix as csv file 
+# count_matrix <- GetAssayData(object = seurat_obj[["RNA"]], slot = "counts")
+# count_matrix <- as.matrix(count_matrix)
+# count_matrix <- t(count_matrix)
+# count_matrix <- as.data.frame(count_matrix)
+
+# fwrite(x = count_matrix, file = file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_count_matrix.csv")), row.names = TRUE)
+
+# # Save the normalize matrix as csv file
+# normalized_matrix <- GetAssayData(object = seurat_obj[["RNA"]], slot = "data")
+# normalized_matrix <- as.matrix(normalized_matrix)
+# normalized_matrix <- t(normalized_matrix)
+# normalized_matrix <- as.data.frame(normalized_matrix)
+
+# fwrite(x = normalized_matrix , file = file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv")), row.names = TRUE)
diff --git a/03_Script/allen_mouse_whole_cortex.R b/03_Script/allen_mouse_whole_cortex.R
index b3a6daf..acc72f8 100644
--- a/03_Script/allen_mouse_whole_cortex.R
+++ b/03_Script/allen_mouse_whole_cortex.R
@@ -38,6 +38,96 @@ STEP_REMI = "03_remi/"
 
 METADATA_REMI <- TRUE
 
+#-------------------------------------
+# Library
+#-------------------------------------
+library(rhdf5)
+library(data.table)
+library(Seurat)
+library(dplyr)
+
+#-------------------------------------
+# Path to files / folders
+#-------------------------------------
+DIRECTORY = getwd()
+TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]]
+OUTPUTDIR = file.path((DIRECTORY), "05_Output")
+REF = file.path((DIRECTORY), "01_Reference")
+
+SAMPLE_ID = snakemake@params[["sample_id"]]
+REFERENCE_NAME = snakemake@params[["reference_name"]]
+
+NORM_METHOD = snakemake@params[["norm_method"]]
+NORM_SCALE_FACTOR = snakemake@params[["norm_scale_factor"]]
+
+ALLEN_METADATA = snakemake@params[["allen_metadata"]]
+ALLEN_MATRIX = snakemake@params[["allen_matrix"]]
+
+OUTPUT_NAME_REF_METADATA = snakemake@params[["output_name_ref_metadata"]]
+OUTPUT_NAME_REF_MATRIX = snakemake@params[["output_name_ref_matrix"]]
+OUTPUT_NAME_MATRIX = snakemake@params[["output_name_matrix"]]
+ 
+STEP2 = "02_seurat/"
+STEP3 = "03_sims/"
+STEP_REMI = "03_remi/"
+
+METADATA_REMI <- TRUE
+
+
+metadata <- fread(file = file.path(REF, ALLEN_METADATA))
+print("Reference metadata loaded. It's size is :")
+print(dim(metadata))
+
+metadata <- metadata[metadata$region_label %in% "SSp",]
+print("Only SSp are kept in the metadata: The new size of this one is :")
+print(dim(metadata))
+
+# Load the HDF5 matrix
+h5_file <- H5Fopen(file.path(REF, ALLEN_MATRIX))
+h5ls(h5_file)
+expression_matrix <- h5read(h5_file, "/data/counts")
+gene_names <- h5read(h5_file, "/data/gene")
+sample_names <- h5read(h5_file, "/data/samples")
+H5Fclose(h5_file)
+
+colnames(expression_matrix) <- as.character(gene_names)  # Si nécessaire
+rownames(expression_matrix) <- as.character(sample_names)  # Si nécessaire
+
+expression_matrix <- expression_matrix[rownames(expression_matrix) %in% metadata$sample_name,]
+metadata <- metadata[metadata$sample_name %in% rownames(expression_matrix),]
+
+print("The metadata and matrix of reference have the same cells now : The new size of them two :")
+print(dim(expression_matrix))
+print(dim(metadata))
+
+gc()
+
+expression_matrix <- t(expression_matrix)
+print("The matrix is transposed")
+
+gc()
+
+colnames(expression_matrix) <- gsub("_", "-", original_sample_names)
+
+seurat_object <- CreateSeuratObject(counts = expression_matrix, project = "allen_seuratObject")
+saveRDS(seurat_object, file = file.path(REF, "allen_seurat_object.rds"))
+
+rm("seurat_object")
+
+gc()
+
+#### Pour une raison que j'ignore cette partie ne marche pas, je dois le faire en local
+
+seurat_object <- readRDS(file = file.path(REF, "allen_seurat_object.rds"))
+
+metadata$sample_name <- gsub("_", "-", metadata$sample_name)
+metadata <- metadata[match(colnames(seurat_object), metadata$sample_name), ]
+seurat_object <- AddMetaData(object = seurat_object, metadata = metadata)
+
+saveRDS(seurat_object, file = file.path(REF, "allen_seurat_object_with_metadata.rds"))
+
+
+
 # #-------------------------------------
 # # Load the metadata of the reference
 # #-------------------------------------
@@ -135,28 +225,28 @@ METADATA_REMI <- TRUE
 # print("The matrix of reference is now normalized the same way as our matrix to analyze: Here the first 10 rows and 10 columns :")
 # print(reference_matrix[1:5, 1:25])
 
-#-------------------------------------
-# Load our matrix to annotate
-#-------------------------------------
-dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
+# #-------------------------------------
+# # Load our matrix to annotate
+# #-------------------------------------
+# dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID))
 
-matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv")))
-matrix <- as.matrix(matrix)
-rownames(matrix) <- matrix[,"V1"]
-matrix <- matrix[, -1]
+# matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv")))
+# matrix <- as.matrix(matrix)
+# rownames(matrix) <- matrix[,"V1"]
+# matrix <- matrix[, -1]
 
-print("Our matrix to annotate is loaded, see the dimension below :")
-print(dim(matrix))
+# print("Our matrix to annotate is loaded, see the dimension below :")
+# print(dim(matrix))
 
-# -------------------------------------
-# Wtrite the two new matrix
-# -------------------------------------
-class(matrix) <- "numeric"
-write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
-print("Our matrix is written")
+# # -------------------------------------
+# # Wtrite the two new matrix
+# # -------------------------------------
+# class(matrix) <- "numeric"
+# write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv")))
+# print("Our matrix is written")
 
-rm("matrix")
-gc()
+# rm("matrix")
+# gc()
 
 # class(reference_matrix) <- "numeric"
 # write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_MATRIX,".csv")))
diff --git a/03_Script/arlotta.R b/03_Script/arlotta.R
index 632c8d4..72f2f59 100755
--- a/03_Script/arlotta.R
+++ b/03_Script/arlotta.R
@@ -126,7 +126,7 @@ print("The diferent matrix for diferent developmental time have been loaded in a
 list_reference_matrix <-  lapply(list_reference_matrix, function(x) as.matrix(x))
 list_reference_matrix <-  lapply(list_reference_matrix, function(x) t(x))
 
-print("The matrix have been normalized")
+# print("The matrix have been normalized")
 
 #-------------------------------------
 # Keep only A,T,G and C in every 
diff --git a/03_Script/labelTransfert_seurat.R b/03_Script/labelTransfert_seurat.R
index 13c0285..a5a777c 100644
--- a/03_Script/labelTransfert_seurat.R
+++ b/03_Script/labelTransfert_seurat.R
@@ -9,6 +9,9 @@ library(Seurat)
 library(ggplot2)
 library(dplyr)
 library(data.table)
+library(DoubletFinder)
+
+options(future.globals.maxSize = 3000 * 1024^2)
 
 #=====================================
 # Path to file and folder
@@ -17,528 +20,517 @@ DIRECTORY = getwd()
 OUTPUTDIR = file.path((DIRECTORY), "05_Output")
 REF = file.path((DIRECTORY), "01_Reference")
 
-SAMPLE_ID = snakemake@params[["sample_id"]]
-
-STEP2 = "02_seurat/"
-
-#========================================================================
-# Load the data
-#========================================================================
-
-### Reference dataset Allen 
-ref_data <- readRDS(file = file.path(REF, "/remi/refA21_all.rds"))
-premier_modele <- ref_data@assays[["SCT"]]@SCTModel.list[[1]]
-ref_data@assays[["SCT"]]@SCTModel.list <- list(premier_modele)
+SAMPLE_ID = "P30_WT"
 
-# ### Reference dataset arlotta
-# ref_data <- readRDS(file = file.path(REF, "/modify_arlotta_seurat_object.rds"))
-# ref_data <- PercentageFeatureSet(ref_data, pattern = "^mt-", col.name = "percent.mt")
-# ref_data <- SCTransform(ref_data, vars.to.regress = "percent.mt", assay = "RNA", new.assay.name = "SCT", verbose = FALSE)
+ALLEN = TRUE
 
-### Querry dataset
-querry_data <- readRDS(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_filtered_seurat_object.rds")))### To change here
-querry_data <- SCTransform(querry_data, vars.to.regress = "percent.mt", assay = "RNA", new.assay.name = "SCT", verbose = FALSE)
+STEP2 = "02_seurat/"
 
-# Change the default assay to SCT
-DefaultAssay(ref_data) <- "SCT"
-DefaultAssay(querry_data) <- "SCT"
+# #========================================================================
+# # Load the data an the reference
+# #========================================================================
 
-# Keep genes that are in common between querry and ref datasets
-ref_genes <- as.data.frame(rownames(ref_data))
-querry_genes <- as.data.frame(rownames(querry_data))
-all_genes <- intersect(ref_genes$`rownames(ref_data)`, querry_genes$`rownames(querry_data)`)
+## Our data
+so <- readRDS(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_filtered_seurat_object.rds")))
+
+if(ALLEN == TRUE){
+  ref <- readRDS(file = file.path(REF, "/allen_seurat_object_with_metadata.rds"))
+  ref <- PercentageFeatureSet(ref, pattern = "^mt-", col.name = "percent.mt")
+  ref <- SCTransform(ref, vars.to.regress = "percent.mt", verbose = FALSE, future.seed = FALSE)
+
+} else {
+  ## The arlotta reference
+  ref <- readRDS(file = file.path(REF, "/modify_arlotta_seurat_object.rds"))
+
+  # Keep only some cells in the reference
+  pattern <- "E14"
+  metadata <- ref@meta.data
+  filtered_metadata <- metadata[grepl(pattern, metadata$biosample_id), ]
+  cells_to_keep <- rownames(filtered_metadata)
+  ref <- subset(ref, cells = cells_to_keep)
+  ref <- PercentageFeatureSet(ref, pattern = "^mt-", col.name = "percent.mt")
+  ref <- SCTransform(ref, vars.to.regress = "percent.mt", verbose = FALSE)
+}
 
 #========================================================================
-# Create functions
+# Doublet Finder
 #========================================================================
 
-# For findTransfertAnchors function
-calculate_k_score <- function(num_cells, proportion = 0.1, min_value = 1) {
-  k_score <- max(min_value, floor(num_cells * proportion))
-  return(k_score)
-}
-
-# For transferData function
-calculate_k_weight <- function(sample_size, proportion = 0.2, min_value = 1) {
-  k_weight <- round(sample_size * proportion)
-  return(max(min_value, k_weight))
-}
+pct <- so[["pca"]]@stdev / sum(so[["pca"]]@stdev) * 100
+cumu <- cumsum(pct)
+co1 <- which(cumu > 90 & pct < 5)[1]
+co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
+pcs <- min(co1, co2)
+
+sweep.res <- paramSweep(so, PCs = 1:pcs, sct = TRUE)
+sweep.stats <- summarizeSweep(sweep.res, GT = FALSE)
+# bcmvn <- find.pK(sweep.stats)
+homotypic.prop <- modelHomotypic(so$seurat_clusters)
+nExp_poi <- round(0.075 * ncol(so))
+nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop))
+so <- doubletFinder(so, PCs = 1:pcs, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = FALSE, sct = TRUE)
+
+metadata_columns <- colnames(so@meta.data)
+pattern <- "DF.classifications_0.25_0.09_\\d+"  # Adjust the regex as needed
+matched_column <- metadata_columns[grepl(pattern, metadata_columns)]
+num_doublets <- sub("DF\\.classifications_0\\.25_0\\.09_(\\d+)", "\\1", matched_column)
+print(num_doublets)
+full_column_name <- paste0("DF.classifications_0.25_0.09_", num_doublets)
+cells_to_keep <- rownames(so@meta.data[so@meta.data[[full_column_name]] == "Singlet", ])
+so <- subset(so, cells = cells_to_keep)
 
 #========================================================================
-# Step 1 : annotate with broad class
+# Annotation
 #========================================================================
 
-# Create a new column in metadata
-querry_data$class_label <- NA
-
-K_SCORE <- 30 # Seurat option
+so$celltype_label <- NA
+all_genes <- intersect(rownames(ref), rownames(so))
+anchors <- FindTransferAnchors(reference = ref, query = so, dims = 1:30, normalization.method = "SCT", features = all_genes)
+prediction <- TransferData(anchorset = anchors, refdata = as.vector(ref$celltype_label))
+so <- AddMetaData(so, metadata = prediction)
+so$celltype_label <- so$predicted.id
 
-if(ncol(querry_data) < K_SCORE){
+pdf(file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_annotation_before_filtering_UMAP.pdf")), width = 8, height = 8)
+plot <- DimPlot(so, group.by = "celltype_label", label.size = 6) + theme(legend.position = "bottom", legend.text = element_text(size = 4))
+print(plot)
 
-  # Calculate a new k_score
-  k_score_reference <- calculate_k_score(num_cells = ncol(ref_data))
-  k_score_query <- calculate_k_score(num_cells = ncol(querry_data))
-  k_score <- min(k_score_reference, k_score_query)
-
-  anchors <- FindTransferAnchors(reference = ref_data,
-                               query = querry_data,
-                               k.score = k_score,
-                               normalization.method = "SCT",
-                               features = all_genes)
-} else{
-  anchors <- FindTransferAnchors(reference = ref_data,
-                               query = querry_data,
-                               normalization.method = "SCT",
-                               features = all_genes)
+# UMAP plot per label
+labels <- unique(so$celltype_label)
+for (label in labels) {
+  cells_to_highlight <- rownames(so@meta.data[so@meta.data$celltype_label == label, ])
+  plots <- DimPlot(so, group.by = "celltype_label", cells.highlight = cells_to_highlight) +
+           ggtitle(label) +
+           theme(legend.position = "bottom", legend.text = element_text(size = 4))
+  print(plots)
 }
+dev.off()
 
-# Number of anchors
-num_anchors <- nrow(anchors@anchors)
-
-if(ncol(querry_data) < num_anchors){
-
-  # Calculate the k weight
-  k_weight <- calculate_k_weight(sample_size = num_anchors)
+#========================================================================
+# Filtering bad predicted cells : KMeans
+#========================================================================
 
-  # Prediction with transfertData
-  prediction <- TransferData(anchorset = anchors,
-                           refdata = ref_data$class_label,
-                           k.weight = k_weight
-                           )
-} else{
-  prediction <- TransferData(anchorset = anchors,
-                           refdata = ref_data$class_label)
-}
+# set.seed(42)  # For reproducibility
+
+# metadata <- so@meta.data
+# metadata$cell_id <- rownames(metadata)
+# unique_labels <- unique(metadata$celltype_label)
+# thresholds <- data.frame(celltype_label = unique_labels, cluster_threshold = NA)
+
+# for (label in unique_labels) {
+#   subset_data <- metadata[metadata$celltype_label == label, ]
+#   print(label)
+#   print(nrow(subset_data))
+#   # Check if there are at least 2 cells for k-means clustering
+#   if (nrow(subset_data) <= 2) {
+#     # If fewer than 2 cells, set the threshold to 0 (no filtering)
+#     cluster_threshold <- 0
+#   } else {
+#     # Perform k-means clustering
+#     kmeans_result <- kmeans(subset_data$prediction.score.max, centers = 2)
+#     cluster_centers <- kmeans_result$centers
+#     high_confidence_cluster <- which.max(cluster_centers)
+#     cluster_threshold <- min(subset_data$prediction.score.max[kmeans_result$cluster == high_confidence_cluster])
+#   }
+  
+#   thresholds[thresholds$celltype_label == label, "cluster_threshold"] <- cluster_threshold
+# }
 
-querry_data <- AddMetaData(querry_data, metadata = prediction)
-querry_data$class_label[Cells(querry_data)] <- querry_data$predicted.id
+# # Merge the thresholds with the metadata
+# metadata <- merge(metadata, thresholds, by = "celltype_label")
 
-# Plot UMAP
+# # Apply the filtering based on the prediction score and cluster threshold
+# metadata <- metadata[metadata$prediction.score.max >= metadata$cluster_threshold, ]
 
-pdf(file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_seurat_labelTransfert.pdf")))
-DimPlot(querry_data, group.by = "class_label", label.size = 6) + theme(legend.position = "bottom",legend.text=element_text(size=4))
+# saveRDS(so, file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "annotate_seurat_objet_before_filtering.rds")))
 
-# Plot UMAP per labels
-labels <- unique(querry_data@meta.data$`class_label`)
+# so_clean <- subset(so, cells = metadata$cell_id)
 
-for (label in labels) {
-  plot <- DimPlot(querry_data, group.by = "class_label", cells.highlight = WhichCells(querry_data, expression = class_label == label)) + ggtitle(label)
-  print(plot)
-}
 
-### UMAP plot with the prediction
-FeaturePlot(querry_data, features = "prediction.score.max", reduction = "umap")
+# pdf(file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_annotation_after_filtering_UMAP.pdf")), width = 8, height = 8)
+# plot <- DimPlot(so_clean, group.by = "celltype_label", label.size = 6) + theme(legend.position = "bottom", legend.text = element_text(size = 4))
+# print(plot)
 
+# # UMAP plot per label
+# labels <- unique(so_clean$celltype_label)
+# for (label in labels) {
+#   cells_to_highlight <- rownames(so_clean@meta.data[so_clean@meta.data$celltype_label == label, ])
+#   plots <- DimPlot(so_clean, group.by = "celltype_label", cells.highlight = cells_to_highlight) +
+#            ggtitle(label) +
+#            theme(legend.position = "bottom", legend.text = element_text(size = 4))
+#   print(plots)
+# }
+# dev.off()
 
+# saveRDS(so_clean, file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "annotate_seurat_objet_after_filtering.rds")))
 
 #========================================================================
-# Step 2 : apply a threshold
+# Filtering bad predicted cells : Remi's
 #========================================================================
 
-ggplot(querry_data@meta.data, aes(x = prediction.score.max)) + 
-  geom_histogram(binwidth = 0.05) + 
-  theme_minimal() + 
-  xlab("Prediction Score Max") + 
-  ylab("Frequency") + 
-  ggtitle("Distribution of Prediction Scores (Broad Class)")
+# 1. Get all the prediction score column names, excluding "prediction.score.max"
+prediction_columns <- grep("^prediction\\.score\\.(?!max$)", colnames(so@meta.data), value = TRUE, perl = TRUE)
+
+# 2. Create a new column "filter_celltype_label" to store the filtered labels
+so@meta.data$filter_celltype_label <- "Nothing"  # Initialize with NA
+
+# 3. Loop through each row and perform the filtering based on the criteria
+for (i in 1:nrow(so@meta.data)) {
+  # Extract the prediction scores for the current cell as a numeric vector
+  scores <- as.numeric(so@meta.data[i, prediction_columns])
+
+  # Sort the scores in descending order and extract the top two
+  sorted_scores <- sort(scores, decreasing = TRUE)
+  best_score <- sorted_scores[1]
+  second_best_score <- sorted_scores[2]
+
+  # Get the labels corresponding to the top two scores
+  best_label <- prediction_columns[which.max(scores)]  # Find the label for the best score
+  second_best_label <- prediction_columns[which(scores == second_best_score)[1]]  # Find the label for the second-best score
+
+  # 4. Check if the difference between the best and second-best score is greater than 20%
+  if ((best_score - second_best_score) > 0.20) {
+    # Keep the label corresponding to the best score (remove "prediction.score." prefix)
+    so@meta.data$filter_celltype_label[i] <- gsub("prediction\\.score\\.", "", best_label)
+  } else {
+    # Otherwise, set the label as NA
+    so@meta.data$filter_celltype_label[i] <- "Nothing"
+  }
+}
 
-dev.off()
+saveRDS(so, file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "annotate_seurat_objet_before_filtering.rds")))
 
-# Calculate mean and standard deviation
-mean_score <- mean(querry_data@meta.data$prediction.score.max)
-print(mean_score)
+so_clean <- subset(so, subset = filter_celltype_label != "Nothing")
 
-sd_score <- sd(querry_data@meta.data$prediction.score.max)
-print(sd_score)
+pdf(file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_annotation_after_filtering_UMAP.pdf")), width = 8, height = 8)
+plot <- DimPlot(so_clean, group.by = "celltype_label", label.size = 6) + theme(legend.position = "bottom", legend.text = element_text(size = 4))
+print(plot)
 
-# Define threshold
-threshold <- mean_score - sd_score
-print(threshold)
+# UMAP plot per label
+labels <- unique(so_clean$celltype_label)
+for (label in labels) {
+  cells_to_highlight <- rownames(so_clean@meta.data[so_clean@meta.data$celltype_label == label, ])
+  plots <- DimPlot(so_clean, group.by = "celltype_label", cells.highlight = cells_to_highlight) +
+           ggtitle(label) +
+           theme(legend.position = "bottom", legend.text = element_text(size = 4))
+  print(plots)
+}
+dev.off()
 
-# Filter cells
-querry_data <- subset(querry_data, subset = prediction.score.max >= threshold)
-gc()
+saveRDS(so_clean, file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "annotate_seurat_objet_after_filtering.rds")))
 
-pdf(file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_seurat_labelTransfert.pdf")))
 
-DimPlot(querry_data, group.by = "class_label", label.size = 6) + theme(legend.position = "bottom",legend.text=element_text(size=4))
 
-labels <- unique(querry_data@meta.data$`class_label`)
 
-for (label in labels) {
-  plot <- DimPlot(querry_data, group.by = "class_label", cells.highlight = WhichCells(querry_data, expression = class_label == label)) + ggtitle(label)
-  print(plot)
-}
 
-dev.off()
 
-#========================================================================
-# Step 3 : annotate with supertype
-#========================================================================
 
-querry_data$supertype_label <- NA
 
-for (class in unique(querry_data$class_label)) {
-  print("The supertype class :")
-  print(class)
-  
-  # Set idents=
-  Idents(querry_data) <- "class_label"
-  Idents(ref_data) <- "class_label"
-  
-  # Per class
-  subset_querry_data <- subset(querry_data, idents = class)
-  subset_ref_data <- subset(ref_data, idents = class)
-  
-  # Check subset sizes
-  print(paste("Number of cells in query subset:", ncol(subset_querry_data)))
-  print(paste("Number of cells in reference subset:", ncol(subset_ref_data)))
-
-  # Keep the same genes
-  ref_genes <- as.data.frame(rownames(subset_ref_data))
-  querry_genes <- as.data.frame(rownames(subset_querry_data))
-  all_genes <- intersect(ref_genes$`rownames(subset_ref_data)`, querry_genes$`rownames(subset_querry_data)`)
-  
-  if(ncol(subset_querry_data) < K_SCORE){
-
-  # Calculate a new k_score
-  k_score_reference <- calculate_k_score(num_cells = ncol(subset_ref_data))
-  k_score_query <- calculate_k_score(num_cells = ncol(subset_querry_data))
-  k_score <- min(k_score_reference, k_score_query)
-
-  anchors <- FindTransferAnchors(reference = subset_ref_data,
-                               query = subset_querry_data,
-                               k.score = k_score,
-                               normalization.method = "SCT",
-                               features = all_genes)
-} else{
-  anchors <- FindTransferAnchors(reference = subset_ref_data,
-                               query = subset_querry_data,
-                               normalization.method = "SCT",
-                               features = all_genes)
-}
 
-# Number of anchors
-num_anchors <- nrow(anchors@anchors)
 
-if(ncol(subset_querry_data) < num_anchors){
 
-  # Calculate the k weight
-  k_weight <- calculate_k_weight(sample_size = num_anchors)
 
-  # Prediction with transfertData
-  prediction <- TransferData(anchorset = anchors,
-                           refdata = subset_ref_data$supertype_label,
-                           k.weight = k_weight
-                           )
-} else{
-  prediction <- TransferData(anchorset = anchors,
-                           refdata = subset_ref_data$supertype_label)
-}
-  
-  # Add to the metadata
-  subset_querry_data <- AddMetaData(subset_querry_data, metadata = prediction)
-  querry_data$supertype_label[Cells(subset_querry_data)] <- subset_querry_data$predicted.id
-}
-pdf(file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_seurat_labelTransfert.pdf")))
 
 
-### Plot UMAP
-Idents(querry_data) <- querry_data$supertype_label
-DimPlot(querry_data, group.by = "supertype_label", label.size = 6) + theme(legend.position = "bottom",legend.text=element_text(size=4))
 
-### Plot UMAP per labels
-labels <- unique(querry_data@meta.data$`supertype_label`)
 
-for (label in labels) {
-  plot <- DimPlot(querry_data, group.by = "supertype_label", cells.highlight = WhichCells(querry_data, expression = supertype_label == label)) + ggtitle(label)
-  print(plot)
-}
-
-### UMAP plot with the prediction
-FeaturePlot(querry_data, features = "prediction.score.max", reduction = "umap")
 
-dev.off()
 
-saveRDS(querry_data, file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID,paste0(SAMPLE_ID, "_seurat_labelTransfert.rds")))
 
 
-TEXT_OUTPUT <- snakemake@output[["seurat_labelTransfert_output"]]
 
-output_file<-file(TEXT_OUTPUT)
-writeLines(c("Rules seurat Label Transfert finished"), output_file)
-close(output_file)
 
 
 
-################################################################################################ LABEL TRANSFERT FROM REMI'S ANNOTATION : WT TO KO
 
-# #========================================================================
-# # Load the data
-# #========================================================================
 
-# ### Ouvrir la matrice de référence pour le NN
-# remi_nn <- readRDS("/scratch/lchabot/inmed_deChevigny_scrnaseq/scRNAseq_analysis_workflow/01_Reference/remi/GEcortex_subRef_cluster_class.list.rds")
-# remi_nn <- remi_nn[["P30.scRNA-seq.10X"]]
-# gc()
 
-# ### Matrice à annoter 
-# # querry_data <- readRDS("/scratch/lchabot/inmed_deChevigny_scrnaseq/scRNAseq_analysis_workflow/05_Output/02_seurat/P5_KO/P5_KO_filtered_seurat_object.rds")
-# querry_data <- readRDS("/scratch/lchabot/inmed_deChevigny_scrnaseq/scRNAseq_analysis_workflow/05_Output/02_seurat/P30_KO_integrated/P30_KO_integrated_filtered_seurat_object.rds")
 
-# querry_data <- SCTransform(querry_data, vars.to.regress = "percent.mt", assay = "RNA", new.assay.name = "SCT", verbose = FALSE)
-# gc()
 
-# ### Set the default assay to SCT for booth seurat object
-# DefaultAssay(remi_nn) <- "SCT"
-# DefaultAssay(querry_data) <- "SCT"
 
-# # Keep genes that are in common between querry and ref datasets
-# ref_genes <- as.data.frame(rownames(remi_nn))
-# querry_genes <- as.data.frame(rownames(querry_data))
-# all_genes <- intersect(ref_genes$`rownames(remi_nn)`, querry_genes$`rownames(querry_data)`)
 
 # #========================================================================
 # # Create functions
 # #========================================================================
 
-# # For findTransfertAnchors function
-# calculate_k_score <- function(num_cells, proportion = 0.1, min_value = 1) {
-#   k_score <- max(min_value, floor(num_cells * proportion))
-#   return(k_score)
-# }
-
-# # For transferData function
-# calculate_k_weight <- function(sample_size, proportion = 0.2, min_value = 1) {
-#   k_weight <- round(sample_size * proportion)
-#   return(max(min_value, k_weight))
-# }
+# # Function for performing label transfer with optional per-class processing
+# perform_label_transfer <- function(so, ref, actual_class = "actual_class", target_label = "target_label", skip_loop = FALSE, output_dir = "default/path", pdf_file = "label_transfer_plot.pdf") {
+  
+#   # Initialize or clear the target_label column
+#   so[[target_label]] <- NA
+  
+#   class_labels <- as.character(so[[actual_class]][, actual_class])
 
-# #========================================================================
-# # Step 1 : annotate broad class 
-# #========================================================================
+#   # Ensure the output directory exists
+#   if (!dir.exists(output_dir)) {
+#     dir.create(output_dir, recursive = TRUE)
+#   }
 
-# # Create a new column in metadata
-# querry_data$class_label <- NA
+#   # Create the full path for the PDF file
+#   pdf_file <- file.path(output_dir, pdf_file)
 
-# K_SCORE <- 30 # Seurat option
+#   if (!is.null(pdf_file)) {
+#     pdf(pdf_file, width = 8, height = 6)  # Open PDF if specified
+#   }
 
-# if(ncol(querry_data) < K_SCORE){
+#   if (!skip_loop) {
+#     unique_classes <- unique(class_labels)
 
-#   # Calculate a new k_score
-#   k_score_reference <- calculate_k_score(num_cells = ncol(remi_nn))
-#   k_score_query <- calculate_k_score(num_cells = ncol(querry_data))
-#   k_score <- min(k_score_reference, k_score_query)
+#     for (class in unique_classes) {
+#       print(class)
 
-#   anchors <- FindTransferAnchors(reference = remi_nn,
-#                                query = querry_data,
-#                                k.score = k_score,
-#                                normalization.method = "SCT",
-#                                features = all_genes)
-# } else{
-#   anchors <- FindTransferAnchors(reference = remi_nn,
-#                                query = querry_data,
-#                                normalization.method = "SCT",
-#                                features = all_genes)
-# }
+#       # Set idents
+#       Idents(so) <- actual_class
+#       Idents(ref) <- actual_class
 
-# # Number of anchors
-# num_anchors <- nrow(anchors@anchors)
+#       # Subset per class
+#       subset_querry_data <- subset(so, idents = class)
+#       subset_ref_data <- subset(ref, idents = class)
 
-# if(ncol(querry_data) < num_anchors){
+#       # Keep the same genes
+#       all_genes <- intersect(rownames(subset_ref_data), rownames(subset_querry_data))
 
-#   # Calculate the k weight
-#   k_weight <- calculate_k_weight(sample_size = num_anchors)
+#       # Directly assign actual_class to target_label
+#       so[[target_label]][so[[actual_class]] == class] <- class
 
-#   # Prediction with transfertData
-#   prediction <- TransferData(anchorset = anchors,
-#                            refdata = remi_nn$class_label,
-#                            k.weight = k_weight
-#                            )
-# } else{
-#   prediction <- TransferData(anchorset = anchors,
-#                            refdata = remi_nn$class_label)
-# }
+#       # Find anchors and Transfer Data with error handling
+#       tryCatch({
+#         anchors <- FindTransferAnchors(reference = subset_ref_data, query = subset_querry_data, dims = 1:30, normalization.method = "SCT", features = all_genes)
+#         prediction <- TransferData(anchorset = anchors, refdata = as.vector(subset_ref_data[[target_label]]))
 
-# querry_data <- AddMetaData(querry_data, metadata = prediction)
-# querry_data$class_label[Cells(querry_data)] <- querry_data$predicted.id
+#         # Append target_label to each prediction column name
+#         colnames(prediction) <- paste0(colnames(prediction), ".", target_label)
+#         subset_querry_data <- AddMetaData(subset_querry_data, metadata = prediction)
+#         new_metadata <- subset_querry_data@meta.data[, grepl(paste0(".", target_label), colnames(subset_querry_data@meta.data))]
+#         so@meta.data[rownames(new_metadata), colnames(new_metadata)] <- new_metadata
 
-# # Plot UMAP
-# DimPlot(querry_data, group.by = "class_label", label.size = 6) + theme(legend.position = "bottom",legend.text=element_text(size=4))
+#         # Assign the predicted labels
+#         so[[target_label]] <- subset_querry_data[[paste0("predicted.id.", target_label)]]
 
-# # Plot UMAP per labels
-# labels <- unique(querry_data@meta.data$`class_label`)
+#       }, error = function(e) {
+#         warning(paste("Error for class:", class, "- Keeping actual_class in target_label"))
+#       })
+#     }
+#   } else {
+#     # Skip looping, transfer all cells at once
+#     all_genes <- intersect(rownames(ref), rownames(so))
 
-# for (label in labels) {
-#   plot <- DimPlot(querry_data, group.by = "class_label", cells.highlight = WhichCells(querry_data, expression = class_label == label)) + ggtitle(label)
-#   print(plot)
-# }
+#     # Perform label transfer without looping
+#     anchors <- FindTransferAnchors(reference = ref, query = so, dims = 1:30, normalization.method = "SCT", features = all_genes)
+#     prediction <- TransferData(anchorset = anchors, refdata = as.vector(ref[[target_label]]))
 
-# ### UMAP plot with the prediction
-# FeaturePlot(querry_data, features = "prediction.score.max", reduction = "umap")
+#     # Append target_label to the prediction columns
+#     colnames(prediction) <- paste0(colnames(prediction), ".", target_label)
 
+#     # Add prediction metadata to the Seurat object
+#     so <- AddMetaData(so, metadata = prediction)
 
-# #=============================================
-# # Apply a threshold
-# #=============================================
+#     # Assign the predicted labels
+#     so[[target_label]] <- so[[paste0("predicted.id.", target_label)]]
+#   }
 
-# ### Remove cells that are not in the rest of the studies
-# querry_data <- subset(querry_data, subset = class_label != "Non-Neuronal")
-# querry_data <- subset(querry_data, subset = class_label != "Immature/Migrating")
-# querry_data <- subset(querry_data, subset = class_label != "Undetermined")
-# querry_data <- subset(querry_data, subset = class_label != "CR")
+#   # UMAP plot
+#   plot <- DimPlot(so, group.by = target_label, label.size = 6) + 
+#           theme(legend.position = "bottom", legend.text = element_text(size = 4))
 
-# DimPlot(querry_data, group.by = "class_label")
+#   print(plot)  # Print the main plot
 
-# ggplot(querry_data@meta.data, aes(x = prediction.score.max)) + 
-#   geom_histogram(binwidth = 0.05) + 
-#   theme_minimal() + 
-#   xlab("Prediction Score Max") + 
-#   ylab("Frequency") + 
-#   ggtitle("Distribution of Prediction Scores (Broad Class)")
+#   # UMAP plot per label
+#   labels <- unique(so@meta.data[[target_label]])
+#   for (label in labels) {
+#     cells_to_highlight <- rownames(so@meta.data[so@meta.data[[target_label]] == label, ])
+#     plots <- DimPlot(so, group.by = target_label, cells.highlight = cells_to_highlight) +
+#              ggtitle(label) +
+#              theme(legend.position = "bottom", legend.text = element_text(size = 4))
+#     print(plots)
+#   }
 
-# # Calculate mean and standard deviation
-# mean_score <- mean(querry_data@meta.data$prediction.score.max)
-# print(mean_score)
+#   if (!is.null(pdf_file)) {
+#     dev.off()  # Close the PDF device
+#   }
 
-# sd_score <- sd(querry_data@meta.data$prediction.score.max)
-# print(sd_score)
+#   return(so)
+# }
 
-# # Define threshold
-# threshold <- mean_score - sd_score
-# print(threshold)
+# # Function to calculate thresholds based on prediction scores using k-means clustering
+# # Function to calculate thresholds based on prediction scores using k-means clustering
+# calculate_thresholds <- function(metadata, celltype_label, prediction_column) {
+#   unique_labels <- unique(metadata[[celltype_label]])
+#   thresholds <- data.frame(celltype_label = unique_labels, cluster_threshold = NA, stringsAsFactors = FALSE)
+
+#   for (label in unique_labels) {
+#     subset_data <- metadata[metadata[[celltype_label]] == label & !is.na(metadata[[prediction_column]]), ]
+
+#     # Try-catch block to handle cases with insufficient data or k-means failures
+#     tryCatch({
+#       if (nrow(subset_data) > 1) {
+#         # Perform k-means clustering with 2 centers
+#         kmeans_result <- kmeans(subset_data[[prediction_column]], centers = 2)
+#         cluster_centers <- kmeans_result$centers
+#         high_confidence_cluster <- which.max(cluster_centers)
+
+#         # Calculate the cluster threshold
+#         cluster_threshold <- min(subset_data[[prediction_column]][kmeans_result$cluster == high_confidence_cluster], na.rm = TRUE)
+#         thresholds[thresholds$celltype_label == label, "cluster_threshold"] <- cluster_threshold
+#       } else {
+#         # If there are not enough rows to cluster
+#         stop("Not enough data for k-means clustering")
+#       }
+#     }, error = function(e) {
+#       # Handle error: retain original label (no filtering)
+#       message(paste("Error for label:", label, "-", e$message, "- Retaining original cells (no filtering)."))
+#       thresholds[thresholds$celltype_label == label, "cluster_threshold"] <- -Inf  # No filtering for this label
+#     })
+#   }
+
+#   return(thresholds)
+# }
 
-# # Filter cells
-# querry_data <- subset(querry_data, subset = prediction.score.max >= threshold)
-# gc()
+# # Function to filter cells by threshold
+# filter_cells_by_threshold <- function(metadata, prediction_column, thresholds, celltype_label) {
+#   # Merge thresholds back to metadata
+#   metadata <- merge(metadata, thresholds, by = celltype_label, by.y = "celltype_label", all.x = TRUE)
 
-# DimPlot(querry_data, group.by = "class_label", label.size = 6) + theme(legend.position = "bottom",legend.text=element_text(size=4))
+#   # Check if the merge was successful
+#   if (nrow(metadata) == 0) {
+#     stop("Merging thresholds into metadata resulted in zero rows. Check column names and values.")
+#   }
 
-# labels <- unique(querry_data@meta.data$`class_label`)
+#   # Filter cells based on prediction scores and thresholds
+#   filtered_metadata <- metadata[
+#     is.na(metadata[[prediction_column]]) |
+#     (metadata[[prediction_column]] >= metadata$cluster_threshold),
+#   ]
 
-# for (label in labels) {
-#   plot <- DimPlot(querry_data, group.by = "class_label", cells.highlight = WhichCells(querry_data, expression = class_label == label)) + ggtitle(label)
-#   print(plot)
+#   return(filtered_metadata)
 # }
 
-# #========================================================================
-# # Step 2 : annotate with subclass
-# #========================================================================
+# # Function to create UMAP plots, save them as a PDF, and return filtered results
+# create_filtered_umap <- function(so, celltype_label, output_dir = "default/path", file_name = "default_plot.pdf") {
+#   set.seed(42)  # For reproducibility
 
-# ref_data <- readRDS(file = file.path(OUTPUTDIR, STEP_REMI, "P30_WT_remi.rds"))
-# premier_modele <- ref_data@assays[["SCT"]]@SCTModel.list[[1]]
-# ref_data@assays[["SCT"]]@SCTModel.list <- list(premier_modele)
+#   # Ensure the output directory exists
+#   if (!dir.exists(output_dir)) {
+#     dir.create(output_dir, recursive = TRUE)
+#   }
 
-# ### Set the default assay to SCT for booth seurat object
-# DefaultAssay(ref_data) <- "SCT"
-# DefaultAssay(querry_data) <- "SCT"
+#   # Get metadata and create cell ID column
+#   metadata <- so@meta.data
+#   metadata$cell_id <- rownames(metadata)
 
-# querry_data$supertype_label <- NA
+#   # Dynamically construct the column name for prediction scores
+#   prediction_column <- paste0("prediction.score.max.", celltype_label)
 
-# for (class in unique(querry_data$class_label)) {
-#   print("The supertype class :")
-#   print(class)
-  
-#   # Set idents
-#   Idents(querry_data) <- "class_label"
-#   Idents(ref_data) <- "class_label"
-  
-#   # Per class
-#   subset_querry_data <- subset(querry_data, idents = class)
-#   subset_ref_data <- subset(ref_data, idents = class)
-  
-#   # Check subset sizes
-#   print(paste("Number of cells in query subset:", ncol(subset_querry_data)))
-#   print(paste("Number of cells in reference subset:", ncol(subset_ref_data)))
-
-#   # Keep the same genes
-#   ref_genes <- as.data.frame(rownames(subset_ref_data))
-#   querry_genes <- as.data.frame(rownames(subset_querry_data))
-#   all_genes <- intersect(ref_genes$`rownames(subset_ref_data)`, querry_genes$`rownames(subset_querry_data)`)
-  
-#   if(ncol(subset_querry_data) < K_SCORE){
-
-#   # Calculate a new k_score
-#   k_score_reference <- calculate_k_score(num_cells = ncol(subset_ref_data))
-#   k_score_query <- calculate_k_score(num_cells = ncol(subset_querry_data))
-#   k_score <- min(k_score_reference, k_score_query)
-
-#   anchors <- FindTransferAnchors(reference = subset_ref_data,
-#                                query = subset_querry_data,
-#                                k.score = k_score,
-#                                normalization.method = "SCT",
-#                                features = all_genes)
-# } else{
-#   anchors <- FindTransferAnchors(reference = subset_ref_data,
-#                                query = subset_querry_data,
-#                                normalization.method = "SCT",
-#                                features = all_genes)
-# }
+#   # Calculate thresholds
+#   thresholds <- calculate_thresholds(metadata, celltype_label, prediction_column)
+#   print(thresholds)  # Log the computed thresholds
+#   write.csv(thresholds, file = file.path(output_dir, paste0(celltype_label, "_threshold.csv")), row.names = FALSE)
 
-# # Number of anchors
-# num_anchors <- nrow(anchors@anchors)
+#   # Filter cells based on thresholds
+#   filtered_metadata <- filter_cells_by_threshold(metadata, prediction_column, thresholds, celltype_label)
 
-# if(ncol(subset_querry_data) < num_anchors){
+#   # Subset the original Seurat object using the filtered metadata
+#   so <- subset(so, cells = filtered_metadata$cell_id)
 
-#   # Calculate the k weight
-#   k_weight <- calculate_k_weight(sample_size = num_anchors)
+#   # Create a full path for the PDF file
+#   pdf_file <- file.path(output_dir, file_name)
 
-#   # Prediction with transfertData
-#   prediction <- TransferData(anchorset = anchors,
-#                            refdata = subset_ref_data$supertype_label,
-#                            k.weight = k_weight
-#                            )
-# } else{
-#   prediction <- TransferData(anchorset = anchors,
-#                            refdata = subset_ref_data$supertype_label)
-# }
-  
-#   # Add to the metadata
-#   subset_querry_data <- AddMetaData(subset_querry_data, metadata = prediction)
-#   querry_data$supertype_label[Cells(subset_querry_data)] <- subset_querry_data$predicted.id
-# }
+#   # Call the plotting function with the filtered Seurat object and cell type label
+#   plot_umap(so, celltype_label, pdf_file)  # Save the plots directly to the specified PDF
 
-# ### Plot UMAP
-# Idents(querry_data) <- querry_data$supertype_label
-# DimPlot(querry_data, group.by = "supertype_label", label.size = 6) + theme(legend.position = "bottom",legend.text=element_text(size=4))
+#   # Return the updated Seurat object and thresholds used
+#   return(list(filtered_object = so, thresholds = thresholds))
+# }
 
-# ### Plot UMAP per labels
-# labels <- unique(querry_data@meta.data$`supertype_label`)
+# # Function to plot UMAP with optional PDF saving
+# plot_umap <- function(so, celltype_label, pdf_file = NULL) {
+#   if (!is.null(pdf_file)) {
+#     pdf(pdf_file, width = 8, height = 6)  # Open a PDF device if specified
+#   }
 
-# for (label in labels) {
-#   plot <- DimPlot(querry_data, group.by = "supertype_label", cells.highlight = WhichCells(querry_data, expression = supertype_label == label)) + ggtitle(label)
-#   print(plot)
+#   # Main UMAP plot
+#   plot <- DimPlot(so, group.by = celltype_label, label.size = 6) +
+#           theme(legend.position = "bottom", legend.text = element_text(size = 4))
+  
+#   print(plot)  # Print the main plot
+  
+#   # Generate individual UMAP plots for each label
+#   labels <- unique(so@meta.data[[celltype_label]])
+#   for (label in labels) {
+#     cells_to_highlight <- rownames(so@meta.data[so@meta.data[[celltype_label]] == label, ])
+#     plots <- DimPlot(so, group.by = celltype_label, cells.highlight = cells_to_highlight) +
+#              ggtitle(label) +
+#              theme(legend.position = "bottom", legend.text = element_text(size = 4))
+
+#     print(plots)  # Print each individual plot
+#   }
+  
+#   if (!is.null(pdf_file)) {
+#     dev.off()  # Close the PDF device if it was opened
+#   }
 # }
 
-# ### UMAP plot with the prediction
-# FeaturePlot(querry_data, features = "prediction.score.max", reduction = "umap")
-
-# saveRDS(querry_data, file = "Rplots.rds")
+# #========================================================================
+# # Load the data an the reference
+# #========================================================================
 
-################################################################################################ CREATE THE ARLOTTA REFERENCE SEURAT OBJECT (USED IN PREVIOUS STEPS)
+# ## Our data
+# so <- readRDS(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_filtered_seurat_object.rds")))
 
-# # Read the gene expression matrix (genes as columns, cells as rows)
-# expression_matrix <- read.csv("/scratch/lchabot/inmed_deChevigny_scrnaseq/scRNAseq_analysis_workflow/01_Reference/reference_matrix.csv", row.names = 1)
+# ## The reference
+# ref <- readRDS(file = file.path(REF, "/modify_arlotta_seurat_object.rds"))
 
-# expression_matrix <- t(expression_matrix)
+# # Keep only some cells in the reference
+# pattern <- "P4"
+# metadata <- ref@meta.data
+# filtered_metadata <- metadata[grepl(pattern, metadata$biosample_id), ]
+# cells_to_keep <- rownames(filtered_metadata)
+# ref <- subset(ref, cells = cells_to_keep)
+# ref <- PercentageFeatureSet(ref, pattern = "^mt-", col.name = "percent.mt")
+# ref <- SCTransform(ref, vars.to.regress = "percent.mt", verbose = FALSE)
 
-# # Read the metadata (cells as rows)
-# metadata <- read.csv("/scratch/lchabot/inmed_deChevigny_scrnaseq/scRNAseq_analysis_workflow/01_Reference/metadata.csv", row.names = 1)
+# #========================================================================
+# # Doublet Finder
+# #========================================================================
 
-# cell_id_column <- "NEW_NAME"
-# rownames(metadata) <- metadata[[cell_id_column]]
-# metadata <- metadata[, !names(metadata) %in% cell_id_column]
-# metadata <- metadata[rownames(metadata) %in% colnames(expression_matrix), ]
-# expression_matrix <- expression_matrix[, colnames(expression_matrix) %in% rownames(metadata)]
+# pct <- so[["pca"]]@stdev / sum(so[["pca"]]@stdev) * 100
+# cumu <- cumsum(pct)
+# co1 <- which(cumu > 90 & pct < 5)[1]
+# co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
+# pcs <- min(co1, co2)
+
+# sweep.res <- paramSweep(so, PCs = 1:pcs, sct = TRUE)
+# sweep.stats <- summarizeSweep(sweep.res, GT = FALSE)
+# # bcmvn <- find.pK(sweep.stats)
+# homotypic.prop <- modelHomotypic(so$seurat_clusters)
+# nExp_poi <- round(0.075 * ncol(so))
+# nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop))
+# so <- doubletFinder(so, PCs = 1:pcs, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = FALSE, sct = TRUE)
+
+# metadata_columns <- colnames(so@meta.data)
+# pattern <- "DF.classifications_0.25_0.09_\\d+"  # Adjust the regex as needed
+# matched_column <- metadata_columns[grepl(pattern, metadata_columns)]
+# num_doublets <- sub("DF\\.classifications_0\\.25_0\\.09_(\\d+)", "\\1", matched_column)
+# print(num_doublets)
+# full_column_name <- paste0("DF.classifications_0.25_0.09_", num_doublets)
+# cells_to_keep <- rownames(so@meta.data[so@meta.data[[full_column_name]] == "Singlet", ])
+# so <- subset(so, cells = cells_to_keep)
 
-# # Create the Seurat object
-# seurat_object <- CreateSeuratObject(counts = expression_matrix)
+# #========================================================================
+# # Step 1 : annotate with broad class
+# #========================================================================
+# so <- perform_label_transfer(so, ref, actual_class = "class_label", target_label = "class_label", skip_loop = TRUE, output_dir = file.path(OUTPUTDIR, STEP2, SAMPLE_ID), pdf_file = "class_lablel_before_filtering.pdf")
+# filtered_results <- create_filtered_umap(so, celltype_label = "class_label", output_dir = file.path(OUTPUTDIR, STEP2, SAMPLE_ID), file_name = "class_lablel_after_filtering.pdf")
 
-# # Add metadata to the Seurat object
-# seurat_object <- AddMetaData(seurat_object, metadata = metadata)
+# #========================================================================
+# # Step 2 : annotate with subclass
+# #========================================================================
+# so <- perform_label_transfer(so, ref, actual_class = "class_label", target_label = "subclass_label", skip_loop = FALSE, output_dir = file.path(OUTPUTDIR, STEP2, SAMPLE_ID), pdf_file = "subclass_label_before_filtering.pdf")
+# filtered_results <- create_filtered_umap(so, celltype_label = "subclass_label", output_dir = file.path(OUTPUTDIR, STEP2, SAMPLE_ID), file_name = "subclass_label_after_filtering.pdf")
 
-# # Check the Seurat object
-# print("Metadata in Seurat object:")
-# print(head(seurat_object@meta.data))
+# #========================================================================
+# # Step 3 : annotate with celltype
+# #========================================================================
+# so <- perform_label_transfer(so, ref, actual_class = "subclass_label", target_label = "celltype_label", skip_loop = FALSE, output_dir = file.path(OUTPUTDIR, STEP2, SAMPLE_ID), pdf_file = "celltype_label_before_filtering.pdf")
+# filtered_results <- create_filtered_umap(so, celltype_label = "celltype_label", output_dir = file.path(OUTPUTDIR, STEP2, SAMPLE_ID), file_name = "celltype_label_after_filtering.pdf")
 
-# saveRDS(seurat_object, "/scratch/lchabot/inmed_deChevigny_scrnaseq/scRNAseq_analysis_workflow/01_Reference/modify_arlotta_seurat_object.rds")
\ No newline at end of file
+# saveRDS(so, file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "annotate_seurat_objet.rds")))
diff --git a/03_Script/remi.R b/03_Script/remi.R
index ada6bcd..399d894 100644
--- a/03_Script/remi.R
+++ b/03_Script/remi.R
@@ -17,10 +17,6 @@ DIRECTORY = getwd()
 OUTPUTDIR = file.path((DIRECTORY), "05_Output")
 REF = file.path((DIRECTORY), "01_Reference")
 
-STEP_REMI = "03_remi/"
-
-dir.create(file.path(OUTPUTDIR, STEP_REMI)) ### A voir !!!!!
-
 #====================================
 # Open the arlotta metadata :
 # https://singlecell.broadinstitute.org/single_cell/study/SCP1290/molecular-logic-of-cellular-diversification-in-the-mammalian-cerebral-cortex
@@ -90,7 +86,7 @@ modify_arlotta_metadata <- rbind(remi_arlotta_metadata, arlotta_metadata)
 ### Create a new column for cell name, to only keep A,T,C,C (to match the matrix) + write it as csv
 modify_arlotta_metadata <- as.data.table(modify_arlotta_metadata)
 modify_arlotta_metadata <- modify_arlotta_metadata[, new_sample_name := gsub("[^ATCG]", "", sample_name)]
-write.csv(modify_arlotta_metadata, file = file.path(OUTPUTDIR, STEP_REMI, "modify_arlotta_metadata.csv"), row.names = FALSE)
+write.csv(modify_arlotta_metadata, file = file.path(REF, "modify_arlotta_metadata.csv"), row.names = FALSE)
 
 #====================================
 # Match our arlotta matrices to 
@@ -172,10 +168,10 @@ print(dim(reference_matrix))
 #====================================
 # Create a Seurat Object + save it
 #====================================
-rownames(metadata) <- metadata$new_sampl e_name
+rownames(metadata) <- metadata$new_sample_name
 seurat_object <- CreateSeuratObject(counts = reference_matrix, meta.data = metadata)
 
-saveRDS(seurat_object, file = file.path(OUTPUTDIR, STEP_REMI, "modify_arlotta_seurat_object.rds"))
+saveRDS(seurat_object, file = file.path(REF, "modify_arlotta_seurat_object.rds"))
 
 
 ################################################################################################ CREATE FILES TO HAVE ALL CLASS FOR SIMS ANNOTATION
diff --git a/03_Script/seurat.Rmd b/03_Script/seurat.Rmd
index dbee8fd..697b8ec 100644
--- a/03_Script/seurat.Rmd
+++ b/03_Script/seurat.Rmd
@@ -45,7 +45,6 @@ library( htmltools)
 [//]: # "Load the various chunks in the various files"
 ```{r readChunks, echo=FALSE, warning=FALSE, message=FALSE}
 read_chunk( path=file.path( SCRIPTDIR, "01_prepare_data.R"));
-read_chunk( path=file.path( SCRIPTDIR, "02_variables_genes.R"));
 read_chunk( path=file.path( SCRIPTDIR, "03_cell_heterogeneity_PCA.R"));
 read_chunk( path=file.path( SCRIPTDIR, "04_cell_heterogeneity_tSNE_UMAP.R"));
 read_chunk( path=file.path( SCRIPTDIR, "05_cell_heterogeneity_clustering.R"));
@@ -83,64 +82,17 @@ We visualize QC metrics, and use these to filter cells.
 ```{r filterData_filterObject, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
 <<filterData_filterObject>>
 ```
-
 ## Normalizing the data
 
-After removing unwanted cells from the dataset we normalize the feature expression measurements for each cell (and correct the batch effect using the Seurat integration procedure if necessary) with `r snakemake@params[["norm_method"]]`
-We apply a linear transformation (‘scaling’) that is a standard pre-processing step prior to dimensional reduction techniques like PCA. If SCTransform is performed, it's not necessary to scale the data after it.
+After removing unwanted cells from the dataset we use the SCTransform function in Seurat that performs normalization and variance stabilization of molecular count data from scRNA-seq experiments.
 
-```{r normalizeData, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<normalizeData>>
+```{r SCTransform, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
+<<SCTransform>>
 ```
 
 <br /><br />
 
-## Identification of highly variable genes
-
-Subset of features that exhibit high cell-to-cell variation in the dataset
-
-```{r findVariableGenes_seuratMethod, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE, warning = FALSE}
-<<findVariableGenes_seuratMethod>>
-```
-
-Number of variable genes found: `r variablesGenesStats`
-
-```{r findVariableGenes_summaryPlot, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE, warning = FALSE}
-<<findVariableGenes_summaryPlot>>
-```
-
-<br />
-
-Top `r VARIABLE_FEATURES_SHOWTOP` most variable annotations:
-
-```{r findVariableGenes_summaryTable, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE, warning = FALSE}
-<<findVariableGenes_summaryTable>>
-```
-
-```{r scaleData, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<scaleData>>
-```
-
-<br /><br />
-
-```{r, echo = FALSE}
-if (as.logical(MULTIPLEX) == TRUE) {
-  include_chunk <- TRUE
-  include_title <- TRUE
-} else {
-  include_chunk <- FALSE
-  include_title <- FALSE
-}
-```
-
-`r if(include_title) "## Demultiplexing"`
-```{r HTODemux, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-if (include_chunk) {<<HTODemux>>}
-```
-
-<br /><br />
-
-## PCA results
+# PCA results
 
 Visualizing both cells and features that define the PCA
 
@@ -160,40 +112,12 @@ A ranking of principle components based on the percentage of variance explained
 <<elbowplot>>
 ```
 
-Number of dimensions to use for PCA : `r snakemake@params[["pca_npc"]]``
-
-PCA projections for dimensions `r paste( 1:PCA_PLOTS_NBDIMS, collapse="/")` (colored by batch)  
+Number of dimensions to use for PCA : `r pcs`
 
 ```{r heterogeneity_pca, echo = FALSE, out.width='33%', results = "asis", message = FALSE, warning = FALSE}
 <<heterogeneity_pca>>
 ```
 
-<br />
-
-UMIs counts on PCA dimensions `r paste( 1:PCA_PLOTS_NBDIMS, collapse="/")`  
-```{r heterogeneity_pca_umisCounts, echo = FALSE, out.width='33%', results = "asis", message = FALSE, warning = FALSE}
-<<heterogeneity_pca_umisCounts>>
-```
-
-<br />
-
-Genes counts on PCA dimensions `r paste( 1:PCA_PLOTS_NBDIMS, collapse="/")`     
-```{r heterogeneity_pca_genesCounts, echo = FALSE, out.width='33%', results = "asis", message = FALSE, warning = FALSE}
-<<heterogeneity_pca_genesCounts>>
-```
-
-<br />
-
-Correlation (Spearman) of PCA dimensions `r paste( 1:PCA_PLOTS_NBDIMS, collapse="/")` with UMIs and genes counts  
-```{r heterogeneity_pca_correlations, echo = FALSE, fig.dim = c(9,6), results = "asis", message = FALSE, warning = FALSE}
-<<heterogeneity_pca_correlations>>
-```
-<br />
-
-PCA loadings of top `r PCA_PLOTS_NBFEATURES` features for dimensions `r paste( 1:PCA_PLOTS_NBDIMS, collapse="/")`  
-```{r heterogeneity_pca_loadings, echo = FALSE, fig.dim = c(4, 4), out.width='33%', results = "asis", message = FALSE, warning = FALSE}
-<<heterogeneity_pca_loadings>>
-```
 <br /><br />
 
 ## Dimensionality reduction
@@ -220,7 +144,6 @@ rm("useReduction");
 
 <br /><br />
 
-
 ## Clusters identification
 
 We use a graph-based clustering approach : 1. Construct a KNN graph based on the euclidian distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). 2. Modularity optimization techniques such as the Louvain algorithm or SLM, to iteratively group cells together, with the goal of optimizing the standard modularity function.
@@ -261,162 +184,3 @@ rm("useReduction");
 ```
 
 
-##  GABAergic markers {.tabset .tabset-fade}
-Here you can vizualize GABAergic markers expression.
-
-### GABAN
-```{r GABAN, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<GABAN>>
-```
-
-### Meis2
-```{r Meis2, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Meis2>>
-```
-
-### Lamp A7C|CNC
-```{r Lamp5_A7C_CNC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Lamp5_A7C_CNC>>
-```
-
-### Lamp A7C|NGC
-```{r Lamp5_A7C_NGC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Lamp5_A7C_NGC>>
-```
-
-### Sngc CCK|BC
-```{r Sncg_CCK_BC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Sncg_CCK_BC>>
-```
-
-### Vip BP|BTC
-```{r Vip_BP_BTC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Vip_BP_BTC>>
-```
-
-### Sst T-MC
-```{r Sst_T_MC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Sst_T_MC>>
-```
-
-### Sst fan-MC
-```{r Sst_fan_MC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Sst_fan_MC>>
-```
-
-### Sst NMC
-```{r Sst_NMC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Sst_NMC>>
-```
-
-### Sst IVC
-```{r Sst_IVC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Sst_IVC>>
-```
-
-### Sst FS-like
-```{r Sst_FS_like, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Sst_FS_like>>
-```
-
-### Sst Chold|LPC
-```{r Sst_Chold_LPC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Sst_Chold_LPC>>
-```
-
-### Pvalb CHC
-```{r Pvalb_CHC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Pvalb_CHC>>
-```
-
-### Pvalb FS|BC
-```{r Pvalb_FS_BC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Pvalb_FS_BC>>
-```
-
-## {-}
-
-##  Glutamatergic markers {.tabset .tabset-fade}
-Here you can vizualize Glutamatergic markers expression.
-
-### Cajal retzius cells
-```{r Cajal_Retzius_cells_markers, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Cajal_Retzius_cells_markers>>
-```
-
-### L2/3 IT
-```{r L2_3_IT, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<L2_3_IT>>
-```
-
-### L4 IT SSC
-```{r L4_IT_SSC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<L4_IT_SSC>>
-```
-
-### L4 IT PC
-```{r L4_IT_PC, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<L4_IT_PC>>
-```
-
-### L5 IT
-```{r L5_IT, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<L5_IT>>
-```
-
-### L5 PT
-```{r L5_PT, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<L5_PT>>
-```
-
-### L5/6 NP
-```{r L5_6_NP, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<L5_6_NP>>
-```
-
-### L6 IT
-```{r L6_IT, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<L6_IT>>
-```
-
-### L6 CT
-```{r L6_CT, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<L6_CT>>
-```
-
-### L6b
-```{r L6b, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<L6b>>
-```
-
-## {-}
-
-##  Non-Neuronal markers {.tabset .tabset-fade}
-Here you can vizualize GABAergic markers expression.
-
-### Astrocytes
-```{r astro_markers, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<astro_markers>>
-```
-
-### Oligodendrocytes
-```{r oligo_markers, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<oligo_markers>>
-```
-
-### Microglia
-```{r microglia_markers, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<microglia_markers>>
-```
-
-### Endothelial
-```{r Endothelial_cells_markers, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Endothelial_cells_markers>>
-```
-
-### Pericyte
-```{r Pericyte_markers, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE}
-<<Pericyte_markers>>
-```
-
-## {-}
diff --git a/04_Workflow/seurat.smk b/04_Workflow/seurat.smk
index db8919e..5c0e0ff 100644
--- a/04_Workflow/seurat.smk
+++ b/04_Workflow/seurat.smk
@@ -18,8 +18,8 @@ rule seurat:
         sc_data = expand(OUTPUTDIR + "01_cellranger/{sample_id}/count/sample_filtered_feature_bc_matrix/",sample_id = SAMPLE_ID),
         cell_ranger_count_path = config["seurat"]["cell_ranger_count_path"],
         sample_id = expand("{sample_id.id}", sample_id = sample_id.itertuples()),
-        multiplex = expand("{multiplex.multiplex}", multiplex = multiplex.itertuples()),
-        hto = expand("{hto.hto}", hto = hto.itertuples()),
+        # multiplex = expand("{multiplex.multiplex}", multiplex = multiplex.itertuples()),
+        # hto = expand("{hto.hto}", hto = hto.itertuples()),
         plot_raster_nbcells_threshold = config["seurat"]["plot_raster_nbcells_threshold"],
         qc_exploration_mode = config["seurat"]["qc_exploration_mode"],
         min_cells = config["seurat"]["min_cells"],
@@ -33,18 +33,18 @@ rule seurat:
         filter_percent_rb = config["seurat"]["filter_percent_rb"].split(','),
         pattern_mt = config["seurat"]["pattern_mt"],
         pattern_rb = config["seurat"]["pattern_rb"],
-        norm_method = config["seurat"]["norm_method"],
-        norm_scale_factor = config["seurat"]["norm_scale_factor"],
-        feature_select_method = config["seurat"]["feature_select_method"],
-        variable_features = config["seurat"]["variable_features"],
-        variable_features_showtop = config["seurat"]["variable_features_showtop"],
+        # norm_method = config["seurat"]["norm_method"],
+        # norm_scale_factor = config["seurat"]["norm_scale_factor"],
+        # feature_select_method = config["seurat"]["feature_select_method"],
+        # variable_features = config["seurat"]["variable_features"],
+        # variable_features_showtop = config["seurat"]["variable_features_showtop"],
         dims = config["seurat"]["dims"],
-        pca_npc = config["seurat"]["pca_npc"],
-        pca_plots_nbdims = config["seurat"]["pca_plots_nbdims"],
-        pca_plot_nbfeatures = config["seurat"]["pca_plot_nbfeatures"],
-        dimreduc_use_pca_nbdims = config["seurat"]["dimreduc_use_pca_nbdims"],
-        findclusters_use_pca_nbdims = config["seurat"]["findclusters_use_pca_nbdims"],
-        findneighbors_k = config["seurat"]["findneighbors_k"],
+        # pca_npc = config["seurat"]["pca_npc"],
+        # pca_plots_nbdims = config["seurat"]["pca_plots_nbdims"],
+        # pca_plot_nbfeatures = config["seurat"]["pca_plot_nbfeatures"],
+        # dimreduc_use_pca_nbdims = config["seurat"]["dimreduc_use_pca_nbdims"],
+        # findclusters_use_pca_nbdims = config["seurat"]["findclusters_use_pca_nbdims"],
+        # findneighbors_k = config["seurat"]["findneighbors_k"],
         findclusters_resolution = config["seurat"]["findclusters_resolution"],
         findclusters_algorithm = config["seurat"]["findclusters_algorithm"],
 
@@ -66,16 +66,24 @@ rule seurat_labelTransfert:
         seurat_labelTransfert_output = expand(OUTPUTDIR + "02_seurat/seurat_labelTransfert_output.txt"),
 
     conda:
-        CONTAINER + "seurat.yaml"
+        CONTAINER + "labelTransfer.yaml"
 
     params:
+        scriptdir = SCRIPTDIR,
         sample_id = expand("{sample_id.id}", sample_id = sample_id.itertuples()),
 
     message:
         "Run Seurat for the transfert data"
 
-    script:
-        SCRIPTDIR + "labelTransfert_seurat.R"
+    shell:
+        """
+        Rscript -e 'if (!requireNamespace("DoubletFinder", quietly=TRUE)) {{
+                        if (!requireNamespace("remotes", quietly=TRUE)) install.packages("remotes");
+                        remotes::install_github("chris-mcginnis-ucsf/DoubletFinder")
+                    }}'
+        Rscript {params.scriptdir}/labelTransfert_seurat.R {params.sample_id}
+        touch {output.seurat_labelTransfert_output}
+        """
 
 #----------------------------------------
 # DEG analysis with Seurat
diff --git a/06_Schemas/config.schema.yaml b/06_Schemas/config.schema.yaml
index 4087adc..112a404 100644
--- a/06_Schemas/config.schema.yaml
+++ b/06_Schemas/config.schema.yaml
@@ -4,173 +4,173 @@ description: snakemake configuration file
 
 type: object
 
-properties:
-  rules:
-    type: object
-    properties:
-      reference_enhancer_rule:
-        type: string
-      ref_cellranger_rule:
-        type: string
-      multi_rule:
-        type: string
-    required:
-      - reference_enhancer_rule
-      - ref_cellranger_rule
+# properties:
+#   rules:
+#     type: object
+#     properties:
+#       reference_enhancer_rule:
+#         type: string
+#       ref_cellranger_rule:
+#         type: string
+#       multi_rule:
+#         type: string
+#     required:
+#       - reference_enhancer_rule
+#       - ref_cellranger_rule
       
-  fastq:
-    type: object
-    properties:
-      sname:
-        type: string
-      biop:
-        type: string
-      sra:
-        type: string
-    required:
-      - sname
-      - biop
+#   fastq:
+#     type: object
+#     properties:
+#       sname:
+#         type: string
+#       biop:
+#         type: string
+#       sra:
+#         type: string
+#     required:
+#       - sname
+#       - biop
   
-  reference:
-    type: object
-    properties:
-      link_ref_fasta:
-        type: string
-      ref_gtf:
-        type: string
-      link_ref_gtf:
-        type: string
-      ref_cellranger:
-        type: string
-      ref_version:
-        type: string
-    required:
-      - link_ref_fasta
-      - link_ref_gtf
-      - ref_cellranger
-      - ref_version
+#   reference:
+#     type: object
+#     properties:
+#       link_ref_fasta:
+#         type: string
+#       ref_gtf:
+#         type: string
+#       link_ref_gtf:
+#         type: string
+#       ref_cellranger:
+#         type: string
+#       ref_version:
+#         type: string
+#     required:
+#       - link_ref_fasta
+#       - link_ref_gtf
+#       - ref_cellranger
+#       - ref_version
   
-  cellranger:
-    type: object
-    properties:
-      fastqs:
-        type: string
-      cells:
-        type: integer
-      localcores:
-        type: integer
-      localmem:
-        type: integer
-    required:
-      - fastqs
-      - expect_cells
-      - localcores
-      - localmem
+#   cellranger:
+#     type: object
+#     properties:
+#       fastqs:
+#         type: string
+#       cells:
+#         type: integer
+#       localcores:
+#         type: integer
+#       localmem:
+#         type: integer
+#     required:
+#       - fastqs
+#       - expect_cells
+#       - localcores
+#       - localmem
 
-  demuxlet:
-    type: object
-    properties:
-      spooled:
-        type: string
-    required:
-      - spooled
+#   demuxlet:
+#     type: object
+#     properties:
+#       spooled:
+#         type: string
+#     required:
+#       - spooled
 
-  seurat:
-    type: object
-    properties:
-      plot_raster_nbcells_threshold:
-        type: integer
-      qc_exploration_mode:
-        type: string
-      min_cells:
-        type: integer
-      min_features:
-        type: integer
-      filter_umi_min:
-        type: string
-      filter_umi_max:
-        type: string
-      filter_feature_min:
-        type: string
-      filter_feature_max:
-        type: string
-      filter_percent_mt:
-        type: string
-      filter_percent_mt_min:
-        type: string
-      filter_percent_rb:
-        type: string
-      pattern_mt:
-        type: string
-      pattern_rb:
-        type: string
-      norm_method:
-        type: string
-      norm_scale_factor:
-        type: integer
-      feature_select_method:
-        type: string
-      variable_features:
-        type: integer
-      variable_features_showtop:
-        type: integer
-      pca_npc:
-        type: integer
-      pca_plots_nbdims:
-        type: integer
-      pca_plot_nbfeatures:
-        type: integer
-      dimreduc_use_pca_nbdims:
-        type: integer
-      findclusters_use_pca_nbdims:
-        type: integer
-      findneighbors_k:
-        type: integer
-      findclusters_resolution:
-        type: integer
-      findclusters_algorithm:
-        type: integer
-    required:
-      - plot_raster_nbcells_threshold
-      - qc_exploration_mode
-      - min_cells
-      - min_features
-      - filter_umi_min
-      - filter_umi_max
-      - filter_feature_min
-      - filter_feature_max
-      - filter_percent_mt
-      - filter_percent_mt_min
-      - filter_percent_rb
-      - pattern_mt
-      - pattern_rb
-      - norm_method
-      - norm_scale_factor
-      - feature_select_method
-      - variable_features
-      - variable_features_showtop
-      - pca_npc
-      - pca_plots_nbdims
-      - pca_plot_nbfeatures
-      - dimreduc_use_pca_nbdims
-      - findclusters_use_pca_nbdims
-      - findneighbors_k
-      - findclusters_resolution
-      - findclusters_algorithm
+#   seurat:
+#     type: object
+#     properties:
+#       plot_raster_nbcells_threshold:
+#         type: integer
+#       qc_exploration_mode:
+#         type: string
+#       min_cells:
+#         type: integer
+#       min_features:
+#         type: integer
+#       filter_umi_min:
+#         type: string
+#       filter_umi_max:
+#         type: string
+#       filter_feature_min:
+#         type: string
+#       filter_feature_max:
+#         type: string
+#       filter_percent_mt:
+#         type: string
+#       filter_percent_mt_min:
+#         type: string
+#       filter_percent_rb:
+#         type: string
+#       pattern_mt:
+#         type: string
+#       pattern_rb:
+#         type: string
+#       norm_method:
+#         type: string
+#       norm_scale_factor:
+#         type: integer
+#       feature_select_method:
+#         type: string
+#       variable_features:
+#         type: integer
+#       variable_features_showtop:
+#         type: integer
+#       pca_npc:
+#         type: integer
+#       pca_plots_nbdims:
+#         type: integer
+#       pca_plot_nbfeatures:
+#         type: integer
+#       dimreduc_use_pca_nbdims:
+#         type: integer
+#       findclusters_use_pca_nbdims:
+#         type: integer
+#       findneighbors_k:
+#         type: integer
+#       findclusters_resolution:
+#         type: integer
+#       findclusters_algorithm:
+#         type: integer
+#     required:
+#       - plot_raster_nbcells_threshold
+#       - qc_exploration_mode
+#       - min_cells
+#       - min_features
+#       - filter_umi_min
+#       - filter_umi_max
+#       - filter_feature_min
+#       - filter_feature_max
+#       - filter_percent_mt
+#       - filter_percent_mt_min
+#       - filter_percent_rb
+#       - pattern_mt
+#       - pattern_rb
+#       - norm_method
+#       - norm_scale_factor
+#       - feature_select_method
+#       - variable_features
+#       - variable_features_showtop
+#       - pca_npc
+#       - pca_plots_nbdims
+#       - pca_plot_nbfeatures
+#       - dimreduc_use_pca_nbdims
+#       - findclusters_use_pca_nbdims
+#       - findneighbors_k
+#       - findclusters_resolution
+#       - findclusters_algorithm
 
-  diffexp:
-    type: object
-    properties:
-      features:
-        type: string
-      cell_marker:
-        type: string
-      test:
-        type: string
-      threshold:
-        type: integer
-    required:
-      - features
-      - cell_marker
-      - test
-      - threshold
+#   diffexp:
+#     type: object
+#     properties:
+#       features:
+#         type: string
+#       cell_marker:
+#         type: string
+#       test:
+#         type: string
+#       threshold:
+#         type: integer
+#     required:
+#       - features
+#       - cell_marker
+#       - test
+#       - threshold
diff --git a/Snakefile b/Snakefile
index c9c2c6d..6619b0a 100644
--- a/Snakefile
+++ b/Snakefile
@@ -78,13 +78,13 @@ rule all:
 		# demuxlet = OUTPUTDIR + "01_cellranger/Mix_MM_lines/outs/demuxlet_Mix_MM_lines.best",
 		# tabdemuxlet = OUTPUTDIR + "01_cellranger/Mix_MM_lines/outs/demuxlet_Mix_MM_lines.tsv",
 		### Seurat
-		# seurat_report = expand(OUTPUTDIR + "02_seurat/{sample_id}/{sample_id}_seurat_report.html", sample_id = SAMPLE_ID),
+		# seurat_report = expand(OUTPUTDIR + "02_seurat/{sample_id}/{sample_id}_seurat_report.html", sample_id = SAMPLE_ID),
 		### Remi data
 		# remi_output = expand(OUTPUTDIR + "remi_output.txt"),
 		### Label Transfert with seurat
-		# seurat_labelTransfert_output = expand(OUTPUTDIR + "02_seurat/seurat_labelTransfert_output.txt"),
+		seurat_labelTransfert_output = expand(OUTPUTDIR + "02_seurat/seurat_labelTransfert_output.txt"),
 		### DEG analysis with seurat
-		seurat_DEG_output = expand(OUTPUTDIR + "04_diffexp/seurat_DEG_output.txt"),
+		# seurat_DEG_output = expand(OUTPUTDIR + "04_diffexp/seurat_DEG_output.txt"),
 		### Prepare data for SIMS
 		# data_for_sims_output = expand(OUTPUTDIR + "03_sims/data_for_sims_output.txt"),
 		# anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"),
diff --git a/config.yaml b/config.yaml
index 914eac4..2213d81 100644
--- a/config.yaml
+++ b/config.yaml
@@ -9,21 +9,21 @@ sample: sample.tsv
 #-------------------------------------
 
 # If you run the workflow with slurm (sbatch run_slurm.sh)
-run_slurm: true
+run_slurm: false
 # If you data are demultiplexed = true
-run_demultiplex: true
+run_demultiplex: false
 # If your data are not demultiplexed = true
-run_multiplex: false
+run_multiplex: true
 
 rules:
     # Optimizing and assembling genome annotations for 3’ single-cell RNA-sequencing analysis
     reference_enhancer_rule: "FALSE"
     # Build the reference with cellranger
-    ref_cellranger_rule: "TRUE"
+    ref_cellranger_rule: "FALSE"
     # Run SIMS to anotate cells
-    sims_rule: "TRUE"
+    sims_rule: "FALSE"
     # Run Knnor to help with imbalanced datasets
-    knnor_rule: "TRUE"
+    knnor_rule: "FALSE"
 
 run:
     # FASTQ file naming convention : [Sample name]_S1_L00[Lane Number]_[Read Type]_001.fastq.gz
@@ -110,14 +110,14 @@ seurat:
     min_features: 200
     # Cells with number of UMIs outside the range will be excluded
     # Can be a list of UMIs but MUST be on the same order than sample.tsv id.
-    filter_umi_min: "500"
-    filter_umi_max: ""
+    filter_umi_min: "800"
+    filter_umi_max: "6000"
     # Filter cells that have unique feature counts over/less n
     # Can be a list of feature counts but MUST be on the same order than sample.tsv id.
-    filter_feature_min: "500"
-    filter_feature_max: ""
+    filter_feature_min: "800"
+    filter_feature_max: "2500"
     # Filter cells that have > n% mitochondrial counts
-    filter_percent_mt: "10"
+    filter_percent_mt: "6"
     # Filter cells that have < n% mitochondrial counts
     filter_percent_mt_min: "0"
     # Filter cells that have < n% ribosomal counts
@@ -129,36 +129,36 @@ seurat:
 
     ##### Normalization
     # Normalization parameters (see Seurat::NormalizeData())
-    norm_method: "LogNormalize"
-    norm_scale_factor: 10000
+    # norm_method: "LogNormalize"
+    # norm_scale_factor: 10000
 
     ##### Find the most variable genes
-    # How to choose top variable features => vst, mean.var.plot or dispersion
-    feature_select_method: "vst"
-    # Number of features to select as top variable features; only used when select_method is set to 'dispersion' or 'vst'
-    variable_features: 2000
-    # For table in report
-    variable_features_showtop : 200
+    # # How to choose top variable features => vst, mean.var.plot or dispersion
+    # feature_select_method: "vst"
+    # # Number of features to select as top variable features; only used when select_method is set to 'dispersion' or 'vst'
+    # variable_features: 2000
+    # # For table in report
+    # variable_features_showtop : 200
 
     ##### PCA parameters
     # Number of dimention to show for the vizualisation
-    dims: 50
-    # Default number of dimensions to use for PCA (see Seurat::RunPCA())
-    pca_npc: 
-    # Number of dimensions to show in PCA-related plots
-    pca_plots_nbdims : 3
-    #  Number of'top' features to show when plotting PCA loadings
-    pca_plot_nbfeatures : 3
+    dims: 10
+    # # Default number of dimensions to use for PCA (see Seurat::RunPCA())
+    # pca_npc: 2
+    # # Number of dimensions to show in PCA-related plots
+    # pca_plots_nbdims : 3
+    # #  Number of'top' features to show when plotting PCA loadings
+    # pca_plot_nbfeatures : 3
 
     ##### Dimensionality reduction parameters (TSNE/UMAP)
-    # Number of dimensions to use from PCA results
-    dimreduc_use_pca_nbdims :
+    # # Number of dimensions to use from PCA results
+    # dimreduc_use_pca_nbdims : 2
 
     ##### Cluster identification parameters
     # Number of dimensions to use from PCA results
-    findclusters_use_pca_nbdims :
-    # Nearest-neighbor graph construction
-    findneighbors_k : 30
+    # findclusters_use_pca_nbdims : 3
+    # Nearest-neighb    # findclusters_use_pca_nbdims : 3or graph construction
+    # findneighbors_k : 30
     # Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities.
     findclusters_resolution : 1
     # Algorithm for modularity optimization