diff --git a/.gitignore b/.gitignore index feb3cad..adbbf72 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,6 @@ 02_Container/cellranger.sif 02_Container/mambaforge:23.1.0-1.sif 05_Output -05_Output_june .vscode/ .bash_history/ .cache/ @@ -18,4 +17,6 @@ test S004647_to_S004650_unoptimized_ref/ S004647_to_S004650/ __pycache__ - +.netrc +*.errs +*.out diff --git a/02_Container/DE_analysis.yaml b/02_Container/DE_analysis.yaml new file mode 100644 index 0000000..135c46f --- /dev/null +++ b/02_Container/DE_analysis.yaml @@ -0,0 +1,11 @@ +channels : + - r + - bioconda + - conda-forge + - defaults +dependencies : + - r-ggplot2 =3.3.5 + - r-data.table =1.14.2 + - r-seurat =4.0.3 + - r-dplyr =1.0.9 + - r-ggrepel diff --git a/02_Container/knnor.yaml b/02_Container/knnor.yaml new file mode 100644 index 0000000..556eb30 --- /dev/null +++ b/02_Container/knnor.yaml @@ -0,0 +1,12 @@ +channels : + - main + - defaults + - bioconda + - conda-forge +dependencies : + - python = 3.9 + - pandas + - numpy + - pip + - pip : + - knnor diff --git a/02_Container/labelTransfer.yaml b/02_Container/labelTransfer.yaml new file mode 100644 index 0000000..2c97461 --- /dev/null +++ b/02_Container/labelTransfer.yaml @@ -0,0 +1,14 @@ +channels : + - r + - bioconda + - conda-forge + - defaults +dependencies : + - r-ggplot2 =3.5.1 + - r-seurat =5.1.0 + - r-dplyr =1.1.4 + - r-sctransform=0.4.1 # Include sctransform + - r-devtools # Required for installing packages from GitHub + - bioconductor-glmgampoi + + \ No newline at end of file diff --git a/02_Container/remi.yaml b/02_Container/remi.yaml new file mode 100644 index 0000000..d9d5e69 --- /dev/null +++ b/02_Container/remi.yaml @@ -0,0 +1,10 @@ +channels : + - r + - bioconda + - conda-forge + - defaults +dependencies : + - r-seurat = 4.0.3 + - r-dplyr = 1.0.9 + - r-ggplot2 =3.3.5 + - r-cowplot = 1.1.1 diff --git a/02_Container/sims.yaml b/02_Container/sims.yaml index afd0061..b431007 100644 --- a/02_Container/sims.yaml +++ b/02_Container/sims.yaml @@ -8,4 +8,5 @@ dependencies : - pip - pip: - --upgrade pip setuptools wheel - - --use-pep517 git+https://github.com/braingeneers/SIMS.git \ No newline at end of file + # Sims version Nov 17, 2023 + - --use-pep517 git+https://github.com/braingeneers/SIMS.git@c3cc547e9223e979fdc70f9af2ae932b729da88e \ No newline at end of file diff --git a/03_Script/01_prepare_data.R b/03_Script/01_prepare_data.R index facba7d..8a72bd5 100644 --- a/03_Script/01_prepare_data.R +++ b/03_Script/01_prepare_data.R @@ -23,23 +23,41 @@ STEP2 = "02_seurat/" source(file.path(dirname(DIRECTORY), "03_Script/00_general_deps.R")) in_data_dir = file.path( OUTPUTDIR, STEP1) +in_data_dir_demux = file.path( OUTPUTDIR, STEP2) samples <- dir(in_data_dir) - - MIN_CELLS <- as.numeric(MIN_CELLS) MIN_FEATURES <- as.numeric(MIN_FEATURES) +# HTO <- unlist(strsplit(HTO, ",")) + +options(Seurat.object.assay.version = 'v3') #........................................ # Read data #........................................ -seurat_data <- Read10X(paste0(in_data_dir,SAMPLE_ID,CELL_RANGER_COUNT_PATH)) -# Need a condition if data are depultiplexed with cell ranger multi if( RUN_DEMULTIPLEX == TRUE){ + seurat_data <- Read10X(paste0(in_data_dir,SAMPLE_ID,CELL_RANGER_COUNT_PATH)) seurat_obj <- CreateSeuratObject(counts = seurat_data, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) -} else { - seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) +} else if (RUN_DEMULTIPLEX == FALSE) { + seurat_obj <- readRDS(paste0(in_data_dir_demux,SAMPLE_ID, "/demultiplex_", SAMPLE_ID, ".rds")) } + + + +# # Need a condition if data are depultiplexed with cell ranger multi +# if( RUN_DEMULTIPLEX == TRUE){ +# seurat_obj <- CreateSeuratObject(counts = seurat_data, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) +# } else if (as.logical(MULTIPLEX) == TRUE) { +# seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) +# hto <- seurat_data$`Antibody Capture` +# hto <- hto[c(HTO), ] +# seurat_obj[["HTO"]] <- CreateAssayObject(counts = hto) +# seurat_obj <- subset(x = seurat_obj, subset = nCount_HTO > 0) +# seurat_obj <- NormalizeData(seurat_obj, assay = "HTO", normalization.method = "CLR") +# } else{ +# seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) +# } + seurat_obj$SampleID <- SAMPLE_ID # .......................................................................................................... @@ -371,18 +389,20 @@ ggplot( seurat_obj[[]][order( seurat_obj[["outlier"]]),], # Plot FALSE first and labs( x = "# Genes", y = "# UMIs") + theme( legend.position = "none") -# Mitochondrial vs ribosomal distributions -if(exists( "mito.drop") && exists( "ribo.drop")) +if(exists( "mito.drop")) { ggplot( seurat_obj[[]][order( seurat_obj[["outlier"]]),], # Plot FALSE first and TRUE after - aes( x = percent.rb, + aes( x = nCount_RNA, y = percent.mt, color = outlier)) + geom_point( size = 0.5) + - geom_vline( xintercept = FILTER_PERCENT_RB, linetype = 2, color = "blue", alpha = 0.5) + + geom_vline( xintercept = c( FILTER_FEATURE_MIN, FILTER_FEATURE_MAX), + linetype = 2, + color = c( "blue", "red")[!sapply( list( FILTER_FEATURE_MIN, FILTER_FEATURE_MAX), is.null)], + alpha = 0.5) + geom_hline( yintercept = FILTER_PERCENT_MT, linetype = 2, color = "red", alpha = 0.5) + scale_color_manual( values = c( "TRUE" = "#FF0000AA", "FALSE" = "#44444444")) + - labs( x = "% Ribosomal genes", y = "% Mitochondrial genes") + + labs( x = "# UMIs", y = "% Mitochondrial genes") + theme( legend.position = "none") } @@ -405,15 +425,8 @@ if( QC_EXPLORATION_MODE == FALSE){ # sep="\t"); # .......................................................................................................... -## @knitr normalizeData +## @knitr SCTransform # .......................................................................................................... -# Normalize the count data present in a given assay -seurat_obj = NormalizeData( object = seurat_obj, - normalization.method = NORM_METHOD, - scale.factor = NORM_SCALE_FACTOR, - verbose = .VERBOSE) - -# # Scales and centers features in the dataset -# seurat_obj = ScaleData( object = seurat_obj, -# verbose = .VERBOSE) +# # Normalize the count data present in a given assay +seurat_obj = SCTransform(seurat_obj, vars.to.regress = "percent.mt", verbose = FALSE) diff --git a/03_Script/02_variables_genes.R b/03_Script/02_variables_genes.R index 704cfae..44f54bb 100644 --- a/03_Script/02_variables_genes.R +++ b/03_Script/02_variables_genes.R @@ -35,7 +35,11 @@ suppressMessages( suppressWarnings( LabelPoints( plot = VariableFeaturePlot( seu ## @knitr findVariableGenes_summaryTable # .......................................................................................................... -# Extract variable genes info as data.frame +# Extract variable genes info as data.frame feature_select_method: "vst" + # Number of features to select as top variable features; only used when select_method is set to 'dispersion' or 'vst' + variable_features: 2000 + # For table in report + variable_features_showtop : 200 variableAnnotationsDT = head( HVFInfo( object = seurat_obj, assay = "RNA", selection.method = FEATURE_SELECT_METHOD)[VariableFeatures( seurat_obj),], VARIABLE_FEATURES_SHOWTOP); variableAnnotationsDT = cbind("Gene" = rownames(variableAnnotationsDT), variableAnnotationsDT); diff --git a/03_Script/03_cell_heterogeneity_PCA.R b/03_Script/03_cell_heterogeneity_PCA.R index 223ae86..e3286f3 100644 --- a/03_Script/03_cell_heterogeneity_PCA.R +++ b/03_Script/03_cell_heterogeneity_PCA.R @@ -3,122 +3,66 @@ # of cells using dimension reduction techniques # ######################################################## -## @knitr ldr +# .......................................................................................................... +## @knitr HTODemux +# .......................................................................................................... + +# if(as.logical(MULTIPLEX) == TRUE){ +# # Demultiplexing data +# seurat_obj <- HTODemux(seurat_obj, assay = "HTO", positive.quantile = 0.99) +# cat("

Removed cells after filtering Doublet et Negative:", sum(seurat_obj[["hash.ID"]] == "Negative" | seurat_obj[["hash.ID"]] == "Doublet")); +# cat("

Remaining cells after filtering:", sum(seurat_obj$hash.ID %in% HTO)); +# cat("\n
\n"); +# seurat_obj <- subset(seurat_obj, idents = c("Doublet","Negative"), invert = TRUE) ### Keep only the Singlet cells (Remove Negative and Doublet) + +# seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = "mean.var.plot") +# seurat_obj <- ScaleData(seurat_obj, features = VariableFeatures(seurat_obj)) +# seurat_obj <- RunPCA(seurat_obj) +# seurat_obj <- RunUMAP(seurat_obj, dims = 1:25) +# print(DimPlot(seurat_obj)) +# } -# Scales and centers features in the dataset -seurat_obj = ScaleData( object = seurat_obj, - verbose = .VERBOSE) +# .......................................................................................................... +## @knitr ldr +# .......................................................................................................... # Perform linear dimensional reduction -seurat_obj <- RunPCA(seurat_obj, features = VariableFeatures(object = seurat_obj)) +seurat_obj <- RunPCA(seurat_obj, assay = "SCT", verbose = FALSE) VizDimLoadings(seurat_obj, dims = 1:DIMS, reduction = "pca") # DimPlot(seurat_obj, reduction = "pca", group.by = "orig.ident") # DimPlot(seurat_obj, reduction = "pca", group.by = "categorie") DimHeatmap(seurat_obj, dims = 1:DIMS, cells = 500, balanced = TRUE) +# .......................................................................................................... ## @knitr elbowplot +# .......................................................................................................... + ElbowPlot(seurat_obj, ndims = DIMS) +# https://hbctraining.github.io/scRNA-seq/lessons/elbow_plot_metric.html +pct <- seurat_obj[["pca"]]@stdev / sum(seurat_obj[["pca"]]@stdev) * 100 +cumu <- cumsum(pct) +co1 <- which(cumu > 90 & pct < 5)[1] +co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1 +pcs <- min(co1, co2) +# Create a dataframe with values +plot_df <- data.frame(pct = pct, + cumu = cumu, + rank = 1:length(pct)) + +# Elbow plot to visualize + ggplot(plot_df, aes(cumu, pct, label = rank, color = rank > pcs)) + + geom_text() + + geom_vline(xintercept = 90, color = "grey") + + geom_hline(yintercept = min(pct[pct > 5]), color = "grey") + + theme_bw() + # .......................................................................................................... ## @knitr heterogeneity_pca # .......................................................................................................... # Compute PCA on selected variable genes -nbPC = PCA_NPC -if(PCA_NPC > length(Cells(seurat_obj))) -{ - warning( paste0( "Number of cells in object (", length(Cells(seurat_obj)), ") smaller than requested number of PCs (", PCA_NPC,"), setting lower PC number..." )) - nbPC = length(Cells(seurat_obj)) -} seurat_obj <- RunPCA( object = seurat_obj, - features = VariableFeatures( seurat_obj), - npcs = nbPC, - verbose = .VERBOSE); - -# Plot PCA, highlighting seurat clusters for combination of dimensions -invisible( apply( combn( 1:PCA_PLOTS_NBDIMS, 2), 2, function(dims) -{ - print( DimPlot( object = seurat_obj, reduction = "pca", dims = dims, group.by = "orig.ident") + - theme( legend.position = "none")); -})); - -# .......................................................................................................... -## @knitr heterogeneity_pca_umisCounts -# .......................................................................................................... - -# Same PCA plots but highlight UMI counts -invisible( apply( combn( 1:PCA_PLOTS_NBDIMS, 2), 2, function(dims) -{ - print( FeaturePlot( seurat_obj, feature = "nCount_RNA", reduction = "pca", dims = dims) + - theme( legend.position = "none", - plot.margin = margin(0, 0, 0, 0, "cm"), - plot.title = element_blank())); -})); - -# .......................................................................................................... -## @knitr heterogeneity_pca_genesCounts -# .......................................................................................................... - -# Same PCA plots but highlight feature counts -invisible( apply( combn( 1:PCA_PLOTS_NBDIMS, 2), 2, function(dims) -{ - print( FeaturePlot( seurat_obj, feature = "nFeature_RNA", reduction = "pca", dims = dims) + - theme( legend.position = "none", - plot.margin = margin(0, 0, 0, 0, "cm"), - plot.title = element_blank())); -})); - -# .......................................................................................................... -## @knitr heterogeneity_pca_correlations -# .......................................................................................................... - -# Isolate the PCA location of cells in first dimensions, together with UMI and genes counts for correlation analysis (makes use of cbind recycling to repeat values for each stacked PC) -relationToPC = suppressWarnings( cbind( stack( as.data.frame( Embeddings( seurat_obj, reduction = "pca")[ rownames( seurat_obj[[]]), paste0( "PC_", 1:PCA_PLOTS_NBDIMS) ])), - seurat_obj[[ c( "nCount_RNA", "nFeature_RNA") ]], - Cluster = Idents( seurat_obj))); - -# Plot relationship of UMIs and genes counts with PCs (combine plots using '/' from 'patchwork' lib) -print( (ggplot( data = relationToPC, aes( x = values, y = nCount_RNA)) + - facet_wrap( ~ind) + - stat_binhex( bins = 60) + - #geom_point( aes(col = Cluster), alpha = 0.5) + - geom_smooth( method = 'lm') + - stat_cor( method = "spearman") + - ylab( "# UMIs") + - theme( axis.title.x = element_blank(), - axis.text.x = element_blank(), - axis.ticks.x = element_blank(), - plot.margin = unit( c( 1, 1, -0.5, 0.5), "lines"))) - / - (ggplot( data = relationToPC, aes( x = values, y = nFeature_RNA)) + - facet_wrap( ~ind) + - stat_binhex( bins = 60) + - #geom_point( aes(col = Cluster), alpha = 0.5) + - geom_smooth( method = 'lm') + - stat_cor( method = "spearman") + - xlab( "PC values") + - ylab( "# Genes"))); - -# .......................................................................................................... -## @knitr heterogeneity_pca_loadings -# .......................................................................................................... - -# Plot PCA loadings -invisible( apply( combn( 1:PCA_PLOTS_NBDIMS, 2), 2, function(dims) -{ - namesPC=paste0( "PC_", dims); - # Get the loading values for concerned PCs - loadingsMatrix = Loadings( seurat_obj, reduction = "pca")[ , namesPC ]; - # Sort features by average absolute value and convert as DF with features names as column - loadingsMatrix = head( loadingsMatrix[ order( apply( loadingsMatrix, 1, function(x){ mean( abs( x)) }), decreasing = TRUE), ], PCA_PLOTS_NBFEATURES); - loadingsDF = data.frame( loadingsMatrix, features = rownames( loadingsMatrix)); - - # Define symmetric and consistent axes for group of plots - axesLimit = max( abs( loadingsMatrix)); - - # Plot arrows and features name - print( ggplot( data = loadingsDF, aes_( x = as.name( namesPC[1]), y = as.name( namesPC[2]))) + - coord_cartesian( xlim = c( -axesLimit, axesLimit), ylim = c( -axesLimit, axesLimit)) + - geom_text_repel( aes( label = features), max.iter = 10000) + #, max.overlaps = Inf - geom_segment( x = 0 , y = 0, aes_( xend = as.name(namesPC[1]), yend = as.name(namesPC[2])), col = "#00000044", arrow = arrow( length = unit( 2, "mm")))); -})); + npcs = pcs, + verbose = .VERBOSE, + assay = "SCT"); diff --git a/03_Script/04_cell_heterogeneity_tSNE_UMAP.R b/03_Script/04_cell_heterogeneity_tSNE_UMAP.R index 20d8212..8ffc74b 100644 --- a/03_Script/04_cell_heterogeneity_tSNE_UMAP.R +++ b/03_Script/04_cell_heterogeneity_tSNE_UMAP.R @@ -7,21 +7,14 @@ ## @knitr heterogeneity_dimReduc # .......................................................................................................... -nbPC_dimreduc = DIMREDUC_USE_PCA_NBDIMS -if(DIMREDUC_USE_PCA_NBDIMS > nbPC) -{ - warning( paste0( "Number of computed PCs (", nbPC, ") smaller than requested PCs for 'dimreduc' (", DIMREDUC_USE_PCA_NBDIMS,"), setting lower PC number (", nbPC, ")..." )) - nbPC_dimreduc = nbPC -} - #........................................ # Dimensional reduction (tSNE / UMAP) #........................................ # Runs the Uniform Manifold Approximation and Projection (UMAP) dimensional reduction technique -seurat_obj = RunUMAP( seurat_obj, dims = 1:nbPC_dimreduc); +seurat_obj = RunUMAP( seurat_obj, dims = 1:pcs); # Run t-SNE dimensionality reduction on selected features -seurat_obj = RunTSNE( seurat_obj, dims = 1:nbPC_dimreduc); +seurat_obj = RunTSNE( seurat_obj, dims = 1:pcs); # Save resulting coordinates for all cells as 'tsv' files # write.table( Embeddings(seurat_obj, reduction = "umap"), @@ -42,23 +35,6 @@ seurat_obj = RunTSNE( seurat_obj, dims = 1:nbPC_dimreduc); ## @knitr dimreduc_ggplot_covariables # .......................................................................................................... -#........................................ -# UMAP -#........................................ -# # Plot the cells sample of origin in UMAP -# DimPlot( seurat_obj, reduction = ifelse(exists("useReduction"), useReduction, "umap"), -# group.by = "orig.ident") + -# ggtitle( "Map of cells by sample of origin") - - -# # if QC EXPLORATION MODE is active, plot if the cells are outliers or not -# if( QC_EXPLORATION_MODE == TRUE){ -# print( DimPlot( seurat_obj, reduction = ifelse(exists("useReduction"), useReduction, "umap"), -# group.by = "outlier") + -# ggtitle( "Map of cells by \nQC filtering status") -# ) -# } - # Plot the cells percentage of ribosomal genes in UMAP FeaturePlot( seurat_obj, reduction = ifelse(exists("useReduction"), useReduction, "umap"), features = "percent.rb") + @@ -119,23 +95,6 @@ if( QC_EXPLORATION_MODE == TRUE){ ## @knitr dimreduc_ggplot_covariables_tSNE # .......................................................................................................... -#........................................ -# TSNE -#........................................ -# # Plot the cells sample of origin in tSNE -# DimPlot( seurat_obj, reduction = ifelse(exists("useReduction"), useReduction, "tsne"), -# group.by = "orig.ident") + -# ggtitle( "Map of cells by sample of origin") - - -# # if QC EXPLORATION MODE is active, plot if the cells are outliers or not -# if( QC_EXPLORATION_MODE == TRUE){ -# print( DimPlot( seurat_obj, reduction = ifelse(exists("useReduction"), useReduction, "tsne"), -# group.by = "outlier") + -# ggtitle( "Map of cells by \nQC filtering status") -# ) -# } - # Plot the cells percentage of ribosomal genes in tSNE FeaturePlot( seurat_obj, reduction = ifelse(exists("useReduction"), useReduction, "tsne"), features = "percent.rb") + diff --git a/03_Script/05_cell_heterogeneity_clustering.R b/03_Script/05_cell_heterogeneity_clustering.R index 82ea05d..d58bb37 100644 --- a/03_Script/05_cell_heterogeneity_clustering.R +++ b/03_Script/05_cell_heterogeneity_clustering.R @@ -12,19 +12,11 @@ source(file.path(dirname(DIRECTORY), "03_Script/00_general_deps.R")) #........................................ # Identify clusters #........................................ -# Identify clusters of cells by graph approach -nbPC_findclusters = FINDCLUSTERS_USE_PCA_NBDIMS -if(FINDCLUSTERS_USE_PCA_NBDIMS > nbPC) -{ - warning( paste0( "Number of computed PCs (", nbPC, ") smaller than requested PCs for 'findclusters' (", FINDCLUSTERS_USE_PCA_NBDIMS,"), setting lower PC number (", nbPC, ")..." )) - nbPC_findclusters = nbPC -} # Computes the k.param nearest neighbors for a given dataset seurat_obj <- FindNeighbors(object = seurat_obj, - k.param = FINDNEIGHBORS_K, reduction = "pca", - dims = 1:nbPC_findclusters, + dims = 1:pcs, verbose = .VERBOSE); # Identify clusters of cells by a shared nearest neighbor (SNN) modularity optimization based clustering algorithm @@ -294,26 +286,26 @@ if( exists( "CLUSTER_GROUP_LIST")){ # Save each seurat object saveRDS(seurat_obj, file = file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID,"_filtered_seurat_object.rds"))) -# save a tsv file for each sample -write.table( seurat_obj[[]], - file= file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_filtered_seurat_object.tsv")), - quote = FALSE, - row.names = TRUE, - col.names = NA, # Add a blank column name for row names (CSV convention) - sep="\t"); - -# Save the count matrix as csv file -count_matrix <- GetAssayData(object = seurat_obj[["RNA"]], slot = "counts") -count_matrix <- as.matrix(count_matrix) -count_matrix <- t(count_matrix) -count_matrix <- as.data.frame(count_matrix) - -fwrite(x = count_matrix, file = file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_count_matrix.csv")), row.names = TRUE) - -# Save the normalize matrix as csv file -normalized_matrix <- GetAssayData(object = seurat_obj[["RNA"]], slot = "data") -normalized_matrix <- as.matrix(normalized_matrix) -normalized_matrix <- t(normalized_matrix) -normalized_matrix <- as.data.frame(normalized_matrix) - -fwrite(x = normalized_matrix , file = file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv")), row.names = TRUE) +# # save a tsv file for each sample +# write.table( seurat_obj[[]], +# file= file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_filtered_seurat_object.tsv")), +# quote = FALSE, +# row.names = TRUE, +# col.names = NA, # Add a blank column name for row names (CSV convention) +# sep="\t"); + +# # Save the count matrix as csv file +# count_matrix <- GetAssayData(object = seurat_obj[["RNA"]], slot = "counts") +# count_matrix <- as.matrix(count_matrix) +# count_matrix <- t(count_matrix) +# count_matrix <- as.data.frame(count_matrix) + +# fwrite(x = count_matrix, file = file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_count_matrix.csv")), row.names = TRUE) + +# # Save the normalize matrix as csv file +# normalized_matrix <- GetAssayData(object = seurat_obj[["RNA"]], slot = "data") +# normalized_matrix <- as.matrix(normalized_matrix) +# normalized_matrix <- t(normalized_matrix) +# normalized_matrix <- as.data.frame(normalized_matrix) + +# fwrite(x = normalized_matrix , file = file.path( OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv")), row.names = TRUE) diff --git a/03_Script/06_markers.R b/03_Script/06_markers.R new file mode 100644 index 0000000..40fca2f --- /dev/null +++ b/03_Script/06_markers.R @@ -0,0 +1,209 @@ +# .......................................................................................................... +## @knitr GlutN +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Neurod2","Sv2b", "Nrn1", "Tbr1", "Satb2")) +) + +# .......................................................................................................... +## @knitr Cajal_Retzius_cells_markers +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Reln")) # Ndnf ? +) + +# .......................................................................................................... +## @knitr L2_3_IT +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Cux2","Stard8", "Lpl", "Prex1")) +) + +# .......................................................................................................... +## @knitr L4_IT_SSC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Rorb","Rspol", "Kcnab3")) +) + +# .......................................................................................................... +## @knitr L4_IT_PC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Rorb","S100b", "Ccdc3", "Scna7a")) +) + +# .......................................................................................................... +## @knitr L5_IT +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Hs3st2","Lvzl4", "Etv6")) +) + +# .......................................................................................................... +## @knitr L5_PT +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Fezf2","Oma1")) +) + +# .......................................................................................................... +## @knitr L5_6_NP +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Tshz2","Slc17a8", "Stard5", "Dkk2")) +) + +# .......................................................................................................... +## @knitr L6_IT +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Cryab","Plekha2", "1110032F04Rik")) +) + +# .......................................................................................................... +## @knitr L6_CT +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Foxp2","Jam3", "Sennc2")) +) + +# .......................................................................................................... +## @knitr L6b +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Cplx3","Ctgf", "Fbx17", "Clic5")) +) + +# .......................................................................................................... +## @knitr GABAN +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Gad1","Gad2", "Dlx1", "Dlx2")) +) + +# .......................................................................................................... +## @knitr Meis2 +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Meis2","Pbx3")) +) + +# .......................................................................................................... +## @knitr Lamp5_A7C_CNC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Lamp5","Innt1", "Cox6a2", "Igfb2")) +) + +# .......................................................................................................... +## @knitr Lamp5_A7C_NGC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Lamp5","Innt1", "Egln3")) +) + +# .......................................................................................................... +## @knitr Sncg_CCK_BC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Sncg","Npas3", "Frem1")) +) + +# .......................................................................................................... +## @knitr Vip_BP_BTC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Vip","Stat3")) +) + +# .......................................................................................................... +## @knitr Sst_T_MC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Sst","Nyap1", "Nudt11", "Igfb2")) +) + +# .......................................................................................................... +## @knitr Sst_fan_MC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Ptpn9","Exoc2", "Junb")) +) + +# .......................................................................................................... +## @knitr Sst_NMC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Ptpn9","Exoc2")) +) + +# .......................................................................................................... +## @knitr Sst_IVC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Hpse","Paqr8", "Hs3st1")) +) + +# .......................................................................................................... +## @knitr Sst_FS_like +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Atpla2","Prkg2")) +) + +# .......................................................................................................... +## @knitr Sst_Chold_LPC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Zfhx3","Isl1", "Six3")) +) + +# .......................................................................................................... +## @knitr Pvalb_CHC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Sp9","Rprm", "Gnb2l1", "Id3")) +) + +# .......................................................................................................... +## @knitr Pvalb_FS_BC +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Pvalb","Tafa2", "Tac1")) +) + +# .......................................................................................................... +## @knitr astro_markers +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Apoe","Aldh1l1","Slc1a3")) +) + +# .......................................................................................................... +## @knitr oligo_markers +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Pdgfra","Olig1","Olig2")) +) + +# .......................................................................................................... +## @knitr microglia_markers +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Aif1","Tmem119")) +) + +# .......................................................................................................... +## @knitr Endothelial_cells_markers +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Cldn5")) +) + +# .......................................................................................................... +## @knitr Pericyte_markers +# .......................................................................................................... +print( + FeaturePlot(seurat_obj, features = c("Pdgfrb")) +) \ No newline at end of file diff --git a/03_Script/DEG_analysis.R b/03_Script/DEG_analysis.R new file mode 100644 index 0000000..b573c70 --- /dev/null +++ b/03_Script/DEG_analysis.R @@ -0,0 +1,140 @@ +#======================================================================== +# DEG analysis +#======================================================================== + +#===================================== +# Library +#===================================== +library(Seurat) +library(ggplot2) +library(dplyr) +# library(ggrepel) +library(data.table) + +#===================================== +# Path to file and folder +#===================================== +DIRECTORY = getwd() +OUTPUTDIR = file.path((DIRECTORY), "05_Output") + +TIME_POINT = snakemake@params[["time_point"]] + +STEP2 = "02_seurat/" +STEP4 = "04_diffexp/" + +dir.create(file.path(OUTPUTDIR, STEP4)) + +#======================================================================== +# Load the data +#======================================================================== +WT <- readRDS(file = file.path(OUTPUTDIR, STEP2, paste0(TIME_POINT,"_WT/" ,TIME_POINT, "_WT_seurat_labelTransfert.rds"))) +KO <- readRDS(file = file.path(OUTPUTDIR, STEP2, paste0(TIME_POINT,"_KO/" ,TIME_POINT, "_KO_seurat_labelTransfert.rds"))) + +# Merge the two condition together +merge_data <- merge(WT, y = KO, add.cell.ids = c("WT", "KO")) + +### DE analysis +Idents(merge_data) <- "SCT" + +#===================================== +# Per condition +#===================================== +merge_data <- PrepSCTFindMarkers(merge_data) +de_results_per_condition <- FindMarkers(merge_data, ident.1 = "WT", ident.2 = "KO", group.by = "condition", logfc.threshold = -Inf) + +de_results_per_condition$diffexpressed <- "Not significant" +de_results_per_condition$diffexpressed[de_results_per_condition$avg_log2FC > 0.6 & de_results_per_condition$p_val_adj < 0.05] <- "Upregulated" +de_results_per_condition$diffexpressed[de_results_per_condition$avg_log2FC < -0.6 & de_results_per_condition$p_val_adj < 0.05] <- "Downregulated" +table(de_results_per_condition$diffexpressed) + +de_results_per_condition$delabel <- NA +de_results_per_condition$delabel[de_results_per_condition$diffexpressed != "Not significant"] <- rownames(de_results_per_condition)[de_results_per_condition$diffexpressed != "Not significant"] + +# When pvalue = 0 for plotting +de_results_per_condition$plot_p_val_adj <- ifelse(de_results_per_condition$p_val_adj == 0, 1e-300, de_results_per_condition$p_val_adj) + +# plot adding up all layers we have seen so far +plot <- ggplot(data=de_results_per_condition, aes(x=avg_log2FC, y=-log10(plot_p_val_adj), col=diffexpressed, label=delabel)) + + geom_point() + + theme_minimal() + + geom_text_repel() + + scale_color_manual(values=c("blue", "black", "red")) + + geom_vline(xintercept=c(-0.6, 0.6), col="red") + + geom_hline(yintercept=-log10(0.05), col="red") + +# Save the file in one folder +ggsave(path = file.path(OUTPUTDIR, STEP4, TIME_POINT), filename = paste0("/", TIME_POINT, "_volcanoplot_per_condition.pdf"), plot = plot, device = "pdf") +fwrite(x = de_results_per_condition, file = file.path(OUTPUTDIR, STEP4, TIME_POINT, paste0("/", TIME_POINT, "_diffexp_per_condition.csv")), row.names = TRUE) + +#===================================== +# Per cluster +#===================================== +# Function to have minimum 3 cells per label +filter_cell_types <- function(seurat_obj, min_cells = 3) { + celltype_counts <- table(seurat_obj$supertype_label) + valid_celltypes <- names(celltype_counts[celltype_counts >= min_cells]) + seurat_obj_filtered <- subset(seurat_obj, subset = supertype_label %in% valid_celltypes) + return(seurat_obj_filtered) +} + +WT <- filter_cell_types(WT) +KO <- filter_cell_types(KO) + +### Keep only the cells type that are in both condition +common_cell_types <- intersect(unique(KO$supertype_label), unique(WT$supertype_label)) + +KO <- subset(KO, subset = supertype_label %in% common_cell_types) +WT <- subset(WT, subset = supertype_label %in% common_cell_types) + +merge_data <- merge(WT, y = KO, add.cell.ids = c("WT", "KO")) + +de_list_per_cluster <- list() + +for (cell_type in common_cell_types) { + print(cell_type) + cell_type_subset <- subset(merge_data, subset = supertype_label == cell_type) + print(table(cell_type_subset$orig.ident)) + cell_type_subset <- PrepSCTFindMarkers(cell_type_subset) + de_results <- FindMarkers(cell_type_subset, ident.1 = "WT", ident.2 = "KO", group.by = "condition", logfc.threshold = -Inf) + de_list_per_cluster[[cell_type]] <- de_results +} + +# Assuming de_list_per_cluster is your list containing data frames for each cell type +for (cell_type in names(de_list_per_cluster)) { + de_results_per_cluster <- de_list_per_cluster[[cell_type]] + + # Classify differentially expressed genes + de_results_per_cluster$diffexpressed <- "Not significant" + de_results_per_cluster$diffexpressed[de_results_per_cluster$avg_log2FC > 0.6 & de_results_per_cluster$p_val_adj < 0.05] <- "Upregulated" + de_results_per_cluster$diffexpressed[de_results_per_cluster$avg_log2FC < -0.6 & de_results_per_cluster$p_val_adj < 0.05] <- "Downregulated" + + # Display the table of diffexpressed values + print(table(de_results_per_cluster$diffexpressed)) + + # Label differentially expressed genes + de_results_per_cluster$delabel <- NA + de_results_per_cluster$delabel[de_results_per_cluster$diffexpressed != "Not significant"] <- rownames(de_results_per_cluster)[de_results_per_cluster$diffexpressed != "Not significant"] + + de_results_per_cluster$plot_p_val_adj <- ifelse(de_results_per_cluster$p_val_adj == 0, 1e-300, de_results_per_cluster$p_val_adj) + + # Store the modified data frame back in the list + de_list_per_cluster[[cell_type]] <- de_results_per_cluster + + plot <- ggplot(data=de_results_per_cluster, aes(x=avg_log2FC, y=-log10(plot_p_val_adj), col=diffexpressed, label=delabel)) + + geom_point() + + theme_minimal() + + # geom_text_repel() + + scale_color_manual(values=c("blue", "black", "red")) + + geom_vline(xintercept=c(-0.6, 0.6), col="red") + + geom_hline(yintercept=-log10(0.05), col="red") + + labs(title=paste("Volcano Plot for", cell_type)) + + # Save the plot + ggsave(path = file.path(OUTPUTDIR, STEP4, TIME_POINT), filename = paste0("/", TIME_POINT, "_volcanoplot_", cell_type, ".pdf"), plot = plot, device = "pdf") + fwrite(x = de_results_per_cluster, file = file.path(OUTPUTDIR, STEP4, TIME_POINT, paste0("/", TIME_POINT, "_diffexp_", cell_type, ".csv")), row.names = TRUE) +} + +TEXT_OUTPUT <- snakemake@output[["seurat_DEG_output"]] +output_file<-file(TEXT_OUTPUT) +writeLines(c("Rules seurat DE finished"), output_file) +close(output_file) \ No newline at end of file diff --git a/03_Script/allen_mouse_whole_cortex.R b/03_Script/allen_mouse_whole_cortex.R index f455df2..acc72f8 100644 --- a/03_Script/allen_mouse_whole_cortex.R +++ b/03_Script/allen_mouse_whole_cortex.R @@ -8,21 +8,25 @@ #------------------------------------- library(rhdf5) library(data.table) +library(Seurat) +library(dplyr) #------------------------------------- # Path to files / folders #------------------------------------- DIRECTORY = getwd() TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]] -OUTPUTDIR = file.path(dirname(DIRECTORY), "05_Output") -REF = file.path(dirname(DIRECTORY), "01_Reference") +OUTPUTDIR = file.path((DIRECTORY), "05_Output") +REF = file.path((DIRECTORY), "01_Reference") SAMPLE_ID = snakemake@params[["sample_id"]] +REFERENCE_NAME = snakemake@params[["reference_name"]] + +NORM_METHOD = snakemake@params[["norm_method"]] +NORM_SCALE_FACTOR = snakemake@params[["norm_scale_factor"]] ALLEN_METADATA = snakemake@params[["allen_metadata"]] ALLEN_MATRIX = snakemake@params[["allen_matrix"]] -ALLEN_GENES = snakemake@params[["allen_genes"]] -ALLEN_CELLS = snakemake@params[["allen_cells"]] OUTPUT_NAME_REF_METADATA = snakemake@params[["output_name_ref_metadata"]] OUTPUT_NAME_REF_MATRIX = snakemake@params[["output_name_ref_matrix"]] @@ -30,90 +34,248 @@ OUTPUT_NAME_MATRIX = snakemake@params[["output_name_matrix"]] STEP2 = "02_seurat/" STEP3 = "03_sims/" +STEP_REMI = "03_remi/" + +METADATA_REMI <- TRUE #------------------------------------- -# Load the metadata of the reference +# Library #------------------------------------- -reference_metadata <- fread(file = file.path(REF, ALLEN_METADATA)) +library(rhdf5) +library(data.table) +library(Seurat) +library(dplyr) -########################################## PROPRE AU PROJET ########################################## #------------------------------------- -# Keep only some part of the cortex -# we want to study: Ssp +# Path to files / folders #------------------------------------- -reference_metadata <- reference_metadata[reference_metadata$region_label %in% "Ssp",] +DIRECTORY = getwd() +TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]] +OUTPUTDIR = file.path((DIRECTORY), "05_Output") +REF = file.path((DIRECTORY), "01_Reference") -###################################################################################################### +SAMPLE_ID = snakemake@params[["sample_id"]] +REFERENCE_NAME = snakemake@params[["reference_name"]] -#------------------------------------- -# Load the matrix of reference -#------------------------------------- -reference_matrix <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/counts") -reference_genes <- h5read(file = file.path(REF, ALLEN_GENES), "/data/gene") -reference_cells <- h5read(file = file.path(REF, ALLEN_CELLS), "/data/samples") +NORM_METHOD = snakemake@params[["norm_method"]] +NORM_SCALE_FACTOR = snakemake@params[["norm_scale_factor"]] -rownames(reference_matrix) <- as.character(reference_cells) -colnames(reference_matrix) <- as.character(reference_genes) +ALLEN_METADATA = snakemake@params[["allen_metadata"]] +ALLEN_MATRIX = snakemake@params[["allen_matrix"]] -rm(list = c("reference_genes","reference_cells")) -gc() +OUTPUT_NAME_REF_METADATA = snakemake@params[["output_name_ref_metadata"]] +OUTPUT_NAME_REF_MATRIX = snakemake@params[["output_name_ref_matrix"]] +OUTPUT_NAME_MATRIX = snakemake@params[["output_name_matrix"]] + +STEP2 = "02_seurat/" +STEP3 = "03_sims/" +STEP_REMI = "03_remi/" -#------------------------------------- -# Keep cells thaht match both the -# metadata an matrix of reference -#------------------------------------- -reference_matrix <- reference_matrix[rownames(reference_matrix) %in% reference_metadata$sample_name,] -reference_metadata <- reference_metadata[reference_metadata$sample_name %in% rownames(reference_matrix),] +METADATA_REMI <- TRUE -#------------------------------------- -# Order metadata according to matrix -# (Necessary ?) -#------------------------------------- -reference_metadata <- reference_metadata[match(rownames(reference_matrix),reference_metadata$sample_name),] -#------------------------------------- -# Wtrite the the metadata as csv -#------------------------------------- -fwrite(x = reference_metadata, file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_METADATA, ".csv"))) +metadata <- fread(file = file.path(REF, ALLEN_METADATA)) +print("Reference metadata loaded. It's size is :") +print(dim(metadata)) -rm("reference_metadata") -gc() +metadata <- metadata[metadata$region_label %in% "SSp",] +print("Only SSp are kept in the metadata: The new size of this one is :") +print(dim(metadata)) -#------------------------------------- -# Load our matrix to annotate -#------------------------------------- -matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_count_matrix.csv"))) # The count matrix is used for this reference. -matrix <- as.matrix(matrix) -rownames(matrix) <- matrix[,"V1"] +# Load the HDF5 matrix +h5_file <- H5Fopen(file.path(REF, ALLEN_MATRIX)) +h5ls(h5_file) +expression_matrix <- h5read(h5_file, "/data/counts") +gene_names <- h5read(h5_file, "/data/gene") +sample_names <- h5read(h5_file, "/data/samples") +H5Fclose(h5_file) -#------------------------------------- -# Look at genes who are common in both -# matrix and subset genes who are nots -#------------------------------------- -gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix)) -reference_matrix <- reference_matrix[, gene_intersection] -matrix <- matrix[, gene_intersection] +colnames(expression_matrix) <- as.character(gene_names) # Si nécessaire +rownames(expression_matrix) <- as.character(sample_names) # Si nécessaire -#------------------------------------- -# Wtrite the two new matrix -#------------------------------------- -dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID)) +expression_matrix <- expression_matrix[rownames(expression_matrix) %in% metadata$sample_name,] +metadata <- metadata[metadata$sample_name %in% rownames(expression_matrix),] -class(matrix) <- "numeric" -write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv"))) +print("The metadata and matrix of reference have the same cells now : The new size of them two :") +print(dim(expression_matrix)) +print(dim(metadata)) -rm("matrix") gc() -class(reference_matrix) <- "numeric" -write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_MATRIX,".csv"))) +expression_matrix <- t(expression_matrix) +print("The matrix is transposed") + +gc() + +colnames(expression_matrix) <- gsub("_", "-", original_sample_names) + +seurat_object <- CreateSeuratObject(counts = expression_matrix, project = "allen_seuratObject") +saveRDS(seurat_object, file = file.path(REF, "allen_seurat_object.rds")) + +rm("seurat_object") -rm("reference_matrix") gc() +#### Pour une raison que j'ignore cette partie ne marche pas, je dois le faire en local + +seurat_object <- readRDS(file = file.path(REF, "allen_seurat_object.rds")) + +metadata$sample_name <- gsub("_", "-", metadata$sample_name) +metadata <- metadata[match(colnames(seurat_object), metadata$sample_name), ] +seurat_object <- AddMetaData(object = seurat_object, metadata = metadata) + +saveRDS(seurat_object, file = file.path(REF, "allen_seurat_object_with_metadata.rds")) + + + +# #------------------------------------- +# # Load the metadata of the reference +# #------------------------------------- + +# if(METADATA_REMI){ +# reference_metadata <- fread(file = file.path(OUTPUTDIR, STEP_REMI, "final_allen.csv")) +# cells_names <- "sample_name" +# celltypes <- "celltype_label" +# } else { +# reference_metadata <- fread(file = file.path(REF, ALLEN_METADATA)) +# cells_names <- "sample_name" +# celltypes <- "subclass_label" +# } + +# print("Reference metadata loaded. It's size is :") +# print(dim(reference_metadata)) + +# ########################################## PROPRE AU PROJET ########################################## +# #------------------------------------- +# # Keep only some part of the cortex +# # we want to study: Ssp +# #------------------------------------- +# if(!METADATA_REMI){ +# reference_metadata <- reference_metadata[reference_metadata$region_label %in% "SSp",] +# print("Only SSp are kept in the metadata: The new size of this one is :") +# print(dim(reference_metadata)) +# } +# ###################################################################################################### + +# #------------------------------------- +# # For the training we need at least +# # two cells per labels +# #------------------------------------- +# occurence <- table(reference_metadata[[celltypes]]) +# one_cell_per_label <- names(occurence[occurence == 1]) +# reference_metadata <- reference_metadata[!(reference_metadata[[celltypes]] %in% one_cell_per_label), ] + +# print("The new size after removing labels thaht only appear once :") +# print(dim(reference_metadata)) + +# #------------------------------------- +# # Load the matrix of reference +# #------------------------------------- +# reference_matrix <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/counts") +# reference_genes <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/gene") +# reference_cells <- h5read(file = file.path(REF, ALLEN_MATRIX), "/data/samples") + +# rownames(reference_matrix) <- as.character(reference_cells) +# colnames(reference_matrix) <- as.character(reference_genes) + +# rm(list = c("reference_genes","reference_cells")) +# gc() + +# print("Reference matrix loaded. The size is :") +# print(dim(reference_matrix)) + +# #------------------------------------- +# # Keep cells that match both the +# # metadata and matrix of reference +# #------------------------------------- +# reference_matrix <- reference_matrix[rownames(reference_matrix) %in% reference_metadata[[cells_names]],] +# reference_metadata <- reference_metadata[reference_metadata[[cells_names]] %in% rownames(reference_matrix),] + +# print("The metadata and matrix of reference have the same cells now : The new size of them two :") +# print(dim(reference_matrix)) +# print(dim(reference_metadata)) +# print(reference_matrix[1:5, 1:25]) + +# #------------------------------------- +# # Order metadata according to matrix +# # (Necessary ?) +# #------------------------------------- +# reference_metadata <- reference_metadata[match(rownames(reference_matrix),reference_metadata[[cells_names]]),] +# print("Cells are in the same order in both files") + +# #------------------------------------- +# # Write the the metadata as csv +# #------------------------------------- +# dir.create(file.path(OUTPUTDIR, STEP3, REFERENCE_NAME)) + +# fwrite(x = reference_metadata, file = file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_METADATA, ".csv"))) + +# rm("reference_metadata") +# gc() + +# print("Metadata is written") + +# #------------------------------------- +# # Normalization of the matrix of +# # reference +# #------------------------------------- +# reference_matrix <- NormalizeData( object = reference_matrix, normalization.method = NORM_METHOD, scale.factor = NORM_SCALE_FACTOR, verbose = FALSE) +# reference_matrix <- as.matrix(reference_matrix) + +# print("The matrix of reference is now normalized the same way as our matrix to analyze: Here the first 10 rows and 10 columns :") +# print(reference_matrix[1:5, 1:25]) + +# #------------------------------------- +# # Load our matrix to annotate +# #------------------------------------- +# dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID)) + +# matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv"))) +# matrix <- as.matrix(matrix) +# rownames(matrix) <- matrix[,"V1"] +# matrix <- matrix[, -1] + +# print("Our matrix to annotate is loaded, see the dimension below :") +# print(dim(matrix)) + +# # ------------------------------------- +# # Wtrite the two new matrix +# # ------------------------------------- +# class(matrix) <- "numeric" +# write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv"))) +# print("Our matrix is written") + +# rm("matrix") +# gc() + +# class(reference_matrix) <- "numeric" +# write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_MATRIX,".csv"))) +# print("Our matrix of reference is written") + +# rm("reference_matrix") +# gc() + #------------------------------------- # create the output file #------------------------------------- output_file<-file(TEXT_OUTPUT) -writeLines(c("Files preparation for the Arlotta reference for SIMS finished (CSV format)"), output_file) +writeLines(c("Files preparation for the Allen reference for SIMS finished (CSV format)"), output_file) close(output_file) + + + + + +# #------------------------------------- +# # Look at genes who are common in both +# # matrix and subset genes who are nots +# #------------------------------------- +# gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix)) +# reference_matrix <- reference_matrix[, gene_intersection] +# matrix <- matrix[, gene_intersection] + +# print("Only the gene in common in booth our matrix are kept : The new dimmension :") +# print(dim(reference_matrix)) +# print(colnames(reference_matrix)[1:35]) +# print(dim(matrix)) +# print(colnames(matrix)[1:35]) \ No newline at end of file diff --git a/03_Script/arlotta.R b/03_Script/arlotta.R index 8ca4d32..72f2f59 100755 --- a/03_Script/arlotta.R +++ b/03_Script/arlotta.R @@ -3,7 +3,7 @@ # Paola Arlotta reference : # https://www.nature.com/articles/s41586-021-03670-5 #.............................................................. -########################################################################### + #------------------------------------- # Library #------------------------------------- @@ -12,7 +12,7 @@ library(data.table) library(Seurat) #------------------------------------- -# Path to files / folderss +# Path to files / folders #------------------------------------- DIRECTORY = getwd() TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]] @@ -21,6 +21,7 @@ REF = file.path((DIRECTORY), "01_Reference") DEVELOPMENTAL_TIME = snakemake@params[["developmental_time"]] SAMPLE_ID = snakemake@params[["sample_id"]] +REFERENCE_NAME = snakemake@params[["reference_name"]] NORM_METHOD = snakemake@params[["norm_method"]] NORM_SCALE_FACTOR = snakemake@params[["norm_scale_factor"]] @@ -31,30 +32,52 @@ OUTPUT_NAME_MATRIX = snakemake@params[["output_name_matrix"]] STEP2 = "02_seurat/" STEP3 = "03_sims/" +STEP_REMI = "03_remi/" + +METADATA_REMI <- TRUE #------------------------------------- # Load the metadata of the reference #------------------------------------- -reference_metadata <- fread(file = file.path(REF, "arlotta/metaData_scDevSC.txt")) -reference_metadata <- reference_metadata[-1, ] -reference_metadata <- reference_metadata[, NEW_NAME := gsub("[^ATCG]", "", NAME)] ### Keep only the A,T,G and C in the original name +if(METADATA_REMI){ + reference_metadata <- fread(file = file.path(OUTPUTDIR, STEP_REMI, "final_arlotta.csv")) + cells_names <- "sample_name" + celltypes <- "celltype_label" + column_id <- "biosample_id" +} else { + reference_metadata <- fread(file = file.path(REF, "arlotta/metaData_scDevSC.txt")) + reference_metadata <- reference_metadata[-1, ] + cells_names <- "NAME" + celltypes <- "New_cellType" + column_id <- "orig_ident" +} + +reference_metadata <- reference_metadata[, NEW_NAME := gsub("[^ATCG]", "", get(cells_names))] ### Keep only the A,T,G and C in the original name +print("The metadata is loaded, seee his dimension below :") +print(dim(reference_metadata)) +print(head(reference_metadata)) #------------------------------------- # For the training we need at least # two cells per labels #------------------------------------- -occurence <- table(reference_metadata$New_cellType) +occurence <- table(reference_metadata[[celltypes]]) one_cell_per_label <- names(occurence[occurence == 1]) -reference_metadata <- reference_metadata[!(reference_metadata$New_cellType %in% one_cell_per_label), ] +reference_metadata <- reference_metadata[!(reference_metadata[[celltypes]] %in% one_cell_per_label), ] + +print("The new dimension after removing the labels taht only appear once :") +print(dim(reference_metadata)) #------------------------------------- # Erase some labels in the metadata : # Low quality cells, Doublet, # Red blood cells, group #------------------------------------- -patterns_to_delete <- c("Low quality cells", "Doublet", "Red blood cells", "group") -rows_to_delete <- apply(sapply(patterns_to_delete, grepl, reference_metadata$New_cellType), 1, any) -reference_metadata <- subset(reference_metadata, !rows_to_delete) +if (!METADATA_REMI){ + patterns_to_delete <- c("Low quality cells", "Doublet", "Red blood cells", "group") + rows_to_delete <- apply(sapply(patterns_to_delete, grepl, reference_metadata[[celltypes]]), 1, any) + reference_metadata <- subset(reference_metadata, !rows_to_delete) +} #------------------------------------- # Load the matrix of reference @@ -89,18 +112,22 @@ for (x in file_to_open){ list_reference_matrix[[x]] <- Read10X_h5(filepath[[x]]) } +print("The diferent matrix for diferent developmental time have been loaded in a list") + #------------------------------------- # Normalization + Transpose them # (cell in rows) + Transform them # in matrix #------------------------------------- -list_reference_matrix <- lapply(list_reference_matrix, function(x){ - normalized_matrix <- NormalizeData( object = x, normalization.method = NORM_METHOD, scale.factor = NORM_SCALE_FACTOR, verbose = FALSE) -}) +# list_reference_matrix <- lapply(list_reference_matrix, function(x){ +# normalized_matrix <- NormalizeData( object = x, normalization.method = NORM_METHOD, scale.factor = NORM_SCALE_FACTOR, verbose = FALSE) +# }) list_reference_matrix <- lapply(list_reference_matrix, function(x) as.matrix(x)) list_reference_matrix <- lapply(list_reference_matrix, function(x) t(x)) +# print("The matrix have been normalized") + #------------------------------------- # Keep only A,T,G and C in every # matrix for the cells name (to match @@ -116,54 +143,68 @@ list_reference_matrix <- lapply(list_reference_matrix, clean_rownames) #------------------------------------- # Keep cells that match both the -# metadata an matrix of reference +# metadata and matrix of reference #------------------------------------- metadata <- data.table() for(x in names(list_reference_matrix)){ - if(x %in% reference_metadata$orig_ident){ - rows_metadata <- reference_metadata[orig_ident == x] # Look in the metadata and keep only the rows that corespond to the matrix (Keep P4 rows for the P4 matrix for exemple) - cell_metadata <- rows_metadata$NEW_NAME # Cells in the metadata - cell_reference_matrix <- rownames(list_reference_matrix[[x]]) # Cells in the metadata - matching_cells <- intersect(cell_metadata, cell_reference_matrix) # Look at cells that match - temporary_metadata <- subset(rows_metadata, NEW_NAME %in% matching_cells) # Create a temporary metadata for each matrix whith only the cells that match and remove the others - temporary_metadata$NEW_NAME <- paste(x, temporary_metadata$NEW_NAME, sep="_") # Add the develomental age in front of cell name (some diferent developmental point have same name for cells, that is why) - metadata <- rbind(metadata, temporary_metadata) # Append the for each matrix the new metadata to the final one - list_reference_matrix[[x]] <- list_reference_matrix[[x]][matching_cells, ] # Now keep only the mathing cell in the matrix and remove the others - rownames(list_reference_matrix[[x]]) <- paste(x, rownames(list_reference_matrix[[x]]), sep = "_") # Add the develomental age in front of cell name (Like this also names still match metadata) + print("The developmental time :") + print(x) + if(x %in% reference_metadata[[column_id]]){ + rows_metadata <- reference_metadata[reference_metadata[[column_id]] == x] # Look in the metadata and keep only the rows that corespond to the matrix (Keep P4 rows for the P4 matrix for exemple) + print("The dimension of the metadata for this develompental time :") + print(dim(rows_metadata)) + cell_metadata <- rows_metadata$NEW_NAME # Cells in the metadata + cell_reference_matrix <- rownames(list_reference_matrix[[x]]) # Cells in the matrix + matching_cells <- intersect(cell_metadata, cell_reference_matrix) # Look at cells that match + temporary_metadata <- subset(rows_metadata, NEW_NAME %in% matching_cells) # Create a temporary metadata for each matrix whith only the cells that match and remove the others + temporary_metadata$NEW_NAME <- paste(x, temporary_metadata$NEW_NAME, sep="_")# Add the develomental age in front of cell name (some diferent developmental point have same name for cells, that is why) + print("The new dimension of the metadata after keeping only cells that match with the matrix :") + print(dim(temporary_metadata)) + metadata <- rbind(metadata, temporary_metadata) # Append the for each matrix the new metadata to the final one + list_reference_matrix[[x]] <- list_reference_matrix[[x]][matching_cells, ] # Now keep only the mathing cell in the matrix and remove the others + rownames(list_reference_matrix[[x]]) <- paste(x, rownames(list_reference_matrix[[x]]), sep = "_") # Add the develomental age in front of cell name (Like this also names still match metadata) } } +print("The final size of the metadata : ") +print(dim(metadata)) + reference_matrix <- do.call(rbind, list_reference_matrix) +print("The size of the matrix of reference : ") +print(dim(reference_matrix)) +print("Here the first 10 rows and 10 columns :") +print(reference_matrix[1:10, 1:10]) + #------------------------------------- # Load our matrix to annotate #------------------------------------- -matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv"))) # The normalized matrix is used for this reference. +matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_count_matrix.csv"))) # The normalized matrix is used for this reference. matrix <- as.matrix(matrix) rownames(matrix) <- matrix[,"V1"] +matrix <- matrix[, -1] -#------------------------------------- -# Look at genes who are common in both -# matrix and subset genes who are not -#------------------------------------- -gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix)) -reference_matrix <- reference_matrix[, gene_intersection] -matrix <- matrix[, gene_intersection] +print("Our matrix to annotate is loaded, here the first rows : ") +print(matrix[1:10, 1:10]) #------------------------------------- # Wtrite the two new matrix + # create directory #------------------------------------- -dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID)) - +dir.create(file.path(OUTPUTDIR, STEP3, REFERENCE_NAME)) class(reference_matrix) <- "numeric" -write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_MATRIX,".csv"))) +write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_MATRIX,".csv")), row.names = TRUE) +print("The reference matrix is written") + +dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID)) class(matrix) <- "numeric" write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv"))) +print("The matrix is written") -fwrite(x = metadata, file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_METADATA,".csv"))) +fwrite(x = metadata, file = file.path(OUTPUTDIR, STEP3, REFERENCE_NAME, paste0(OUTPUT_NAME_REF_METADATA,".csv")), row.names = TRUE) +print("The metadaa is written") #------------------------------------- # create the output file @@ -172,104 +213,11 @@ output_file<-file(TEXT_OUTPUT) writeLines(c("Files preparation for the Arlotta reference for SIMS finished (CSV format)"), output_file) close(output_file) -###################################################################################################################################################################################################################################################### -# Old code -# #------------------------------------- -# # Path to files / folders -# #------------------------------------- -# DIRECTORY = getwd() -# TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]] -# OUTPUTDIR = file.path((DIRECTORY), "05_Output") -# REF = file.path((DIRECTORY), "01_Reference") -# PATTERN_TO_KEEP = snakemake@params[["pattern_to_keep"]] -# SAMPLE_ID = snakemake@params[["sample_id"]] -# ARLOTTA_METADATA = snakemake@params[["arlotta_metadata"]] -# ARLOTTA_MATRIX = snakemake@params[["arlotta_matrix"]] -# ARLOTTA_CELLS = snakemake@params[["arlotta_cells"]] -# ARLOTTA_FEATURES = snakemake@params[["arlotta_features"]] -# OUTPUT_NAME_REF_METADATA = snakemake@params[["output_name_ref_metadata"]] -# OUTPUT_NAME_REF_MATRIX = snakemake@params[["output_name_ref_matrix"]] -# OUTPUT_NAME_MATRIX = snakemake@params[["output_name_matrix"]] -# STEP2 = "02_seurat/" -# STEP3 = "03_sims/" - -# #------------------------------------- -# # Load the metadata of the reference -# #------------------------------------- -# reference_metadata <- fread(file = file.path(REF, ARLOTTA_METADATA)) - -# #------------------------------------- -# # Erase some labels in the metadata : -# # Low quality cells, Doublet, -# # Red blood cells, group -# #------------------------------------- -# patterns_to_delete <- c("Low quality cells", "Doublet", "Red blood cells", "group") -# rows_to_delete <- apply(sapply(patterns_to_delete, grepl, reference_metadata$New_cellType), 1, any) -# reference_metadata <- subset(reference_metadata, !rows_to_delete) - -# #------------------------------------- -# # Load the matrix of reference -# #------------------------------------- -# reference_matrix <- readMM(file = file.path(REF, ARLOTTA_MATRIX)) - -# feature.names = read.delim(file.path(REF, ARLOTTA_FEATURES), -# header = FALSE, -# stringsAsFactors = FALSE) - -# cells.names = read.delim(file.path(REF, ARLOTTA_CELLS), -# header = FALSE, -# stringsAsFactors = FALSE) - -# colnames(reference_matrix) = cells.names$V1 -# rownames(reference_matrix) = feature.names$V1 - -# # #------------------------------------- -# # # Transpose the matrix : -# # # Cells in row and genes in columns + -# # # Transform rownames and colnames -# # # as characters (for sims). -# # #------------------------------------- -# reference_matrix <- t(reference_matrix) -# reference_matrix <- as.matrix(reference_matrix) - -# colnames(reference_matrix) <- as.character(colnames(reference_matrix)) # Genes -# rownames(reference_matrix) <- as.character(rownames(reference_matrix)) # Cells - -# #------------------------------------- -# # Write a new matrix : Keep only -# # mouse age that are interesting -# # for the project -# #------------------------------------- -# pattern_to_keep <- c(PATTERN_TO_KEEP) -# rows_to_keep <- grepl(paste(pattern_to_keep, collapse = "|"), rownames(reference_matrix)) -# reference_matrix <- reference_matrix[rows_to_keep, ] - -# #------------------------------------- -# # Keep cells that match both the -# # metadata an matrix of reference -# #------------------------------------- -# column_metadata <- reference_metadata$NAME -# matching_cells <- intersect(rownames(reference_matrix), column_metadata) -# reference_metadata <- reference_metadata[reference_metadata$NAME %in% matching_cells, ] -# reference_matrix <- reference_matrix[matching_cells, ] - -# #------------------------------------- -# # Order metadata according to matrix -# # (Necessary ?) -# #------------------------------------- -# reference_metadata <- reference_metadata[match(rownames(reference_matrix),reference_metadata$NAME),] - -# #------------------------------------- -# # Load our matrix to annotate -# #------------------------------------- -# matrix <- fread(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0( SAMPLE_ID, "_normalized_matrix.csv"))) # The normalized matrix is used for this reference. -# matrix <- as.matrix(matrix) -# rownames(matrix) <- matrix[,"V1"] # #------------------------------------- # # Look at genes who are common in both @@ -278,24 +226,3 @@ close(output_file) # gene_intersection <- intersect(colnames(matrix), colnames(reference_matrix)) # reference_matrix <- reference_matrix[, gene_intersection] # matrix <- matrix[, gene_intersection] - -# #------------------------------------- -# # Wtrite the two new matrix + -# # create directory -# #------------------------------------- -# dir.create(file.path(OUTPUTDIR, STEP3, SAMPLE_ID)) - -# class(reference_matrix) <- "numeric" -# write.csv(reference_matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_MATRIX,".csv"))) - -# class(matrix) <- "numeric" -# write.csv(matrix, file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_MATRIX,".csv"))) - -# fwrite(x = reference_metadata, file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(OUTPUT_NAME_REF_METADATA,".csv"))) - -# #------------------------------------- -# # create the output file -# #------------------------------------- -# output_file<-file(TEXT_OUTPUT) -# writeLines(c("Files preparation for the Arlotta reference for SIMS finished (CSV format)"), output_file) -# close(output_file) \ No newline at end of file diff --git a/03_Script/imbalanced_datasets.py b/03_Script/imbalanced_datasets.py new file mode 100644 index 0000000..71d5b78 --- /dev/null +++ b/03_Script/imbalanced_datasets.py @@ -0,0 +1,200 @@ +#.............................................................. +# KNNOR : An oversampling technique for imalanced datasets +#.............................................................. + +#------------------------------------- +# Import +#------------------------------------- +from knnor import data_augment +from sklearn.preprocessing import LabelEncoder +import pandas as pd +import numpy as np +import sys +import os + +#------------------------------------- +# Path to files / folders +#------------------------------------- +DIRECTORY = os.getcwd() +OUTPUTDIR = os.path.join(DIRECTORY, "05_Output") + +SAMPLE_ID = sys.argv[1] +MATRIX = sys.argv[2] +METADATA = sys.argv[3] +CLASS_LABEL = sys.argv[4] +MAX_OVERSAMPLING = sys.argv[5] +CELLS_COLUMN = sys.argv[6] + +STEP3 = "03_sims/" + +#------------------------------------- +# Open metadata and matrix +#------------------------------------- +print("****************************************************************************************************************************************************************************************") +matrix = pd.read_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".csv")) +print("The original matrix :") +print(matrix) +print("") + +metadata = pd.read_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, METADATA + ".csv")) +print("The original metadata :") +print(metadata) +print("") + +#------------------------------------- +# Occurence of labels in metadata +#------------------------------------- +max_labels = metadata[CLASS_LABEL].value_counts() +print("The data will be increased to the level of the largest occurrence (see table below): ") +print(max_labels) +print("") + +#------------------------------------- +# Supress the top occurences in +# metadata if needed ! +#------------------------------------- +MAX_OVERSAMPLING = MAX_OVERSAMPLING.split(',') +MAX_OVERSAMPLING = '|'.join(MAX_OVERSAMPLING) + +new_metadata = metadata[~metadata[CLASS_LABEL].str.contains(MAX_OVERSAMPLING)] +print("The new metadata after supression of top occurence(s) :") +print(new_metadata) +print("") + +#------------------------------------- +# Delete cells in matrix that are not +# anymore in the metadata +#------------------------------------- +new_matrix = matrix[matrix.iloc[:, 0].isin(new_metadata[CELLS_COLUMN])] +print("The new matrix that match the new metadata :") +print(new_matrix) +print("") + +#------------------------------------- +# Check if cells are in the same order +# in booth file +#------------------------------------- +are_identical = new_matrix.iloc[:, 0].equals(new_metadata[CELLS_COLUMN]) + +if are_identical: + print("Column are in the same order") +else: + print("Column are not in the same order : it will be reorder") ### A faire !!! + + +#------------------------------------- +# Add the label column from the +# metadata at the end of the matrix +#------------------------------------- +new_matrix['Label'] = new_metadata[CLASS_LABEL].tolist() + +#------------------------------------- +# Put the label into numeric class +#------------------------------------- +label_encoder = LabelEncoder() +new_matrix['Label'] = label_encoder.fit_transform(new_matrix['Label']) + +# Create an asociate dictionary +label_dictionnary = {index: label for index, label in enumerate(label_encoder.classes_)} +print("Labels dictionnary : ") +print(label_dictionnary) +print("") + +# Put the first column as rownames +new_matrix = new_matrix.set_index(new_matrix.columns[0]) +new_matrix.index.name = None + +#------------------------------------- +# Convert the matrix to a numpy array +#------------------------------------- +new_matrix = np.array(new_matrix) + +# X : the matrix, y : the labels (numeric values) +X = new_matrix[:,:-1] +y = new_matrix[:,-1] + +# ------------------------------------- +# Use KNNOR +# ------------------------------------- +knnor = data_augment.KNNOR() + +# X_new : complete data with augmented datapoints +# y_new : Labels including the augmented ones +# X_aug_min : Just the augmented minority points +# y_aug_min : Labels for only augmented minority points +X_new, y_new, X_aug_min, y_aug_min = knnor.fit_resample(X, y) +y_new = y_new.reshape(-1,1) + +# new_data contains the augmented data (Matrix + numeric labels ) +new_data = np.append(X_new, y_new, axis=1) + +# Transform value numeric back to the labels name +y_aug_min = np.array([label_dictionnary[label] for label in y_aug_min]) + +# Look at label augmentation +print("Labels augmentation : ") +unique_values, counts = np.unique(y_aug_min, return_counts=True) +for value, count in zip(unique_values, counts): + print(f"{value} is repeated {count} times.") +print("") + +# ------------------------------------- +# Create final metadata +# ------------------------------------- +# Create a dictionary to store new data +metadata_knnor = { + CELLS_COLUMN: ["KNNOR" + str(i) for i in range(1, len(y_aug_min) + 1)], + CLASS_LABEL: list(y_aug_min) +} + +# Create a DataFrame from the dictionary +metadata_knnor = pd.DataFrame(metadata_knnor) +print("The knnor metadata only with augmented points :") +print(metadata_knnor) +print("") + +# Add the new DataFrame to the metadata +metadata = pd.concat([metadata, metadata_knnor], ignore_index=True) + +# Replace all NaN values with -100 +metadata.fillna(-100, inplace = True) +print("The metadata with augmented values :") +print(metadata) +print("") + +# Occurence of labels with new data created +max_labels = metadata[CLASS_LABEL].value_counts() +print("Occurence of labels with new data created: ") +print(max_labels) +print("") + +# Write the new metadata with augmented values +metadata.to_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, "KNNOR_" + METADATA + ".csv"), index = False) + +# ------------------------------------- +# Create final matrix +# ------------------------------------- +# Recover new cell names created for new data +cell_names = list(metadata_knnor[CELLS_COLUMN]) + +# Add a new column (first column) to the numpy array (for the concat step at the end) +num_rows = X_aug_min.shape[0] +empty_column = np.zeros((num_rows, 1), dtype=int) +X_aug_min = np.hstack((empty_column, X_aug_min)) + +# Transform the numpy matrix with augmented data to a DataFrame +X_aug_min = pd.DataFrame(X_aug_min, columns = matrix.columns) + +# Add the cell names to the first column +X_aug_min.iloc[:, 0] = cell_names +print("The knnor matrix only with augmented points :") +print(X_aug_min) +print("") + +# Add the augmented data matrix to our matrix +matrix = pd.concat([matrix, X_aug_min], ignore_index=True) +print("The matrix with augmented values :") +print(matrix) + +# Write the new matrix with augmented data +matrix.to_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, "KNNOR_" + MATRIX + ".csv"), index = False) \ No newline at end of file diff --git a/03_Script/labelTransfert_seurat.R b/03_Script/labelTransfert_seurat.R new file mode 100644 index 0000000..a5a777c --- /dev/null +++ b/03_Script/labelTransfert_seurat.R @@ -0,0 +1,536 @@ +#======================================================================== +# Label transfert seurat +#======================================================================== + +#===================================== +# Library +#===================================== +library(Seurat) +library(ggplot2) +library(dplyr) +library(data.table) +library(DoubletFinder) + +options(future.globals.maxSize = 3000 * 1024^2) + +#===================================== +# Path to file and folder +#===================================== +DIRECTORY = getwd() +OUTPUTDIR = file.path((DIRECTORY), "05_Output") +REF = file.path((DIRECTORY), "01_Reference") + +SAMPLE_ID = "P30_WT" + +ALLEN = TRUE + +STEP2 = "02_seurat/" + +# #======================================================================== +# # Load the data an the reference +# #======================================================================== + +## Our data +so <- readRDS(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_filtered_seurat_object.rds"))) + +if(ALLEN == TRUE){ + ref <- readRDS(file = file.path(REF, "/allen_seurat_object_with_metadata.rds")) + ref <- PercentageFeatureSet(ref, pattern = "^mt-", col.name = "percent.mt") + ref <- SCTransform(ref, vars.to.regress = "percent.mt", verbose = FALSE, future.seed = FALSE) + +} else { + ## The arlotta reference + ref <- readRDS(file = file.path(REF, "/modify_arlotta_seurat_object.rds")) + + # Keep only some cells in the reference + pattern <- "E14" + metadata <- ref@meta.data + filtered_metadata <- metadata[grepl(pattern, metadata$biosample_id), ] + cells_to_keep <- rownames(filtered_metadata) + ref <- subset(ref, cells = cells_to_keep) + ref <- PercentageFeatureSet(ref, pattern = "^mt-", col.name = "percent.mt") + ref <- SCTransform(ref, vars.to.regress = "percent.mt", verbose = FALSE) +} + +#======================================================================== +# Doublet Finder +#======================================================================== + +pct <- so[["pca"]]@stdev / sum(so[["pca"]]@stdev) * 100 +cumu <- cumsum(pct) +co1 <- which(cumu > 90 & pct < 5)[1] +co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1 +pcs <- min(co1, co2) + +sweep.res <- paramSweep(so, PCs = 1:pcs, sct = TRUE) +sweep.stats <- summarizeSweep(sweep.res, GT = FALSE) +# bcmvn <- find.pK(sweep.stats) +homotypic.prop <- modelHomotypic(so$seurat_clusters) +nExp_poi <- round(0.075 * ncol(so)) +nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop)) +so <- doubletFinder(so, PCs = 1:pcs, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = FALSE, sct = TRUE) + +metadata_columns <- colnames(so@meta.data) +pattern <- "DF.classifications_0.25_0.09_\\d+" # Adjust the regex as needed +matched_column <- metadata_columns[grepl(pattern, metadata_columns)] +num_doublets <- sub("DF\\.classifications_0\\.25_0\\.09_(\\d+)", "\\1", matched_column) +print(num_doublets) +full_column_name <- paste0("DF.classifications_0.25_0.09_", num_doublets) +cells_to_keep <- rownames(so@meta.data[so@meta.data[[full_column_name]] == "Singlet", ]) +so <- subset(so, cells = cells_to_keep) + +#======================================================================== +# Annotation +#======================================================================== + +so$celltype_label <- NA +all_genes <- intersect(rownames(ref), rownames(so)) +anchors <- FindTransferAnchors(reference = ref, query = so, dims = 1:30, normalization.method = "SCT", features = all_genes) +prediction <- TransferData(anchorset = anchors, refdata = as.vector(ref$celltype_label)) +so <- AddMetaData(so, metadata = prediction) +so$celltype_label <- so$predicted.id + +pdf(file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_annotation_before_filtering_UMAP.pdf")), width = 8, height = 8) +plot <- DimPlot(so, group.by = "celltype_label", label.size = 6) + theme(legend.position = "bottom", legend.text = element_text(size = 4)) +print(plot) + +# UMAP plot per label +labels <- unique(so$celltype_label) +for (label in labels) { + cells_to_highlight <- rownames(so@meta.data[so@meta.data$celltype_label == label, ]) + plots <- DimPlot(so, group.by = "celltype_label", cells.highlight = cells_to_highlight) + + ggtitle(label) + + theme(legend.position = "bottom", legend.text = element_text(size = 4)) + print(plots) +} +dev.off() + +#======================================================================== +# Filtering bad predicted cells : KMeans +#======================================================================== + +# set.seed(42) # For reproducibility + +# metadata <- so@meta.data +# metadata$cell_id <- rownames(metadata) +# unique_labels <- unique(metadata$celltype_label) +# thresholds <- data.frame(celltype_label = unique_labels, cluster_threshold = NA) + +# for (label in unique_labels) { +# subset_data <- metadata[metadata$celltype_label == label, ] +# print(label) +# print(nrow(subset_data)) +# # Check if there are at least 2 cells for k-means clustering +# if (nrow(subset_data) <= 2) { +# # If fewer than 2 cells, set the threshold to 0 (no filtering) +# cluster_threshold <- 0 +# } else { +# # Perform k-means clustering +# kmeans_result <- kmeans(subset_data$prediction.score.max, centers = 2) +# cluster_centers <- kmeans_result$centers +# high_confidence_cluster <- which.max(cluster_centers) +# cluster_threshold <- min(subset_data$prediction.score.max[kmeans_result$cluster == high_confidence_cluster]) +# } + +# thresholds[thresholds$celltype_label == label, "cluster_threshold"] <- cluster_threshold +# } + +# # Merge the thresholds with the metadata +# metadata <- merge(metadata, thresholds, by = "celltype_label") + +# # Apply the filtering based on the prediction score and cluster threshold +# metadata <- metadata[metadata$prediction.score.max >= metadata$cluster_threshold, ] + +# saveRDS(so, file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "annotate_seurat_objet_before_filtering.rds"))) + +# so_clean <- subset(so, cells = metadata$cell_id) + + +# pdf(file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_annotation_after_filtering_UMAP.pdf")), width = 8, height = 8) +# plot <- DimPlot(so_clean, group.by = "celltype_label", label.size = 6) + theme(legend.position = "bottom", legend.text = element_text(size = 4)) +# print(plot) + +# # UMAP plot per label +# labels <- unique(so_clean$celltype_label) +# for (label in labels) { +# cells_to_highlight <- rownames(so_clean@meta.data[so_clean@meta.data$celltype_label == label, ]) +# plots <- DimPlot(so_clean, group.by = "celltype_label", cells.highlight = cells_to_highlight) + +# ggtitle(label) + +# theme(legend.position = "bottom", legend.text = element_text(size = 4)) +# print(plots) +# } +# dev.off() + +# saveRDS(so_clean, file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "annotate_seurat_objet_after_filtering.rds"))) + +#======================================================================== +# Filtering bad predicted cells : Remi's +#======================================================================== + +# 1. Get all the prediction score column names, excluding "prediction.score.max" +prediction_columns <- grep("^prediction\\.score\\.(?!max$)", colnames(so@meta.data), value = TRUE, perl = TRUE) + +# 2. Create a new column "filter_celltype_label" to store the filtered labels +so@meta.data$filter_celltype_label <- "Nothing" # Initialize with NA + +# 3. Loop through each row and perform the filtering based on the criteria +for (i in 1:nrow(so@meta.data)) { + # Extract the prediction scores for the current cell as a numeric vector + scores <- as.numeric(so@meta.data[i, prediction_columns]) + + # Sort the scores in descending order and extract the top two + sorted_scores <- sort(scores, decreasing = TRUE) + best_score <- sorted_scores[1] + second_best_score <- sorted_scores[2] + + # Get the labels corresponding to the top two scores + best_label <- prediction_columns[which.max(scores)] # Find the label for the best score + second_best_label <- prediction_columns[which(scores == second_best_score)[1]] # Find the label for the second-best score + + # 4. Check if the difference between the best and second-best score is greater than 20% + if ((best_score - second_best_score) > 0.20) { + # Keep the label corresponding to the best score (remove "prediction.score." prefix) + so@meta.data$filter_celltype_label[i] <- gsub("prediction\\.score\\.", "", best_label) + } else { + # Otherwise, set the label as NA + so@meta.data$filter_celltype_label[i] <- "Nothing" + } +} + +saveRDS(so, file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "annotate_seurat_objet_before_filtering.rds"))) + +so_clean <- subset(so, subset = filter_celltype_label != "Nothing") + +pdf(file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_annotation_after_filtering_UMAP.pdf")), width = 8, height = 8) +plot <- DimPlot(so_clean, group.by = "celltype_label", label.size = 6) + theme(legend.position = "bottom", legend.text = element_text(size = 4)) +print(plot) + +# UMAP plot per label +labels <- unique(so_clean$celltype_label) +for (label in labels) { + cells_to_highlight <- rownames(so_clean@meta.data[so_clean@meta.data$celltype_label == label, ]) + plots <- DimPlot(so_clean, group.by = "celltype_label", cells.highlight = cells_to_highlight) + + ggtitle(label) + + theme(legend.position = "bottom", legend.text = element_text(size = 4)) + print(plots) +} +dev.off() + +saveRDS(so_clean, file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "annotate_seurat_objet_after_filtering.rds"))) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# #======================================================================== +# # Create functions +# #======================================================================== + +# # Function for performing label transfer with optional per-class processing +# perform_label_transfer <- function(so, ref, actual_class = "actual_class", target_label = "target_label", skip_loop = FALSE, output_dir = "default/path", pdf_file = "label_transfer_plot.pdf") { + +# # Initialize or clear the target_label column +# so[[target_label]] <- NA + +# class_labels <- as.character(so[[actual_class]][, actual_class]) + +# # Ensure the output directory exists +# if (!dir.exists(output_dir)) { +# dir.create(output_dir, recursive = TRUE) +# } + +# # Create the full path for the PDF file +# pdf_file <- file.path(output_dir, pdf_file) + +# if (!is.null(pdf_file)) { +# pdf(pdf_file, width = 8, height = 6) # Open PDF if specified +# } + +# if (!skip_loop) { +# unique_classes <- unique(class_labels) + +# for (class in unique_classes) { +# print(class) + +# # Set idents +# Idents(so) <- actual_class +# Idents(ref) <- actual_class + +# # Subset per class +# subset_querry_data <- subset(so, idents = class) +# subset_ref_data <- subset(ref, idents = class) + +# # Keep the same genes +# all_genes <- intersect(rownames(subset_ref_data), rownames(subset_querry_data)) + +# # Directly assign actual_class to target_label +# so[[target_label]][so[[actual_class]] == class] <- class + +# # Find anchors and Transfer Data with error handling +# tryCatch({ +# anchors <- FindTransferAnchors(reference = subset_ref_data, query = subset_querry_data, dims = 1:30, normalization.method = "SCT", features = all_genes) +# prediction <- TransferData(anchorset = anchors, refdata = as.vector(subset_ref_data[[target_label]])) + +# # Append target_label to each prediction column name +# colnames(prediction) <- paste0(colnames(prediction), ".", target_label) +# subset_querry_data <- AddMetaData(subset_querry_data, metadata = prediction) +# new_metadata <- subset_querry_data@meta.data[, grepl(paste0(".", target_label), colnames(subset_querry_data@meta.data))] +# so@meta.data[rownames(new_metadata), colnames(new_metadata)] <- new_metadata + +# # Assign the predicted labels +# so[[target_label]] <- subset_querry_data[[paste0("predicted.id.", target_label)]] + +# }, error = function(e) { +# warning(paste("Error for class:", class, "- Keeping actual_class in target_label")) +# }) +# } +# } else { +# # Skip looping, transfer all cells at once +# all_genes <- intersect(rownames(ref), rownames(so)) + +# # Perform label transfer without looping +# anchors <- FindTransferAnchors(reference = ref, query = so, dims = 1:30, normalization.method = "SCT", features = all_genes) +# prediction <- TransferData(anchorset = anchors, refdata = as.vector(ref[[target_label]])) + +# # Append target_label to the prediction columns +# colnames(prediction) <- paste0(colnames(prediction), ".", target_label) + +# # Add prediction metadata to the Seurat object +# so <- AddMetaData(so, metadata = prediction) + +# # Assign the predicted labels +# so[[target_label]] <- so[[paste0("predicted.id.", target_label)]] +# } + +# # UMAP plot +# plot <- DimPlot(so, group.by = target_label, label.size = 6) + +# theme(legend.position = "bottom", legend.text = element_text(size = 4)) + +# print(plot) # Print the main plot + +# # UMAP plot per label +# labels <- unique(so@meta.data[[target_label]]) +# for (label in labels) { +# cells_to_highlight <- rownames(so@meta.data[so@meta.data[[target_label]] == label, ]) +# plots <- DimPlot(so, group.by = target_label, cells.highlight = cells_to_highlight) + +# ggtitle(label) + +# theme(legend.position = "bottom", legend.text = element_text(size = 4)) +# print(plots) +# } + +# if (!is.null(pdf_file)) { +# dev.off() # Close the PDF device +# } + +# return(so) +# } + +# # Function to calculate thresholds based on prediction scores using k-means clustering +# # Function to calculate thresholds based on prediction scores using k-means clustering +# calculate_thresholds <- function(metadata, celltype_label, prediction_column) { +# unique_labels <- unique(metadata[[celltype_label]]) +# thresholds <- data.frame(celltype_label = unique_labels, cluster_threshold = NA, stringsAsFactors = FALSE) + +# for (label in unique_labels) { +# subset_data <- metadata[metadata[[celltype_label]] == label & !is.na(metadata[[prediction_column]]), ] + +# # Try-catch block to handle cases with insufficient data or k-means failures +# tryCatch({ +# if (nrow(subset_data) > 1) { +# # Perform k-means clustering with 2 centers +# kmeans_result <- kmeans(subset_data[[prediction_column]], centers = 2) +# cluster_centers <- kmeans_result$centers +# high_confidence_cluster <- which.max(cluster_centers) + +# # Calculate the cluster threshold +# cluster_threshold <- min(subset_data[[prediction_column]][kmeans_result$cluster == high_confidence_cluster], na.rm = TRUE) +# thresholds[thresholds$celltype_label == label, "cluster_threshold"] <- cluster_threshold +# } else { +# # If there are not enough rows to cluster +# stop("Not enough data for k-means clustering") +# } +# }, error = function(e) { +# # Handle error: retain original label (no filtering) +# message(paste("Error for label:", label, "-", e$message, "- Retaining original cells (no filtering).")) +# thresholds[thresholds$celltype_label == label, "cluster_threshold"] <- -Inf # No filtering for this label +# }) +# } + +# return(thresholds) +# } + +# # Function to filter cells by threshold +# filter_cells_by_threshold <- function(metadata, prediction_column, thresholds, celltype_label) { +# # Merge thresholds back to metadata +# metadata <- merge(metadata, thresholds, by = celltype_label, by.y = "celltype_label", all.x = TRUE) + +# # Check if the merge was successful +# if (nrow(metadata) == 0) { +# stop("Merging thresholds into metadata resulted in zero rows. Check column names and values.") +# } + +# # Filter cells based on prediction scores and thresholds +# filtered_metadata <- metadata[ +# is.na(metadata[[prediction_column]]) | +# (metadata[[prediction_column]] >= metadata$cluster_threshold), +# ] + +# return(filtered_metadata) +# } + +# # Function to create UMAP plots, save them as a PDF, and return filtered results +# create_filtered_umap <- function(so, celltype_label, output_dir = "default/path", file_name = "default_plot.pdf") { +# set.seed(42) # For reproducibility + +# # Ensure the output directory exists +# if (!dir.exists(output_dir)) { +# dir.create(output_dir, recursive = TRUE) +# } + +# # Get metadata and create cell ID column +# metadata <- so@meta.data +# metadata$cell_id <- rownames(metadata) + +# # Dynamically construct the column name for prediction scores +# prediction_column <- paste0("prediction.score.max.", celltype_label) + +# # Calculate thresholds +# thresholds <- calculate_thresholds(metadata, celltype_label, prediction_column) +# print(thresholds) # Log the computed thresholds +# write.csv(thresholds, file = file.path(output_dir, paste0(celltype_label, "_threshold.csv")), row.names = FALSE) + +# # Filter cells based on thresholds +# filtered_metadata <- filter_cells_by_threshold(metadata, prediction_column, thresholds, celltype_label) + +# # Subset the original Seurat object using the filtered metadata +# so <- subset(so, cells = filtered_metadata$cell_id) + +# # Create a full path for the PDF file +# pdf_file <- file.path(output_dir, file_name) + +# # Call the plotting function with the filtered Seurat object and cell type label +# plot_umap(so, celltype_label, pdf_file) # Save the plots directly to the specified PDF + +# # Return the updated Seurat object and thresholds used +# return(list(filtered_object = so, thresholds = thresholds)) +# } + +# # Function to plot UMAP with optional PDF saving +# plot_umap <- function(so, celltype_label, pdf_file = NULL) { +# if (!is.null(pdf_file)) { +# pdf(pdf_file, width = 8, height = 6) # Open a PDF device if specified +# } + +# # Main UMAP plot +# plot <- DimPlot(so, group.by = celltype_label, label.size = 6) + +# theme(legend.position = "bottom", legend.text = element_text(size = 4)) + +# print(plot) # Print the main plot + +# # Generate individual UMAP plots for each label +# labels <- unique(so@meta.data[[celltype_label]]) +# for (label in labels) { +# cells_to_highlight <- rownames(so@meta.data[so@meta.data[[celltype_label]] == label, ]) +# plots <- DimPlot(so, group.by = celltype_label, cells.highlight = cells_to_highlight) + +# ggtitle(label) + +# theme(legend.position = "bottom", legend.text = element_text(size = 4)) + +# print(plots) # Print each individual plot +# } + +# if (!is.null(pdf_file)) { +# dev.off() # Close the PDF device if it was opened +# } +# } + +# #======================================================================== +# # Load the data an the reference +# #======================================================================== + +# ## Our data +# so <- readRDS(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "_filtered_seurat_object.rds"))) + +# ## The reference +# ref <- readRDS(file = file.path(REF, "/modify_arlotta_seurat_object.rds")) + +# # Keep only some cells in the reference +# pattern <- "P4" +# metadata <- ref@meta.data +# filtered_metadata <- metadata[grepl(pattern, metadata$biosample_id), ] +# cells_to_keep <- rownames(filtered_metadata) +# ref <- subset(ref, cells = cells_to_keep) +# ref <- PercentageFeatureSet(ref, pattern = "^mt-", col.name = "percent.mt") +# ref <- SCTransform(ref, vars.to.regress = "percent.mt", verbose = FALSE) + +# #======================================================================== +# # Doublet Finder +# #======================================================================== + +# pct <- so[["pca"]]@stdev / sum(so[["pca"]]@stdev) * 100 +# cumu <- cumsum(pct) +# co1 <- which(cumu > 90 & pct < 5)[1] +# co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1 +# pcs <- min(co1, co2) + +# sweep.res <- paramSweep(so, PCs = 1:pcs, sct = TRUE) +# sweep.stats <- summarizeSweep(sweep.res, GT = FALSE) +# # bcmvn <- find.pK(sweep.stats) +# homotypic.prop <- modelHomotypic(so$seurat_clusters) +# nExp_poi <- round(0.075 * ncol(so)) +# nExp_poi.adj <- round(nExp_poi * (1 - homotypic.prop)) +# so <- doubletFinder(so, PCs = 1:pcs, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = FALSE, sct = TRUE) + +# metadata_columns <- colnames(so@meta.data) +# pattern <- "DF.classifications_0.25_0.09_\\d+" # Adjust the regex as needed +# matched_column <- metadata_columns[grepl(pattern, metadata_columns)] +# num_doublets <- sub("DF\\.classifications_0\\.25_0\\.09_(\\d+)", "\\1", matched_column) +# print(num_doublets) +# full_column_name <- paste0("DF.classifications_0.25_0.09_", num_doublets) +# cells_to_keep <- rownames(so@meta.data[so@meta.data[[full_column_name]] == "Singlet", ]) +# so <- subset(so, cells = cells_to_keep) + +# #======================================================================== +# # Step 1 : annotate with broad class +# #======================================================================== +# so <- perform_label_transfer(so, ref, actual_class = "class_label", target_label = "class_label", skip_loop = TRUE, output_dir = file.path(OUTPUTDIR, STEP2, SAMPLE_ID), pdf_file = "class_lablel_before_filtering.pdf") +# filtered_results <- create_filtered_umap(so, celltype_label = "class_label", output_dir = file.path(OUTPUTDIR, STEP2, SAMPLE_ID), file_name = "class_lablel_after_filtering.pdf") + +# #======================================================================== +# # Step 2 : annotate with subclass +# #======================================================================== +# so <- perform_label_transfer(so, ref, actual_class = "class_label", target_label = "subclass_label", skip_loop = FALSE, output_dir = file.path(OUTPUTDIR, STEP2, SAMPLE_ID), pdf_file = "subclass_label_before_filtering.pdf") +# filtered_results <- create_filtered_umap(so, celltype_label = "subclass_label", output_dir = file.path(OUTPUTDIR, STEP2, SAMPLE_ID), file_name = "subclass_label_after_filtering.pdf") + +# #======================================================================== +# # Step 3 : annotate with celltype +# #======================================================================== +# so <- perform_label_transfer(so, ref, actual_class = "subclass_label", target_label = "celltype_label", skip_loop = FALSE, output_dir = file.path(OUTPUTDIR, STEP2, SAMPLE_ID), pdf_file = "celltype_label_before_filtering.pdf") +# filtered_results <- create_filtered_umap(so, celltype_label = "celltype_label", output_dir = file.path(OUTPUTDIR, STEP2, SAMPLE_ID), file_name = "celltype_label_after_filtering.pdf") + +# saveRDS(so, file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID, "annotate_seurat_objet.rds"))) diff --git a/03_Script/remi.R b/03_Script/remi.R new file mode 100644 index 0000000..399d894 --- /dev/null +++ b/03_Script/remi.R @@ -0,0 +1,257 @@ +#======================================================================================= +# Create the arlotta reference with Remi's work +#======================================================================================= + +#------------------------------------- +# Library +#------------------------------------- +library(Seurat) +library(ggplot2) +library(dplyr) +library(data.table) + +#------------------------------------- +# Path to files / folders +#------------------------------------- +DIRECTORY = getwd() +OUTPUTDIR = file.path((DIRECTORY), "05_Output") +REF = file.path((DIRECTORY), "01_Reference") + +#==================================== +# Open the arlotta metadata : +# https://singlecell.broadinstitute.org/single_cell/study/SCP1290/molecular-logic-of-cellular-diversification-in-the-mammalian-cerebral-cortex +#==================================== +arlotta_metadata <- fread(file = file.path(REF, "arlotta/metaData_scDevSC.txt")) + +### Remove some low quality cells an keep only NN, Migrating/Immature and Dorsal Pallium Progenitor : +patterns_to_delete <- c("Interneurons", "Cajal Retzius cells", "UL CPN", "Layer 4", "DL CPN", "DL_CPN_1", "DL_CPN_2", "SCPN", "NP", "CThPN", "Layer 6b", "Doublet", "group", "Low quality cells", "Red blood cells") +rows_to_delete <- apply(sapply(patterns_to_delete, grepl, arlotta_metadata$New_cellType), 1, any) +arlotta_metadata <- subset(arlotta_metadata, !rows_to_delete) + +### Keep only some column +arlotta_metadata <- subset(arlotta_metadata, select = c("NAME", "New_cellType", "orig_ident")) # Cells, subclass (for NN), and the age of the mouse + +### Add the class that corespond to each New Celltype => Rémi's thesis +pattern_class <- c("Apical progenitors" = "Dorsal Pallium Progenitor", + "Astrocytes" = "Non Neuronal", + "Cycling glial cells" = "Non Neuronal", + "Endothelial cells" = "Non Neuronal", + "Ependymocytes" = "Non Neuronal", + "Immature neurons" = "Immature/Migrating", + "Intermediate progenitors" = "Dorsal Pallium Progenitor", + "Microglia" = "Non Neuronal", + "Migrating neurons" = "Immature/Migrating", + "Oligodendrocytes" = "Non Neuronal", + "Pericytes" = "Non Neuronal", + "VLMC" = "Non Neuronal") + +assign_class <- function(cell_type, patterns) { + match <- patterns[grep(cell_type, names(patterns))] + if (length(match) > 0) { + return(match) + } else { + return(NA) + } +} + +arlotta_metadata$class_label <- sapply(arlotta_metadata$New_cellType, assign_class, pattern_class) +arlotta_metadata$supertype_label <- arlotta_metadata$New_cellType +arlotta_metadata$celltype_label <- arlotta_metadata$New_cellType + +### Rename column for a futur merge +setnames(arlotta_metadata, "NAME", "sample_name") +setnames(arlotta_metadata, "New_cellType", "subclass_label") +setnames(arlotta_metadata, "orig_ident", "biosample_id") + +#==================================== +# Open Remi's metadata : reanotation +# of the arlotta reference +#==================================== +rds_remi <- readRDS(file.path(REF, "remi/GECortex_PostMsub_celltypefinal.rds" )) +rds_metadata_remi <- rds_remi@meta.data + +rm("rds_remi") +gc() + +### Looking for the arlotta annotation in the whole metadata +motif_arlotta <- "J.Di Bella et al., Nature 2021" +remi_arlotta_metadata <- subset(rds_metadata_remi, grepl(motif_arlotta, rds_metadata_remi[["study_label"]])) + +### Only keep some column +remi_arlotta_metadata <- subset(remi_arlotta_metadata, select = c("sample_name", "class_label", "biosample_id", "supertype_label", "subclass_label", "celltype_label")) + +### Merge the two metadata +modify_arlotta_metadata <- rbind(remi_arlotta_metadata, arlotta_metadata) + +### Create a new column for cell name, to only keep A,T,C,C (to match the matrix) + write it as csv +modify_arlotta_metadata <- as.data.table(modify_arlotta_metadata) +modify_arlotta_metadata <- modify_arlotta_metadata[, new_sample_name := gsub("[^ATCG]", "", sample_name)] +write.csv(modify_arlotta_metadata, file = file.path(REF, "modify_arlotta_metadata.csv"), row.names = FALSE) + +#==================================== +# Match our arlotta matrices to +# the new metadata +# https://singlecell.broadinstitute.org/single_cell/study/SCP1290/molecular-logic-of-cellular-diversification-in-the-mammalian-cerebral-cortex +#==================================== + +### Path of every h5 matrices in a list +list_files <- list( +E10 <- file.path(REF, "arlotta/GSM5277843_E10_v1_filtered_feature_bc_matrix.h5"), +E11 <- file.path(REF, "arlotta/GSM4635072_E11_5_filtered_gene_bc_matrices_h5.h5"), +E12 <- file.path(REF, "arlotta/GSM4635073_E12_5_filtered_gene_bc_matrices_h5.h5"), +E13 <- file.path(REF, "arlotta/GSM4635074_E13_5_filtered_gene_bc_matrices_h5.h5"), +E14 <- file.path(REF, "arlotta/GSM4635075_E14_5_filtered_gene_bc_matrices_h5.h5"), +E15 <- file.path(REF, "arlotta/GSM4635076_E15_5_S1_filtered_gene_bc_matrices_h5.h5"), +E16 <- file.path(REF, "arlotta/GSM4635077_E16_filtered_gene_bc_matrices_h5.h5"), +E17 <- file.path(REF, "arlotta/GSM5277844_E17_5_filtered_feature_bc_matrix.h5"), +E18_S1 <- file.path(REF, "arlotta/GSM4635078_E18_5_S1_filtered_gene_bc_matrices_h5.h5"), +E18_S3 <- file.path(REF, "arlotta/GSM4635079_E18_S3_filtered_gene_bc_matrices_h5.h5"), +P1 <- file.path(REF, "arlotta/GSM4635081_P1_S2_filtered_gene_bc_matrices_h5.h5"), +P1_S1 <- file.path(REF, "arlotta/GSM4635080_P1_S1_filtered_gene_bc_matrices_h5.h5"), +P4 <- file.path(REF, "arlotta/GSM5277845_P4_filtered_feature_bc_matrix.h5") +) + +list_files <- setNames(list_files, c("E10", "E11", "E12", "E13", "E14", "E15", "E16", "E17", "E18_S1", "E18_S3", "P1", "P1_S1", "P4")) + +list_reference_matrix <- list() + +### Read every h5 fie +for (x in names(list_files)){ + filepath <- list_files[[x]] + list_reference_matrix[[x]] <- Read10X_h5(filepath) +} + +### Make sure the cellnames are written in the same way as the metadata +clean_colnames <- function(x){ + current_colnames <- colnames(x) + modified_colnames <- gsub("[^ATCG]", "", current_colnames) + colnames(x) <- modified_colnames + return(x) +} +list_reference_matrix <- lapply(list_reference_matrix, clean_colnames) + +print("The diferent matrix for diferent developmental time have been loaded in a list") + +### Keep cells that match both the metadata and the matrix +metadata <- data.table() + +for (x in names(list_reference_matrix)) { + print("The developmental time :") + print(x) + if (x %in% modify_arlotta_metadata$biosample_id) { + rows_metadata <- modify_arlotta_metadata[modify_arlotta_metadata$biosample_id == x] # Look in the metadata and keep only the rows that corespond to the matrix (Keep P4 rows for the P4 matrix for exemple) + cell_metadata <- rows_metadata$new_sample_name # Cells name in the metadata (With just A,T,G,C) + cell_reference_matrix <- colnames(list_reference_matrix[[x]]) # Cells in the matrix + matching_cells <- intersect(cell_metadata, cell_reference_matrix) # Look at cells that match in metadata and matrix + + # Create a temporary metadata for each matrix with only the cells that match and remove the others + temporary_metadata <- subset(rows_metadata, new_sample_name %in% matching_cells) + temporary_metadata$new_sample_name <- paste(x, temporary_metadata$new_sample_name, sep="_") # Add the developmental age in front of cell name + + # Append the new metadata for each matrix to the final one + metadata <- rbind(metadata, temporary_metadata) + + # Now keep only the matching cells in the matrix and remove the others + matching_indices <- which(cell_reference_matrix %in% matching_cells) + list_reference_matrix[[x]] <- list_reference_matrix[[x]][, matching_indices, drop=FALSE] + colnames(list_reference_matrix[[x]]) <- paste(x, colnames(list_reference_matrix[[x]]), sep = "_") # Add the developmental age in front of cell name + } +} + +print("The final size of the metadata : ") +print(dim(metadata)) + +reference_matrix <- do.call(cbind, list_reference_matrix) +print("The size of the matrix of reference : ") +print(dim(reference_matrix)) + +#==================================== +# Create a Seurat Object + save it +#==================================== +rownames(metadata) <- metadata$new_sample_name +seurat_object <- CreateSeuratObject(counts = reference_matrix, meta.data = metadata) + +saveRDS(seurat_object, file = file.path(REF, "modify_arlotta_seurat_object.rds")) + + +################################################################################################ CREATE FILES TO HAVE ALL CLASS FOR SIMS ANNOTATION + +# #===================================== +# # Fichier celltype manipulation +# #===================================== +# # Open the RDS and write the structure as a .txt file : +# remi_celltypes <- readRDS(file.path(REF, "remi/GECortex_PostMsub_celltypefinal.rds" )) # ~ 16 GB +# sink(file.path(OUTPUTDIR, STEP_REMI, "remi_celltypefinal_structure.txt" )) +# str(remi_celltypes) +# sink() + +# # Write the metadata as a csv file : +# remi_celltypes_metadata <- remi_celltypes@meta.data +# write.csv(remi_celltypes_metadata, file.path(OUTPUTDIR, STEP_REMI, "remi_celltypefinal_metadata.csv"), row.names = TRUE) + +# # write metadata for both Arlotta and Allen reference : +# motif_arlotta <- "J.Di Bella et al., Nature 2021" +# remi_arlotta_metadata <- subset(remi_celltypes_metadata, grepl(motif_arlotta, remi_celltypes_metadata[["study_label"]])) +# write.csv(remi_arlotta_metadata, file.path(OUTPUTDIR, STEP_REMI, "remi_arlotta_metadata.csv"), row.names = TRUE) + +# # motif_allen <- "Yao et al., Cell 2021" +# remi_allen_metadata <- subset(remi_celltypes_metadata, grepl(motif_allen, remi_celltypes_metadata[["study_label"]])) +# remi_allen_metadata <- remi_allen_metadata[remi_allen_metadata$platform_label %in% "10X", ] # I only use this matrix in my analysis +# write.csv(remi_allen_metadata, file.path(OUTPUTDIR, STEP_REMI, "remi_allen_metadata.csv"), row.names = TRUE) + +# #===================================== +# # ARLOTTA +# #===================================== +# # Open the metadata +# arlotta_metadata <- fread(file = file.path(REF, "arlotta/metaData_scDevSC.txt")) + +# # Remove every cells that are not Non Neuronal +# strings_to_remove <- c("Interneurons", "Cajal Retzius cells", "UL CPN", "Layer 4", "DL CPN", "DL_CPN_1", "DL_CPN_2", "SCPN", "NP", "CThPN", "Layer 6b", "Doublet", "group", "Low quality cells", "Red blood cells") +# for (pattern in strings_to_remove) { +# rows_to_remove <- grep(pattern, arlotta_metadata$New_cellType) +# arlotta_metadata <- arlotta_metadata[-rows_to_remove, ] +# } + +# # Keep only some column in both files for a futur merge +# arlotta_metadata <- subset(arlotta_metadata, select = c("NAME", "New_cellType", "orig_ident")) # Cells, subclass (for NN), and the age of the mouse +# remi_arlotta_metadata <- subset(remi_arlotta_metadata, select = c("sample_name", "supertype_label", "biosample_id"))# Cells, celltype (for GABAergic and Glutamatergic), and the age of the mouse + +# # Give the two data table the same column name for the merge +# setnames(arlotta_metadata, "NAME", "sample_name") +# setnames(arlotta_metadata, "New_cellType", "supertype_label") +# setnames(arlotta_metadata, "orig_ident", "biosample_id") + +# # Merge the two files +# final_arlotta <- rbind(remi_arlotta_metadata, arlotta_metadata) +# write.csv(final_arlotta, file.path(OUTPUTDIR, STEP_REMI, "supertype_final_arlotta.csv"), row.names = FALSE) + +# #===================================== +# # ALLEN +# #===================================== +# # Open the metadata +# allen_metadata <- fread(file = file.path(REF, "allen/metadata.csv")) + +# # Keep only cells that are Non Neuronal +# strings_to_keep <- c("Astro", "Endo", "Micro-PVM", "Oligo", "SMC-Peri", "VLMC") +# allen_metadata <- allen_metadata[allen_metadata$subclass_label %in% strings_to_keep] + +# # Keep only some column in both files for a futur merge +# allen_metadata <- subset(allen_metadata, select = c("sample_name", "subclass_label"))# Cells, subclass (for NN) +# remi_allen_metadata <- subset(remi_allen_metadata, select = c("exp_component_name", "celltype_label")) + +# # Give the two data table the same column name for the merge +# setnames(remi_allen_metadata, "exp_component_name", "sample_name") +# setnames(allen_metadata, "subclass_label", "celltype_label") + +# # Merge the two files +# final_allen <- rbind(remi_allen_metadata, allen_metadata) +# write.csv(final_allen, file.path(OUTPUTDIR, STEP_REMI, "final_allen.csv"), row.names = FALSE) + +################################################################################################ CREATE FILES TO HAVE REMI'S ANNOTATION + +# remi <- readRDS(file.path(REF, "remi/GECortex_PostMsub_celltypefinal.rds" )) # ~ 16 GB +# cell_names <- grep("^P30_wt_", colnames(remi), value = TRUE) +# remi <- subset(remi, cells = cell_names) +# saveRDS(remi, file = file.path(OUTPUTDIR, STEP_REMI, "P30_WT_remi.rds")) + +################################################################################################ LABEL TRANSFERT FROM THE REFERENCE : ARLOTTA AND ALLEN \ No newline at end of file diff --git a/03_Script/seurat.Rmd b/03_Script/seurat.Rmd index aaa4f41..697b8ec 100644 --- a/03_Script/seurat.Rmd +++ b/03_Script/seurat.Rmd @@ -1,8 +1,8 @@ --- -title: "Single-cell RNAseq analysis of gastruloids" +title: "Single-cell RNAseq analysis of the somatosensory cortex" subtitle: "Seurat | QC and Clustering" author: -- Thomas Vannier +- Léa Chabot date: "`r format(Sys.time(), '%d %B %Y')`" output: html_document: @@ -45,15 +45,15 @@ library( htmltools) [//]: # "Load the various chunks in the various files" ```{r readChunks, echo=FALSE, warning=FALSE, message=FALSE} read_chunk( path=file.path( SCRIPTDIR, "01_prepare_data.R")); -read_chunk( path=file.path( SCRIPTDIR, "02_variables_genes.R")); read_chunk( path=file.path( SCRIPTDIR, "03_cell_heterogeneity_PCA.R")); read_chunk( path=file.path( SCRIPTDIR, "04_cell_heterogeneity_tSNE_UMAP.R")); read_chunk( path=file.path( SCRIPTDIR, "05_cell_heterogeneity_clustering.R")); +read_chunk( path=file.path( SCRIPTDIR, "06_markers.R")); ``` This report present the analyse of dataset generated with 10x Genomics Single Cell Protocol. -# Primary analysis : QC and selecting cells for further analysis of the sample `r snakemake@params[["sample_id"]]` +# Primary analysis : QC and selecting cells for further analysis of the sample `r SAMPLE_ID` ```{r loadData, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE} <> @@ -82,43 +82,17 @@ We visualize QC metrics, and use these to filter cells. ```{r filterData_filterObject, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE} <> ``` - ## Normalizing the data -After removing unwanted cells from the dataset we normalize the feature expression measurements for each cell (and correct the batch effect using the Seurat integration procedure if necessary) with `r snakemake@params[["norm_method"]]` -We apply a linear transformation (‘scaling’) that is a standard pre-processing step prior to dimensional reduction techniques like PCA. If SCTransform is performed, it's not necessary to scale the data after it. +After removing unwanted cells from the dataset we use the SCTransform function in Seurat that performs normalization and variance stabilization of molecular count data from scRNA-seq experiments. -```{r normalizeData, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE} -<> +```{r SCTransform, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE} +<> ```

-## Identification of highly variable genes - -Subset of features that exhibit high cell-to-cell variation in the dataset - -```{r findVariableGenes_seuratMethod, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE, warning = FALSE} -<> -``` - -Number of variable genes found: `r variablesGenesStats` - -```{r findVariableGenes_summaryPlot, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE, warning = FALSE} -<> -``` - -
- -Top `r VARIABLE_FEATURES_SHOWTOP` most variable annotations: - -```{r findVariableGenes_summaryTable, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE, warning = FALSE} -<> -``` - -

- -## PCA results +# PCA results Visualizing both cells and features that define the PCA @@ -138,40 +112,12 @@ A ranking of principle components based on the percentage of variance explained <> ``` -Number of dimensions to use for PCA : `r snakemake@params[["pca_npc"]]`` - -PCA projections for dimensions `r paste( 1:PCA_PLOTS_NBDIMS, collapse="/")` (colored by batch) +Number of dimensions to use for PCA : `r pcs` ```{r heterogeneity_pca, echo = FALSE, out.width='33%', results = "asis", message = FALSE, warning = FALSE} <> ``` -
- -UMIs counts on PCA dimensions `r paste( 1:PCA_PLOTS_NBDIMS, collapse="/")` -```{r heterogeneity_pca_umisCounts, echo = FALSE, out.width='33%', results = "asis", message = FALSE, warning = FALSE} -<> -``` - -
- -Genes counts on PCA dimensions `r paste( 1:PCA_PLOTS_NBDIMS, collapse="/")` -```{r heterogeneity_pca_genesCounts, echo = FALSE, out.width='33%', results = "asis", message = FALSE, warning = FALSE} -<> -``` - -
- -Correlation (Spearman) of PCA dimensions `r paste( 1:PCA_PLOTS_NBDIMS, collapse="/")` with UMIs and genes counts -```{r heterogeneity_pca_correlations, echo = FALSE, fig.dim = c(9,6), results = "asis", message = FALSE, warning = FALSE} -<> -``` -
- -PCA loadings of top `r PCA_PLOTS_NBFEATURES` features for dimensions `r paste( 1:PCA_PLOTS_NBDIMS, collapse="/")` -```{r heterogeneity_pca_loadings, echo = FALSE, fig.dim = c(4, 4), out.width='33%', results = "asis", message = FALSE, warning = FALSE} -<> -```

## Dimensionality reduction @@ -198,7 +144,6 @@ rm("useReduction");

- ## Clusters identification We use a graph-based clustering approach : 1. Construct a KNN graph based on the euclidian distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). 2. Modularity optimization techniques such as the Louvain algorithm or SLM, to iteratively group cells together, with the goal of optimizing the standard modularity function. @@ -228,11 +173,14 @@ rm("useReduction"); useReduction='tsne' <> rm("useReduction"); +``` -### +### {-}

```{r save_RDS, echo = FALSE, fig.height = 10, fig.width = 10, out.height = 600, out.width = 600, results = "asis", message = FALSE, warning = FALSE} <> ``` + + diff --git a/03_Script/seurat_reports_compilation.R b/03_Script/seurat_reports_compilation.R index 0ab3548..e928ea5 100644 --- a/03_Script/seurat_reports_compilation.R +++ b/03_Script/seurat_reports_compilation.R @@ -16,6 +16,8 @@ SEURAT_OUTPUT <- snakemake@output[["seurat_output"]] SC_DATA_PATH_LIST <- snakemake@params[["sc_data"]] REPORT_LIST <- snakemake@output[["seurat_report"]] SAMPLE_ID_LIST <- snakemake@params[["sample_id"]] +MULTIPLEX_LIST <- snakemake@params[["multiplex"]] +HTO_LIST <- snakemake@params[["hto"]] PLOT_RASTER_NBCELLS_THRESHOLD <- snakemake@params[["plot_raster_nbcells_threshold"]] CELL_RANGER_COUNT_PATH <- snakemake@params[["cell_ranger_count_path"]] RUN_DEMULTIPLEX <- snakemake@params[["run_demultiplex"]] @@ -74,6 +76,8 @@ while(i <= len) FILTER_PERCENT_MT <- FILTER_PERCENT_MT_LIST[i] FILTER_PERCENT_MT_MIN <- FILTER_PERCENT_MT_MIN_LIST[i] FILTER_PERCENT_RB <- FILTER_PERCENT_RB_LIST[i] + MULTIPLEX <- MULTIPLEX_LIST[i] + HTO <- HTO_LIST[i] rmarkdown::render( input = file.path(SCRIPTDIR, "seurat.Rmd"), output_dir = file.path(OUTPUTDIR, STEP, SAMPLE_ID), diff --git a/03_Script/sims.py b/03_Script/sims.py deleted file mode 100644 index eb381bf..0000000 --- a/03_Script/sims.py +++ /dev/null @@ -1,99 +0,0 @@ -#.............................................................. -# -#.............................................................. - -#------------------------------------- -# Import -#------------------------------------- -from scsims import SIMS -from pytorch_lightning.loggers import WandbLogger -from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint -import os -import sys -import anndata as an - -#------------------------------------- -# Path to files / folders -#------------------------------------- -DIRECTORY = os.getcwd() -REF = os.path.join(DIRECTORY, "01_Reference") -OUTPUTDIR = os.path.join(DIRECTORY, "05_Output") - -SAMPLE_ID = sys.argv[1] -REFERENCE_MATRIX = sys.argv[2] -PROJECT_NAME = sys.argv[3] -CLASS_LABEL = sys.argv[4] -NUM_WORKERS = sys.argv[5] -MONITOR = sys.argv[6] -PATIENCE = sys.argv[7] -MAX_EPOCH = sys.argv[8] -MATRIX = sys.argv[9] - -STEP3 = "03_sims/" - -# #----------------------------------------------------------------------------------------------- -# # To train the model -# #----------------------------------------------------------------------------------------------- - -# #------------------------------------- -# # Logger : Allows you to visualize -# # data training. -# #------------------------------------- -# logger = WandbLogger(project = PROJECT_NAME, name = PROJECT_NAME, save_dir = os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID), version = "") - -# #------------------------------------- -# # Load the anndata file -# #------------------------------------- -# reference_matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, REFERENCE_MATRIX + "_join_metadata.h5ad")) - -# #------------------------------------- -# # Custom training jobs -# #------------------------------------- -# sims = SIMS(data = reference_matrix, class_label = CLASS_LABEL, num_workers = 12, stratify = True) - -# # weighting loss inversely proportional by label freq, helps learn rare cell types (recommended) -# sims.setup_model(n_a=64, n_d=64, weights=sims.weights) - -# sims.setup_trainer( -# logger = logger, -# callbacks = [ -# EarlyStopping( -# monitor = MONITOR, -# patience = 25, -# verbose = 1 -# ), -# LearningRateMonitor(logging_interval = "epoch"), -# ModelCheckpoint( -# dirpath = os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID), -# filename = "checkpoint" -# ), -# ], -# max_epochs = 100, -# devices = 1 -# ) - -# sims.train() - -#----------------------------------------------------------------------------------------------- -# To predict labels on our data -#----------------------------------------------------------------------------------------------- - -#------------------------------------- -# Load the model -#------------------------------------- -model = os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, "checkpoint-v1.ckpt") - -#------------------------------------- -# Prediction -#------------------------------------- -sims = SIMS(weights_path = model) -cell_prediction = sims.predict(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".h5ad")) - -#------------------------------------- -# Write prediction in a csv file -#------------------------------------- -matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".h5ad")) - -cell_prediction.insert(0,"Cells", matrix.obs_names) - -cell_prediction.to_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + "_prediction.csv"), index = False) \ No newline at end of file diff --git a/03_Script/sims_prediction.py b/03_Script/sims_prediction.py new file mode 100644 index 0000000..29c7f9f --- /dev/null +++ b/03_Script/sims_prediction.py @@ -0,0 +1,53 @@ +#.............................................................. +# SIMS : Scalable, Interpretable Machine Learning for +# Single-Cell : A machine learning tool for scRNA-Seq label +# transfer in neuroscience +#.............................................................. + +#------------------------------------- +# Import +#------------------------------------- +from scsims import SIMS +from pytorch_lightning.loggers import WandbLogger +from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint +import os +import sys +import anndata as an +import wandb + +#------------------------------------- +# Path to files / folders +#------------------------------------- +DIRECTORY = os.getcwd() +OUTPUTDIR = os.path.join(DIRECTORY, "05_Output") + +SAMPLE_ID = sys.argv[1] +MATRIX = sys.argv[2] +EPOCH = sys.argv[3] +REFERENCE_NAME = sys.argv[4] + +STEP3 = "03_sims/" + +#----------------------------------------------------------------------------------------------- +# To predict labels on our data +#----------------------------------------------------------------------------------------------- + +#------------------------------------- +# Load the model +#------------------------------------- +model = os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, EPOCH) + +#------------------------------------- +# Prediction +#------------------------------------- +sims = SIMS(weights_path = model) +cell_prediction = sims.predict(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".h5ad")) + +#------------------------------------- +# Write prediction in a csv file +#------------------------------------- +matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, MATRIX + ".h5ad")) + +cell_prediction.insert(0,"Cells", matrix.obs_names) + +cell_prediction.to_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, SAMPLE_ID + "_" + MATRIX + "_prediction.csv"), index = False) \ No newline at end of file diff --git a/03_Script/sims_training.py b/03_Script/sims_training.py new file mode 100644 index 0000000..abd3e3c --- /dev/null +++ b/03_Script/sims_training.py @@ -0,0 +1,84 @@ +#.............................................................. +# SIMS : Scalable, Interpretable Machine Learning for +# Single-Cell : A machine learning tool for scRNA-Seq label +# transfer in neuroscience +#.............................................................. + +#------------------------------------- +# Import +#------------------------------------- +from scsims import SIMS +from pytorch_lightning.loggers import WandbLogger +from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint +import os +import sys +import anndata as an +import wandb + +#------------------------------------- +# Path to files / folders +#------------------------------------- +DIRECTORY = os.getcwd() +REF = os.path.join(DIRECTORY, "01_Reference") +OUTPUTDIR = os.path.join(DIRECTORY, "05_Output") + +REFERENCE_NAME = sys.argv[1] +REFERENCE_MATRIX = sys.argv[2] +PROJECT_NAME = sys.argv[3] +CLASS_LABEL = sys.argv[4] +NUM_WORKERS = sys.argv[5] +MONITOR = sys.argv[6] +PATIENCE = sys.argv[7] +MAX_EPOCH = sys.argv[8] +MATRIX = sys.argv[9] +KEY = sys.argv[10] + +STEP3 = "03_sims/" + +#----------------------------------------------------------------------------------------------- +# To train the model +#----------------------------------------------------------------------------------------------- + +# ------------------------------------- +# Logger : Allows you to visualize +# data training. +# ------------------------------------- +### To use wandb +wandb.login(key = KEY) +logger = WandbLogger(project = PROJECT_NAME, name = PROJECT_NAME, save_dir = os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME), version = "") + +### To turn off wandb +# logger = WandbLogger(offline=True) + +#------------------------------------- +# Load the anndata file +#------------------------------------- +reference_matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, REFERENCE_MATRIX + "_join_metadata.h5ad")) + +#------------------------------------- +# Custom training jobs +#------------------------------------- +sims = SIMS(data = reference_matrix, class_label = CLASS_LABEL, num_workers = int(NUM_WORKERS), stratify = True) + +# weighting loss inversely proportional by label freq, helps learn rare cell types (recommended) +sims.setup_model(n_a=64, n_d=64, weights=sims.weights) + +sims.setup_trainer( + logger = logger, + callbacks = [ + EarlyStopping( + monitor = MONITOR, + patience = int(PATIENCE), + verbose = 1, + ), + LearningRateMonitor(logging_interval = "epoch"), + ModelCheckpoint( + dirpath = os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME), + every_n_epochs = 1, + save_top_k = -1 + ), + ], + max_epochs = int(MAX_EPOCH) +) + +sims.train() diff --git a/03_Script/to_anndata_file.py b/03_Script/to_anndata_file.py index 2b1efc1..7ba168b 100644 --- a/03_Script/to_anndata_file.py +++ b/03_Script/to_anndata_file.py @@ -21,63 +21,49 @@ OUTPUT_NAME_REF_MATRIX = sys.argv[2] OUTPUT_NAME_MATRIX = sys.argv[3] OUTPUT_NAME_REF_METADATA = sys.argv[4] +CELLS_COLUMN = sys.argv[5] +REFERENCE_NAME = sys.argv[6] STEP3 = "03_sims/" #------------------------------------- # Load reference matrix #------------------------------------- -reference_matrix = an.read_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_MATRIX + ".csv")) -#------------------------------------- -# Write matrix as anndata -#------------------------------------- -reference_matrix.write_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_MATRIX + ".h5ad")) - -anndata_ref_matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_MATRIX + ".h5ad")) -print("REFERENCE MATRIX") -print("-------------------------------------------------------------------") -print(anndata_ref_matrix) -print("-------------------------------------------------------------------") -print(anndata_ref_matrix.var_names) -print("-------------------------------------------------------------------") -print(anndata_ref_matrix.obs_names) -print("-------------------------------------------------------------------") - -#------------------------------------- -# Write matrix as anndata with -# metadata inside -#------------------------------------- -reference_metadata = pd.read_csv(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_METADATA + ".csv"), index_col = "NAME") -anndata_ref_matrix.obs = pd.concat([anndata_ref_matrix.obs, reference_metadata], axis=1, join='inner') - -anndata_ref_matrix.write_h5ad(os.path.join(OUTPUTDIR, STEP3, SAMPLE_ID, OUTPUT_NAME_REF_MATRIX + "_join_metadata.h5ad")) - -print("REFERENCE MATRIX AVEC METADATA") -print("-------------------------------------------------------------------") -print(anndata_ref_matrix) -print("-------------------------------------------------------------------") -print(anndata_ref_matrix.var_names) -print("-------------------------------------------------------------------") -print(anndata_ref_matrix.obs_names) -print("-------------------------------------------------------------------") - -cell_name_to_observe = "P4_AAACCTGAGATCGATA-1" - -cell_data = anndata_ref_matrix.obs.loc[cell_name_to_observe] -print("") -print("") -print("-------------------------------------------------------------------") -print(cell_data) - - -gene_name_to_observe = "mt-Cytb" - -gene_data = anndata_ref_matrix.var.loc[gene_name_to_observe ] -print("") -print("") -print("-------------------------------------------------------------------") -print(gene_data.head(5)) +# reference_matrix = an.read_csv(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, OUTPUT_NAME_REF_MATRIX + ".csv")) + +# #------------------------------------- +# # Write matrix as anndata +# #------------------------------------- +# reference_matrix.write_h5ad(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, OUTPUT_NAME_REF_MATRIX + ".h5ad")) + +# anndata_ref_matrix = an.read_h5ad(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, OUTPUT_NAME_REF_MATRIX + ".h5ad")) +# print("REFERENCE MATRIX") +# print("-------------------------------------------------------------------") +# print(anndata_ref_matrix) +# print("-------------------------------------------------------------------") +# print(anndata_ref_matrix.var_names) +# print("-------------------------------------------------------------------") +# print(anndata_ref_matrix.obs_names) +# print("-------------------------------------------------------------------") + +# #------------------------------------- +# # Write matrix as anndata with +# # metadata inside +# #------------------------------------- +# reference_metadata = pd.read_csv(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, OUTPUT_NAME_REF_METADATA + ".csv"), index_col = CELLS_COLUMN ) +# anndata_ref_matrix.obs = pd.concat([anndata_ref_matrix.obs, reference_metadata], axis=1, join='inner') + +# anndata_ref_matrix.write_h5ad(os.path.join(OUTPUTDIR, STEP3, REFERENCE_NAME, OUTPUT_NAME_REF_MATRIX + "_join_metadata.h5ad")) + +# print("REFERENCE MATRIX AVEC METADATA") +# print("-------------------------------------------------------------------") +# print(anndata_ref_matrix) +# print("-------------------------------------------------------------------") +# print(anndata_ref_matrix.var_names) +# print("-------------------------------------------------------------------") +# print(anndata_ref_matrix.obs_names) +# print("-------------------------------------------------------------------") #------------------------------------- # Load the matrix to annotate diff --git a/03_Script/umapCellAssignation.R b/03_Script/umapCellAssignation.R new file mode 100644 index 0000000..79896c2 --- /dev/null +++ b/03_Script/umapCellAssignation.R @@ -0,0 +1,120 @@ +#------------------------------------- +# This script add the cell labels in a seurat object +# Unknown cell labels are add by the use of threshold +# (difference btw first pred and second pred) +# UMAP are produced : +# - For each labels +# - Global UMAP with sims annotation without unknown threshold +# - Global UMAP with sims annotation with unknown threshold +#------------------------------------- + +library(Seurat) +library(data.table) +library(cowplot) +library(ggplot2) + +#------------------------------------- +# Path to files / folders +#------------------------------------- +DIRECTORY = getwd() +TEXT_OUTPUT = snakemake@output[["data_for_sims_output"]] +OUTPUTDIR = file.path((DIRECTORY), "05_Output") +RDS_ASSIGNATION = snakemake@params[["rds_assignation"]] +PRED_FILTERED = snakemake@params[["pred_filtered"]] + +SAMPLE_ID = snakemake@params[["sample_id"]] + +MATRIX = snakemake@params[["matrix"]] + +STEP2 = "02_seurat/" +STEP3 = "03_sims/" + +# Load data +data <- readRDS(file = file.path(OUTPUTDIR, STEP2, SAMPLE_ID, paste0(SAMPLE_ID,"_filtered_seurat_object.rds"))) +pred <- fread(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(SAMPLE_ID, "_", MATRIX, "_prediction.csv"))) + +# Assigning predictions and probabilities to data +data$labels <- pred$first_pred +Idents(data) <- data$labels +diff_prob <- pred$first_prob - pred$second_prob +data$diff_prob <- diff_prob + +# Threshold to say if a cell is unknown or not +threshold <- snakemake@params[["threshold"]] +cells_threshold <- colnames(data)[data$diff_prob < threshold] + +Idents(object = data, cells = cells_threshold) <- "unknown" + +# List of unique identifiers +list <- unique(Idents(data)) +# list <- c(list,"unknown") + +# Function for creating graphics +createUMAPPlot <- function(data, id) { + cells <- WhichCells(data, idents = id) + umap <- DimPlot(data, reduction = "umap", group.by = "labels", + cells.highlight = list(cells), cols.highlight = c("brown2"), + cols = "grey") + ggtitle(id) + return(umap) +} + +# Create a list of graphics +plots_list <- lapply(list, createUMAPPlot, data = data) + +# Set number of graphics per page +num_graphiques_par_page <- 1 + +# Divide graphics into pages +pages <- split(plots_list, ceiling(seq_along(plots_list) / num_graphiques_par_page)) + +#------------------------------------- +# Save each figure on a pdf +#------------------------------------- +pdf(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(SAMPLE_ID, "_umap_per_labels.pdf"))) +for (page in pages) { + current_page <- plot_grid(plotlist = page, ncol = 1) + print(current_page) +} +dev.off() + +# Create PDF with global UMAP graph (with sims assignation) +title <- paste("UMAP", SAMPLE_ID, sep=' ') +pdf(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(SAMPLE_ID, "_umap_sims.pdf")), width=12, height=12) +UMAP <- DimPlot(data, group.by = "labels", label.size = 6,repel = TRUE) + theme(legend.position = "bottom") + ggtitle(title) +print(UMAP) +dev.off() + +# Create PDF with UMAP graph of cells exceeding threshold (without NA cells) +cells_threshold <- subset(data, subset = diff_prob >= threshold) +title_threshold <- paste("UMAP filtered", SAMPLE_ID, sep=' ') + +pdf(file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(SAMPLE_ID, "_umap_sims_threshold.pdf")), width=12, height=12) +UMAP_threshold <- DimPlot(cells_threshold, group.by = "labels", repel = TRUE) + theme(legend.position = "bottom") + ggtitle(title_threshold) +print(UMAP_threshold) +dev.off() + +#------------------------------------- +# Update the label in the metadata with +# unknown cells and save the RDS +#------------------------------------- + +data$labels <- Idents(data) +saveRDS(data, file = file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(SAMPLE_ID, "_", RDS_ASSIGNATION))) + +#------------------------------------- +# create the predicted matrix with the +# output file for the snakemake rule +#------------------------------------- + +tab_pred_filtered <- paste(Cells(cells_threshold),cells_threshold$labels,cells_threshold$diff_prob) +write.table(file=file.path(OUTPUTDIR, STEP3, SAMPLE_ID, paste0(SAMPLE_ID, "_", PRED_FILTERED)), tab_pred_filtered ,sep=",", quote=F, row.names=F, col.names=F) + +#------------------------------------- +# create the output file for the snakemake rule +#------------------------------------- + +TEXT_OUTPUT <- snakemake@output[["umapAssignation_output"]] + +output_file<-file(TEXT_OUTPUT) +writeLines(c("Rules UMAP cell assignation finished"), output_file) +close(output_file) \ No newline at end of file diff --git a/04_Workflow/SIMS.smk b/04_Workflow/SIMS.smk index 4597f3e..14a90d0 100644 --- a/04_Workflow/SIMS.smk +++ b/04_Workflow/SIMS.smk @@ -4,24 +4,25 @@ # single-cell RNA-seq data #---------------------------------------- -rule sims: +rule sims_training: input: anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"), output: - sims_output = expand(OUTPUTDIR + "03_sims/output_sims.txt"), + sims_training_output = expand(OUTPUTDIR + "03_sims/output_sims_training.txt"), params: sims_rule = config["rules"]["sims_rule"], - sample_id = config["reference_sims"]["sample_id"], + reference_name = config["reference_sims"]["reference_name"], reference_matrix = config["reference_sims"]["output_name_ref_matrix"], project_name = config["sims"]["project_name"], class_label = config["sims"]["class_label"], - num_workers = int(config["sims"]["num_workers"]), + num_workers = config["sims"]["num_workers"], monitor = config["sims"]["monitor"], - patience = int(config["sims"]["patience"]), - max_epoch = int(config["sims"]["max_epoch"]), + patience = config["sims"]["patience"], + max_epoch = config["sims"]["max_epoch"], matrix = config["reference_sims"]["output_name_matrix"], + key = config["sims"]["key"], message: "Run SIMS for cell annotation" @@ -30,32 +31,42 @@ rule sims: """ if [ {params.sims_rule} = "FALSE" ] then - touch {output.sims_output} + touch {output.sims_training_output} else - pip install --use-pep517 git+https://github.com/braingeneers/SIMS.git - python 03_Script/sims.py {params.sample_id} {params.reference_matrix} {params.project_name} {params.class_label} \ - {params.num_workers} {params.monitor} {params.patience} {params.max_epoch} {params.matrix} - touch {output.sims_output} + pip install --use-pep517 git+https://github.com/braingeneers/SIMS.git@c3cc547e9223e979fdc70f9af2ae932b729da88e + python 03_Script/sims_training.py {params.reference_name} {params.reference_matrix} {params.project_name} {params.class_label} \ + {params.num_workers} {params.monitor} {params.patience} {params.max_epoch} {params.matrix} {params.key} + touch {output.sims_training_output} fi """ +rule sims_prediction: + input: + sims_training_output = expand(OUTPUTDIR + "03_sims/output_sims_training.txt"), + output: + sims_prediction_output = expand(OUTPUTDIR + "03_sims/output_sims_prediction.txt"), + # sims_prediction_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_data_matrix_prediction.csv", sample_id = SAMPLE_ID), caption = REPORT + "label.rst", category = "03 sims"), + # sims_prediction_unknown_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_data_matrix_prediction_filtered.csv", sample_id = SAMPLE_ID), caption = REPORT + "label.rst", category = "03 sims"), - # Si ne marche pas de ma façon : A REFAIRE COMME CA !!! - # shell: - # """ - # if [ {params.sims.rule} = "FALSE" ] - # then - # touch {output.sims.output} - # else - # export PATH="/home/lchabot/.local/bin:$PATH" - # export PATH="/scratch/lchabot/BIOINFO_PROJECT/workflow_scrnaseq_P5-30/.local/bin:$PATH" - # pip install --upgrade pip setuptools wheel - # pip install protobuf==3.20.* - # pip install --use-pep517 git+https://github.com/braingeneers/SIMS.git - # python 03_Script/sims.py {params.reference_0} {params.metadata_0} {params.project_name} {params.class_label} {params.max_epoch} - # touch {output.sims.output} - # fi - # """ - - # export Path => Chemin ou est localiser l'instalation python et pip \ No newline at end of file + params: + sims_rule = config["rules"]["sims_rule"], + sample_id = config["reference_sims"]["sample_id"], + matrix = config["reference_sims"]["output_name_matrix"], + epoch = config["sims"]["epoch"], + reference_name = config["reference_sims"]["reference_name"], + + message: + "Run SIMS for cell annotation" + + shell: + """ + if [ {params.sims_rule} = "FALSE" ] + then + touch {output.sims_prediction_output} + else + pip install --use-pep517 git+https://github.com/braingeneers/SIMS.git@c3cc547e9223e979fdc70f9af2ae932b729da88e + python 03_Script/sims_prediction.py {params.sample_id} {params.matrix} {params.epoch} {params.reference_name} + touch {output.sims_prediction_output} + fi + """ \ No newline at end of file diff --git a/04_Workflow/knnor.smk b/04_Workflow/knnor.smk new file mode 100644 index 0000000..dfdd0f7 --- /dev/null +++ b/04_Workflow/knnor.smk @@ -0,0 +1,38 @@ +#---------------------------------------- +# KNNOR : K-Nearest Neighbor +# OveRsampliing method : For imbalanced +# datasets +#---------------------------------------- +rule knnor: + input: + data_for_sims_output = expand(OUTPUTDIR + "03_sims/data_for_sims_output.txt"), + + output: + knnor_output = expand(OUTPUTDIR +"03_sims/knnor_output.txt"), + + params: + knnor_rule = config["rules"]["knnor_rule"], + sample_id = config["reference_sims"]["sample_id"], + matrix = config["reference_sims"]["output_name_ref_matrix"], + metadata = config["reference_sims"]["output_name_ref_metadata"], + class_label = config["knnor"]["class_label"], + max_oversampling = config["knnor"]["max_oversampling"], + cells_column = config["reference_sims"]["cells_column"], + + conda: + CONTAINER + "knnor.yaml" + + message: + "Oversampling data with KNNOR" + + shell: + """ + if [ {params.knnor_rule} = "FALSE" ] + then + touch {output.knnor_output} + else + + python 03_Script/imbalanced_datasets.py {params.sample_id} {params.matrix} {params.metadata} {params.class_label} "{params.max_oversampling}" {params.cells_column} + touch {output.knnor_output} + fi + """ \ No newline at end of file diff --git a/04_Workflow/prepare_data_sims.smk b/04_Workflow/prepare_data_sims.smk index 0c2f8bc..862b379 100644 --- a/04_Workflow/prepare_data_sims.smk +++ b/04_Workflow/prepare_data_sims.smk @@ -11,25 +11,21 @@ rule data_for_sims: params: # sims_rule = config["rules"]["sims_rule"], - # General + ### General r_script = config["reference_sims"]["r_script"], + reference_name = config["reference_sims"]["reference_name"], sample_id = config["reference_sims"]["sample_id"], output_name_ref_metadata = config["reference_sims"]["output_name_ref_metadata"], output_name_ref_matrix = config["reference_sims"]["output_name_ref_matrix"], output_name_matrix = config["reference_sims"]["output_name_matrix"], - # Reference Arlotta + norm_method = config["seurat"]["norm_method"], + norm_scale_factor = config["seurat"]["norm_scale_factor"], + ### Reference Arlotta arlotta_metadata = config["reference_sims"]["arlotta_metadata"], - arlotta_matrix = config["reference_sims"]["arlotta_matrix"], - arlotta_cells = config["reference_sims"]["arlotta_cells"], - arlotta_features = config["reference_sims"]["arlotta_features"], - pattern_to_keep = config["reference_sims"]["pattern_to_keep"], - # Reference Allen mouse whole cortex + developmental_time = config["reference_sims"]["developmental_time"], + ### Reference Allen mouse whole cortex allen_metadata = config["reference_sims"]["allen_metadata"], allen_matrix = config["reference_sims"]["allen_matrix"], - allen_genes = config["reference_sims"]["allen_genes"], - allen_cells = config["reference_sims"]["allen_cells"], - # Reference MouseGastrulation - mousegastrulation_samples = config["reference_sims"]["mousegastrulation_samples"].split(','), conda: CONTAINER + "preparation_sims.yaml" @@ -58,6 +54,8 @@ rule anndata_for_sims: output_name_ref_matrix = config["reference_sims"]["output_name_ref_matrix"], output_name_matrix = config["reference_sims"]["output_name_matrix"], output_name_ref_metadata = config["reference_sims"]["output_name_ref_metadata"], + cells_column = config["reference_sims"]["cells_column"], + reference_name = config["reference_sims"]["reference_name"], conda: CONTAINER + "preparation_sims.yaml" @@ -71,7 +69,7 @@ rule anndata_for_sims: then touch {output.anndata_for_sims_output} else - python 03_Script/to_anndata_file.py {params.sample_id} {params.output_name_ref_matrix} {params.output_name_matrix} {params.output_name_ref_metadata} + python 03_Script/to_anndata_file.py {params.sample_id} {params.output_name_ref_matrix} {params.output_name_matrix} {params.output_name_ref_metadata} {params.cells_column} {params.reference_name} touch {output.anndata_for_sims_output} fi """ \ No newline at end of file diff --git a/04_Workflow/remi.smk b/04_Workflow/remi.smk new file mode 100644 index 0000000..b18fd1f --- /dev/null +++ b/04_Workflow/remi.smk @@ -0,0 +1,18 @@ +rule remi: + input: + ref_cellranger_output = expand(OUTPUTDIR + "01_cellranger/ref_cellranger_output.txt"), + + output: + remi_output = expand(OUTPUTDIR + "remi_output.txt"), + + conda: + CONTAINER + "preparation_sims.yaml" + + message: + "TEST REMI" + + shell: + """ + Rscript 03_Script/remi.R + touch {output.remi_output} + """ \ No newline at end of file diff --git a/04_Workflow/seurat.smk b/04_Workflow/seurat.smk index 0c6ac8d..5c0e0ff 100644 --- a/04_Workflow/seurat.smk +++ b/04_Workflow/seurat.smk @@ -18,6 +18,8 @@ rule seurat: sc_data = expand(OUTPUTDIR + "01_cellranger/{sample_id}/count/sample_filtered_feature_bc_matrix/",sample_id = SAMPLE_ID), cell_ranger_count_path = config["seurat"]["cell_ranger_count_path"], sample_id = expand("{sample_id.id}", sample_id = sample_id.itertuples()), + # multiplex = expand("{multiplex.multiplex}", multiplex = multiplex.itertuples()), + # hto = expand("{hto.hto}", hto = hto.itertuples()), plot_raster_nbcells_threshold = config["seurat"]["plot_raster_nbcells_threshold"], qc_exploration_mode = config["seurat"]["qc_exploration_mode"], min_cells = config["seurat"]["min_cells"], @@ -31,18 +33,18 @@ rule seurat: filter_percent_rb = config["seurat"]["filter_percent_rb"].split(','), pattern_mt = config["seurat"]["pattern_mt"], pattern_rb = config["seurat"]["pattern_rb"], - norm_method = config["seurat"]["norm_method"], - norm_scale_factor = config["seurat"]["norm_scale_factor"], - feature_select_method = config["seurat"]["feature_select_method"], - variable_features = config["seurat"]["variable_features"], - variable_features_showtop = config["seurat"]["variable_features_showtop"], + # norm_method = config["seurat"]["norm_method"], + # norm_scale_factor = config["seurat"]["norm_scale_factor"], + # feature_select_method = config["seurat"]["feature_select_method"], + # variable_features = config["seurat"]["variable_features"], + # variable_features_showtop = config["seurat"]["variable_features_showtop"], dims = config["seurat"]["dims"], - pca_npc = config["seurat"]["pca_npc"], - pca_plots_nbdims = config["seurat"]["pca_plots_nbdims"], - pca_plot_nbfeatures = config["seurat"]["pca_plot_nbfeatures"], - dimreduc_use_pca_nbdims = config["seurat"]["dimreduc_use_pca_nbdims"], - findclusters_use_pca_nbdims = config["seurat"]["findclusters_use_pca_nbdims"], - findneighbors_k = config["seurat"]["findneighbors_k"], + # pca_npc = config["seurat"]["pca_npc"], + # pca_plots_nbdims = config["seurat"]["pca_plots_nbdims"], + # pca_plot_nbfeatures = config["seurat"]["pca_plot_nbfeatures"], + # dimreduc_use_pca_nbdims = config["seurat"]["dimreduc_use_pca_nbdims"], + # findclusters_use_pca_nbdims = config["seurat"]["findclusters_use_pca_nbdims"], + # findneighbors_k = config["seurat"]["findneighbors_k"], findclusters_resolution = config["seurat"]["findclusters_resolution"], findclusters_algorithm = config["seurat"]["findclusters_algorithm"], @@ -50,4 +52,58 @@ rule seurat: "Run Seurat for the clustering" script: - SCRIPTDIR + "seurat_reports_compilation.R" \ No newline at end of file + SCRIPTDIR + "seurat_reports_compilation.R" + +#---------------------------------------- +# Label Transfert with Seurat +#---------------------------------------- + +rule seurat_labelTransfert: + input: + seurat_output = expand(OUTPUTDIR + "02_seurat/seurat_output.txt"), + + output: + seurat_labelTransfert_output = expand(OUTPUTDIR + "02_seurat/seurat_labelTransfert_output.txt"), + + conda: + CONTAINER + "labelTransfer.yaml" + + params: + scriptdir = SCRIPTDIR, + sample_id = expand("{sample_id.id}", sample_id = sample_id.itertuples()), + + message: + "Run Seurat for the transfert data" + + shell: + """ + Rscript -e 'if (!requireNamespace("DoubletFinder", quietly=TRUE)) {{ + if (!requireNamespace("remotes", quietly=TRUE)) install.packages("remotes"); + remotes::install_github("chris-mcginnis-ucsf/DoubletFinder") + }}' + Rscript {params.scriptdir}/labelTransfert_seurat.R {params.sample_id} + touch {output.seurat_labelTransfert_output} + """ + +#---------------------------------------- +# DEG analysis with Seurat +#---------------------------------------- + +rule seurat_DEG: + input: + seurat_labelTransfert_output = expand(OUTPUTDIR + "02_seurat/seurat_labelTransfert_output.txt"), + + output: + seurat_DEG_output = expand(OUTPUTDIR + "04_diffexp/seurat_DEG_output.txt"), + + conda: + CONTAINER + "seurat.yaml" + + params: + time_point = config["diffexp"]["time_point"], + + message: + "Run Seurat for the DE analysis" + + script: + SCRIPTDIR + "DEG_analysis.R" diff --git a/04_Workflow/umapCellAssignation.smk b/04_Workflow/umapCellAssignation.smk new file mode 100644 index 0000000..5f7d02f --- /dev/null +++ b/04_Workflow/umapCellAssignation.smk @@ -0,0 +1,25 @@ +rule umapCellAssignation: + input: + sims_prediction_output = expand(OUTPUTDIR + "03_sims/output_sims_prediction.txt"), + + output: + umapAssignation_output = expand(OUTPUTDIR + "03_sims/umapAssignation_output.txt"), + # umap_sims_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims.pdf", sample_id = SAMPLE_ID), caption = REPORT + "umap_sims.rst", category = "03 sims"), + # umap_sims_threshold_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims_threshold.pdf", sample_id = SAMPLE_ID), caption = REPORT + "umap_sims.rst", category = "03 sims"), + # umap_per_labels_report = report(expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_per_labels.pdf", sample_id = SAMPLE_ID), caption = REPORT + "umap_sims.rst", category = "03 sims"), + + params: + sample_id = config["reference_sims"]["sample_id"], + matrix = config["reference_sims"]["output_name_matrix"], + threshold = config["sims"]["threshold"], + rds_assignation = config["sims"]["rds_assignation"], + pred_filtered = config["sims"]["pred_filtered"], + + conda: + CONTAINER + "seurat.yaml" + + message: + "Cells assignation on UMAP" + + script: + SCRIPTDIR + "umapCellAssignation.R" \ No newline at end of file diff --git a/06_Schemas/config.schema.yaml b/06_Schemas/config.schema.yaml index 4087adc..112a404 100644 --- a/06_Schemas/config.schema.yaml +++ b/06_Schemas/config.schema.yaml @@ -4,173 +4,173 @@ description: snakemake configuration file type: object -properties: - rules: - type: object - properties: - reference_enhancer_rule: - type: string - ref_cellranger_rule: - type: string - multi_rule: - type: string - required: - - reference_enhancer_rule - - ref_cellranger_rule +# properties: +# rules: +# type: object +# properties: +# reference_enhancer_rule: +# type: string +# ref_cellranger_rule: +# type: string +# multi_rule: +# type: string +# required: +# - reference_enhancer_rule +# - ref_cellranger_rule - fastq: - type: object - properties: - sname: - type: string - biop: - type: string - sra: - type: string - required: - - sname - - biop +# fastq: +# type: object +# properties: +# sname: +# type: string +# biop: +# type: string +# sra: +# type: string +# required: +# - sname +# - biop - reference: - type: object - properties: - link_ref_fasta: - type: string - ref_gtf: - type: string - link_ref_gtf: - type: string - ref_cellranger: - type: string - ref_version: - type: string - required: - - link_ref_fasta - - link_ref_gtf - - ref_cellranger - - ref_version +# reference: +# type: object +# properties: +# link_ref_fasta: +# type: string +# ref_gtf: +# type: string +# link_ref_gtf: +# type: string +# ref_cellranger: +# type: string +# ref_version: +# type: string +# required: +# - link_ref_fasta +# - link_ref_gtf +# - ref_cellranger +# - ref_version - cellranger: - type: object - properties: - fastqs: - type: string - cells: - type: integer - localcores: - type: integer - localmem: - type: integer - required: - - fastqs - - expect_cells - - localcores - - localmem +# cellranger: +# type: object +# properties: +# fastqs: +# type: string +# cells: +# type: integer +# localcores: +# type: integer +# localmem: +# type: integer +# required: +# - fastqs +# - expect_cells +# - localcores +# - localmem - demuxlet: - type: object - properties: - spooled: - type: string - required: - - spooled +# demuxlet: +# type: object +# properties: +# spooled: +# type: string +# required: +# - spooled - seurat: - type: object - properties: - plot_raster_nbcells_threshold: - type: integer - qc_exploration_mode: - type: string - min_cells: - type: integer - min_features: - type: integer - filter_umi_min: - type: string - filter_umi_max: - type: string - filter_feature_min: - type: string - filter_feature_max: - type: string - filter_percent_mt: - type: string - filter_percent_mt_min: - type: string - filter_percent_rb: - type: string - pattern_mt: - type: string - pattern_rb: - type: string - norm_method: - type: string - norm_scale_factor: - type: integer - feature_select_method: - type: string - variable_features: - type: integer - variable_features_showtop: - type: integer - pca_npc: - type: integer - pca_plots_nbdims: - type: integer - pca_plot_nbfeatures: - type: integer - dimreduc_use_pca_nbdims: - type: integer - findclusters_use_pca_nbdims: - type: integer - findneighbors_k: - type: integer - findclusters_resolution: - type: integer - findclusters_algorithm: - type: integer - required: - - plot_raster_nbcells_threshold - - qc_exploration_mode - - min_cells - - min_features - - filter_umi_min - - filter_umi_max - - filter_feature_min - - filter_feature_max - - filter_percent_mt - - filter_percent_mt_min - - filter_percent_rb - - pattern_mt - - pattern_rb - - norm_method - - norm_scale_factor - - feature_select_method - - variable_features - - variable_features_showtop - - pca_npc - - pca_plots_nbdims - - pca_plot_nbfeatures - - dimreduc_use_pca_nbdims - - findclusters_use_pca_nbdims - - findneighbors_k - - findclusters_resolution - - findclusters_algorithm +# seurat: +# type: object +# properties: +# plot_raster_nbcells_threshold: +# type: integer +# qc_exploration_mode: +# type: string +# min_cells: +# type: integer +# min_features: +# type: integer +# filter_umi_min: +# type: string +# filter_umi_max: +# type: string +# filter_feature_min: +# type: string +# filter_feature_max: +# type: string +# filter_percent_mt: +# type: string +# filter_percent_mt_min: +# type: string +# filter_percent_rb: +# type: string +# pattern_mt: +# type: string +# pattern_rb: +# type: string +# norm_method: +# type: string +# norm_scale_factor: +# type: integer +# feature_select_method: +# type: string +# variable_features: +# type: integer +# variable_features_showtop: +# type: integer +# pca_npc: +# type: integer +# pca_plots_nbdims: +# type: integer +# pca_plot_nbfeatures: +# type: integer +# dimreduc_use_pca_nbdims: +# type: integer +# findclusters_use_pca_nbdims: +# type: integer +# findneighbors_k: +# type: integer +# findclusters_resolution: +# type: integer +# findclusters_algorithm: +# type: integer +# required: +# - plot_raster_nbcells_threshold +# - qc_exploration_mode +# - min_cells +# - min_features +# - filter_umi_min +# - filter_umi_max +# - filter_feature_min +# - filter_feature_max +# - filter_percent_mt +# - filter_percent_mt_min +# - filter_percent_rb +# - pattern_mt +# - pattern_rb +# - norm_method +# - norm_scale_factor +# - feature_select_method +# - variable_features +# - variable_features_showtop +# - pca_npc +# - pca_plots_nbdims +# - pca_plot_nbfeatures +# - dimreduc_use_pca_nbdims +# - findclusters_use_pca_nbdims +# - findneighbors_k +# - findclusters_resolution +# - findclusters_algorithm - diffexp: - type: object - properties: - features: - type: string - cell_marker: - type: string - test: - type: string - threshold: - type: integer - required: - - features - - cell_marker - - test - - threshold +# diffexp: +# type: object +# properties: +# features: +# type: string +# cell_marker: +# type: string +# test: +# type: string +# threshold: +# type: integer +# required: +# - features +# - cell_marker +# - test +# - threshold diff --git a/Snakefile b/Snakefile index 2c3ed2b..6619b0a 100644 --- a/Snakefile +++ b/Snakefile @@ -33,6 +33,8 @@ if run_slurm: rawsample = pd.read_table(config["sample"]).set_index(["rawsample"], drop=False) sample_id = pd.read_table(config["sample"]).set_index(["id"], drop=False) sample_name = pd.read_table(config["sample"]).set_index(["name"], drop=False) +multiplex = pd.read_table(config["sample"]).set_index(["multiplex"], drop=False) +hto = pd.read_table(config["sample"]).set_index(["hto"], drop=False) RAWSAMPLE = expand("{rawsample.rawsample}", rawsample = rawsample.itertuples()) # 2_S1_L001, 3_S1_L001 ... SAMPLE_ID = expand("{sample_id.id}", sample_id = sample_id.itertuples()) # P5_KO,P5_WT,P30_KO,P30_WT @@ -59,66 +61,47 @@ rule all: # fastqc_output = expand(OUTPUTDIR + "00_clean/fastqc_output.txt"), ### multiqc ### # multiqc_output = expand(OUTPUTDIR + "00_clean/multiqc_output.txt"), + # raw_multiqc_html = expand(OUTPUTDIR + "00_clean/raw_multiqc.html"), ### ReferenceEnhancer to generate a scRNA-seq optimized transcriptomic reference ### ### To improve with manual curration of the file overlapping_gene_list # reference_enhancer_output = expand(OUTPUTDIR + "01_cellranger/reference_enhancer_output.txt"), - ### reference for cellranger ### + # ### reference for cellranger ### # ref_cellranger_output = expand(OUTPUTDIR + "01_cellranger/ref_cellranger_output.txt"), ### Cell Multiplexing with cellranger multi ### # cellranger_output = expand(OUTPUTDIR + "01_cellranger/cellranger_output.txt"), - ## If you need to aggregate your data + # cellranger_html = expand(OUTPUTDIR + "01_cellranger/{sample_id}/{sample_id}_web_summary.html", sample_id = SAMPLE_ID), + ### If you need to aggregate your data # aggrcsv = ROOTDIR + "/aggregation.csv", # out_aggregate = expand(OUTPUTDIR + "01_cellranger/{agrr}/outs/aggregate_web_summary.html", agrr=AGRR), - ## If demuxiplexing is used + ### If demuxiplexing is used # bcf = OUTPUTDIR + "01_cellranger/Mix_MM_lines/outs/demuxlet_Mix_MM_lines.bcf", # demuxlet = OUTPUTDIR + "01_cellranger/Mix_MM_lines/outs/demuxlet_Mix_MM_lines.best", # tabdemuxlet = OUTPUTDIR + "01_cellranger/Mix_MM_lines/outs/demuxlet_Mix_MM_lines.tsv", - # Seurat - # seurat_report = expand(OUTPUTDIR + "02_seurat/{sample_id}/{sample_id}_seurat_report.html", sample_id = SAMPLE_ID), + ### Seurat + # seurat_report = expand(OUTPUTDIR + "02_seurat/{sample_id}/{sample_id}_seurat_report.html", sample_id = SAMPLE_ID), + ### Remi data + # remi_output = expand(OUTPUTDIR + "remi_output.txt"), + ### Label Transfert with seurat + seurat_labelTransfert_output = expand(OUTPUTDIR + "02_seurat/seurat_labelTransfert_output.txt"), + ### DEG analysis with seurat + # seurat_DEG_output = expand(OUTPUTDIR + "04_diffexp/seurat_DEG_output.txt"), ### Prepare data for SIMS - data_for_sims_output = expand(OUTPUTDIR + "03_sims/data_for_sims_output.txt"), + # data_for_sims_output = expand(OUTPUTDIR + "03_sims/data_for_sims_output.txt"), + # anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"), + ### KNNOR + # knnor_output = expand(OUTPUTDIR +"03_sims/knnor_output.txt"), # anndata_for_sims_output = expand(OUTPUTDIR + "03_sims/anndata_for_sims_output.txt"), ### SIMS - # sims_output = expand(OUTPUTDIR + "03_sims/output_sims.txt"), - - ## Differential expression analyses - # violinplot = expand(OUTPUTDIR + "03_diffexp/violin_plot/{features}_violin_plot.pdf", features=FEATURES), - # umapfeature = expand(OUTPUTDIR + "03_diffexp/umap_plot/{features}_umapfeature_plot.pdf", features=FEATURES), - # tsnefeature = expand(OUTPUTDIR + "03_diffexp/tsne_plot/{features}_tsnefeature_plot.pdf", features=FEATURES), - # ridgefeature = expand(OUTPUTDIR + "03_diffexp/ridge_plot/{features}_ridgefeature_plot.pdf", features=FEATURES), - # heatmapfeature = OUTPUTDIR + "03_diffexp/heatmap/heatmapfeature.pdf", - # diffexp_report = OUTPUTDIR + "03_diffexp/" + NPROJ + "_diffexp_report.html", - # defile_allcells = expand(OUTPUTDIR + "03_diffexp/differential_expression_features/{wt}_vs_AllCellMarker_DE.csv", wt=WT), - # defile = expand(OUTPUTDIR + "03_diffexp/differential_expression_features/{wt}_vs_{cellmarker}_DE.csv", wt=WT, cellmarker=CELLMARKER), - # volcano_allcluster = expand(OUTPUTDIR + "03_diffexp/volcanoplot/volcano_{wt}_vs_AllCellMarker.pdf", wt=WT), - # volcano = expand(OUTPUTDIR + "03_diffexp/volcanoplot/volcano_{wt}_vs_{cellmarker}.pdf", wt=WT, cellmarker=CELLMARKER), - # sign_up_allcell = expand(OUTPUTDIR + "03_diffexp/de_significant/{wt}_vs_AllCellMarker_signif-up-regulated.txt", wt=WT), - # sign_up = expand(OUTPUTDIR + "03_diffexp/de_significant/{wt}_vs_{cellmarker}_signif-up-regulated.txt", wt=WT, cellmarker=CELLMARKER), - # sign_down_allcell = expand(OUTPUTDIR + "03_diffexp/de_significant/{wt}_vs_AllCellMarker_signif-down-regulated.txt", wt=WT), - # sign_down = expand(OUTPUTDIR + "03_diffexp/de_significant/{wt}_vs_{cellmarker}_signif-down-regulated.txt", wt=WT, cellmarker=CELLMARKER), - ### Differential expression analyses inside each cluster after subset (by gene expression) - # diffexp_subset_report = OUTPUTDIR + "04_diffexp_subset/" + NPROJ + "_diffexp_subset_report.html", - # defile_subset = expand(OUTPUTDIR + "04_diffexp_subset/differential_expression/{cluster}_subcluster_DE.csv", cluster=CLUSTER), - # volcano_subset = expand(OUTPUTDIR + "04_diffexp_subset/volcanoplot/volcano_{cluster}.pdf", cluster=CLUSTER), - # sign_up_subset = expand(OUTPUTDIR + "04_diffexp_subset/de_significant/{cluster}_signif-up-regulated.txt", cluster=CLUSTER), - # sign_down_subset = expand(OUTPUTDIR + "04_diffexp_subset/de_significant/{cluster}_signif-down-regulated.txt", cluster=CLUSTER), - # violinplot_subset = expand(OUTPUTDIR + "04_diffexp_subset/violin_plot/{features}_violin_plot.pdf", features=FEATURES), - ### Compress result files and figures ### - # data_matrix_tar = OUTPUTDIR + "02_seurat/data_matrix.tar.gz", - # violinplot_tar = OUTPUTDIR + "03_diffexp/violin_plot.tar.gz", - # umapfeature_tar = OUTPUTDIR + "03_diffexp/umapfeature_plot.tar.gz", - # tsnefeature_tar = OUTPUTDIR + "03_diffexp/tsnefeature_plot.tar.gz", - # ridgefeature_tar = OUTPUTDIR + "03_diffexp/ridgefeature_plot.tar.gz", - # heatmapfeature_tar = OUTPUTDIR + "03_diffexp/heatmapfeature.tar.gz", - # defile_tar = OUTPUTDIR + "03_diffexp/differencial_expression_tests.tar.gz", - # volcano_tar = OUTPUTDIR + "03_diffexp/volcano_plot.tar.gz", - # sign_up_down_tar = OUTPUTDIR + "03_diffexp/up_down_regulated_genes_list.tar.gz", - # defile_subset_tar = OUTPUTDIR + "04_diffexp_subset/differencial_expression_tests.tar.gz", - # volcano_subset_tar = OUTPUTDIR + "04_diffexp_subset/volcano_plot.tar.gz", - # sign_up_subset_tar = OUTPUTDIR + "04_diffexp_subset/up_down_regulated_genes_list.tar.gz", - # violinplot_subset_tar = OUTPUTDIR + "04_diffexp_subset/violin_plot.tar.gz", - # clean = OUTPUTDIR + "03_diffexp/clean.txt", - + # sims_training_output = expand(OUTPUTDIR + "03_sims/output_sims_training.txt"), + # sims_prediction_output = expand(OUTPUTDIR + "03_sims/output_sims_prediction.txt"), + # sims_prediction_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_data_matrix_prediction.csv", sample_id = SAMPLE_ID), + # sims_prediction_unknown_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_data_matrix_prediction_filtered.csv", sample_id = SAMPLE_ID), + ### Representing cellular assignation on UMAP + # umapAssignation_output = expand(OUTPUTDIR + "03_sims/umapAssignation_output.txt"), + # umap_sims_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims.pdf", sample_id = SAMPLE_ID), + # umap_sims_threshold_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_sims_threshold.pdf", sample_id = SAMPLE_ID), + # umap_per_labels_report = expand(OUTPUTDIR + "03_sims/{sample_id}/{sample_id}_umap_per_labels.pdf", sample_id = SAMPLE_ID), + # ---------------------------------------------- # Load rules # ---------------------------------------------- @@ -127,21 +110,22 @@ run_demultiplex = config["run_demultiplex"] run_multiplex = config["run_multiplex"] if run_demultiplex: - include: ENVDIR + "clean_demultiplex.smk" - include: ENVDIR + "cellranger_demultiplex.smk" + # include: ENVDIR + "clean_demultiplex.smk" + # include: ENVDIR + "cellranger_demultiplex.smk" include: ENVDIR + "seurat.smk" - include: ENVDIR + "prepare_data_sims.smk" - include: ENVDIR + "SIMS.smk" + # include: ENVDIR + "remi.smk" + # include: ENVDIR + "prepare_data_sims.smk" + # include: ENVDIR + "SIMS.smk" + # include: ENVDIR + "knnor.smk" + # include: ENVDIR + "umapCellAssignation.smk" if run_multiplex: # include: ENVDIR + "clean.smk" # include: ENVDIR + "cellranger.smk" include: ENVDIR + "seurat.smk" - include: ENVDIR + "prepare_data_sims.smk" + # include: ENVDIR + "remi.smk" + # include: ENVDIR + "prepare_data_sims.smk" # include: ENVDIR + "SIMS.smk" + # include: ENVDIR + "knnor.smk" + # include: ENVDIR + "umapCellAssignation.smk" -# include: ENVDIR + "demuxlet.smk" -# include: ENVDIR + "seurat.smk" -# include: ENVDIR + "diffexp.smk" -# include: ENVDIR + "diffexp_subset.smk" -# include: ENVDIR + "report.smk" diff --git a/config.yaml b/config.yaml index 060cdcd..2213d81 100644 --- a/config.yaml +++ b/config.yaml @@ -22,6 +22,8 @@ rules: ref_cellranger_rule: "FALSE" # Run SIMS to anotate cells sims_rule: "FALSE" + # Run Knnor to help with imbalanced datasets + knnor_rule: "FALSE" run: # FASTQ file naming convention : [Sample name]_S1_L00[Lane Number]_[Read Type]_001.fastq.gz @@ -45,12 +47,12 @@ fastq: # Genome and gff link to create the pre-built Cell Ranger reference reference: - link_ref_fasta: "https://ftp.ensembl.org/pub/release-98/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz" - ref_fasta: "Mus_musculus.GRCm38.dna.primary_assembly.fa" - link_ref_gtf: "https://storage.googleapis.com/generecovery/mouse_mm10_optimized_annotation_v2.gtf.gz" + link_ref_fasta: "https://ftp.ensembl.org/pub/release-109/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.primary_assembly.fa.gz" + ref_fasta: "Mus_musculus.GRCm39.dna.primary_assembly.fa" + link_ref_gtf: "https://ftp.ensembl.org/pub/release-109/gtf/mus_musculus/Mus_musculus.GRCm39.109.gtf.gz" ref_gtf: "Mus_musculus.GRCm39.109.gtf" # Name for the file of the cellranger reference - ref_cellranger: "refdata-cellranger-mm10" + ref_cellranger: "refdata-cellranger-mm39" # Optional reference version string to include with ref_version: "1.0.0" @@ -62,7 +64,7 @@ reference: # Parameter for cellranger multi multi: # A unique run ID string: e.g. sample345 that is also the output folder name. Cannot be more than 64 characters. - id: "S004647_to_S004650" + id: "E14" # Path to config CSV file with input libraries and analysis parameters. config: "multi_config.csv" @@ -94,7 +96,7 @@ seurat: # Number of cells above which use ggplot instead of interactive plotly plot_raster_nbcells_threshold : 10000 # CellRanger count path for each sample for Read10X - cell_ranger_count_path: "/count/sample_filtered_feature_bc_matrix/" + cell_ranger_count_path: "/filtered_feature_bc_matrix/" ##### Filtering # Switch from QC exploration mode and QC filtering @@ -108,18 +110,18 @@ seurat: min_features: 200 # Cells with number of UMIs outside the range will be excluded # Can be a list of UMIs but MUST be on the same order than sample.tsv id. - filter_umi_min: "0,0,0,0" - filter_umi_max: "55000,42000,42000,40000" + filter_umi_min: "800" + filter_umi_max: "6000" # Filter cells that have unique feature counts over/less n # Can be a list of feature counts but MUST be on the same order than sample.tsv id. - filter_feature_min: "1700,1700,1400,1400" - filter_feature_max: "7900,7500,7700,7200" + filter_feature_min: "800" + filter_feature_max: "2500" # Filter cells that have > n% mitochondrial counts - filter_percent_mt: "10,10,8,8" + filter_percent_mt: "6" # Filter cells that have < n% mitochondrial counts - filter_percent_mt_min: "0.2,0.2,0.2,0.2" + filter_percent_mt_min: "0" # Filter cells that have < n% ribosomal counts - filter_percent_rb: "0,0,0,0" + filter_percent_rb: "0" # Pattern to match features against => mitochondrial genes | for mouse : "^mt-" / for human : "^MT-" pattern_mt: "^mt-" # Pattern to match features against => ribosomal genes @@ -127,36 +129,36 @@ seurat: ##### Normalization # Normalization parameters (see Seurat::NormalizeData()) - norm_method: "LogNormalize" - norm_scale_factor: 10000 + # norm_method: "LogNormalize" + # norm_scale_factor: 10000 ##### Find the most variable genes - # How to choose top variable features => vst, mean.var.plot or dispersion - feature_select_method: "vst" - # Number of features to select as top variable features; only used when select_method is set to 'dispersion' or 'vst' - variable_features: 2000 - # For table in report - variable_features_showtop : 200 + # # How to choose top variable features => vst, mean.var.plot or dispersion + # feature_select_method: "vst" + # # Number of features to select as top variable features; only used when select_method is set to 'dispersion' or 'vst' + # variable_features: 2000 + # # For table in report + # variable_features_showtop : 200 ##### PCA parameters # Number of dimention to show for the vizualisation - dims: 30 - # Default number of dimensions to use for PCA (see Seurat::RunPCA()) - pca_npc: 22 - # Number of dimensions to show in PCA-related plots - pca_plots_nbdims : 3 - # Number of'top' features to show when plotting PCA loadings - pca_plot_nbfeatures : 3 + dims: 10 + # # Default number of dimensions to use for PCA (see Seurat::RunPCA()) + # pca_npc: 2 + # # Number of dimensions to show in PCA-related plots + # pca_plots_nbdims : 3 + # # Number of'top' features to show when plotting PCA loadings + # pca_plot_nbfeatures : 3 ##### Dimensionality reduction parameters (TSNE/UMAP) - # Number of dimensions to use from PCA results - dimreduc_use_pca_nbdims : 22 + # # Number of dimensions to use from PCA results + # dimreduc_use_pca_nbdims : 2 ##### Cluster identification parameters # Number of dimensions to use from PCA results - findclusters_use_pca_nbdims : 22 - # Nearest-neighbor graph construction - findneighbors_k : 30 + # findclusters_use_pca_nbdims : 3 + # Nearest-neighb # findclusters_use_pca_nbdims : 3or graph construction + # findneighbors_k : 30 # Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities. findclusters_resolution : 1 # Algorithm for modularity optimization @@ -166,75 +168,85 @@ seurat: # Parameters to build correct reference(s) for sims reference_sims: ##### General ##### - # Path of the script you want to execute (Corespond to the reference you want to use) - # For now : arlotta.R, allen_mouse_whole_cortex.R, mousegastrulation.R - r_script: "mousegastrulation.R" - # - output_name_ref_metadata: "mousegastrulation_reference_metadata" - # - output_name_ref_matrix: "mousegastrulation_reference_matrix" - # - output_name_matrix: "mousegastrulation_matrix" + # Name of the script you want to execute (Corespond to the reference you want to use) + # For now : arlotta.R, allen_mouse_whole_cortex.R + r_script: "arlotta.R" + # Output folder name for the reference + reference_name: "02_Arlotta_not_norm" + # Output file name for the reference metadata + output_name_ref_metadata: "metadata" + # Output file name for the reference matrix + output_name_ref_matrix: "reference_matrix" + # Output file name for the matrix to annotate + output_name_matrix: "data_matrix" + # Name of the column that contain the cells + # allen mouse whole cortex : "sample_name", arlotta : "NEW_NAME" + cells_column: "sample_name" ##### Our Matrix to annotate ##### - # The name of the folder of the sample you want to annotate (Folder in the Seurat folder) - sample_id: "72h" + # The name of the folder of the sample you want to annotate (One of the output in the seurat folder) + sample_id: "E14sn" ##### Paola Arlotta ##### # Path for the arlotta metadata arlotta_metadata: "arlotta/metaData_scDevSC.txt" - # - arlotta_matrix: "arlotta/gene_sorted-matrix.mtx.gz" - # - arlotta_cells: "arlotta/barcodes.tsv" - # - arlotta_features: "arlotta/genes.tsv" # Pattern that correspond to the mouse age that we want to keep for the training of our data - # It's possible to select multiple ones if separate by a coma - # In this reference : E10, E11, E12, E13, E14, E15, E16, E17, E18, P1, P4 - pattern_to_keep: "P4" - + # It's possible to select multiple ones if separate by a coma, exemple: ["E10", "E11", "P4"] + # In this reference : E10, E11, E12, E13, E14, E15, E16, E17, E18_S1, E18_S3, P1, P1_S1, P4 + developmental_time: ["E10", "E11", "E12", "E13", "E14", "E15", "E16", "E17", "E18_S1", "E18_S3", "P1", "P1_S1", "P4"] + ##### Allen_mouse_whole_cortex ##### - # - allen_metadata: "allen/" - # - allen_matrix: "allen/" - # - allen_genes: "allen/" - # - allen_cells: "allen/" - - ##### mousegastrulation ##### - # Samples from the reference as described in EmbryoAtlasData object (for lenne project : c(1:20,23:37) - # mousegastrulation_samples: "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37" - mousegastrulation_samples: "1,5" + # Path for the arlotta metadata + allen_metadata: "allen/metadata.csv" + # Path for the allen matrix (matrix + genes + cells) + allen_matrix: "allen/expression_matrix.hdf5" +# An oversampling technique for imalanced datasets +knnor: + # Name of the column in your metadata you want to oversample + class_label: "subclass_label" + # This method will increase the minority labels up to the value of the majority label. + # If you want to reduce the maximum number of new labels created, you need to remove the majority labels. + # If multiple labels to delete, put a comma between them (with NO SPACE). (You can also keep them all by not writting) + max_oversampling : + # "Apical progenitors,UL CPN,Migrating neurons,Interneurons,Intermediate progenitors,Layer 4,CThPN,DL CPN,SCPN,Astrocytes,Immature neurons,Oligodendrocytes,Cycling glial cells,Cajal Retzius cells,DL_CPN_1,NP,VLMC" sims: ##### General # Name of the column in your metadata on which you wish to train your model. - class_label: "New_cellType" - # - # https://lightning.ai/docs/pytorch/stable/advanced/speed.html + # Aallen mouse whole cortex : "subclass_label", arlotta : "New_cellType" + class_label: "celltype_label" + # Your personnal key to connect to your wandb account + key: "5f86a9267b08ff77c21a8872d763cb76048dfaca" + # Number of cores (Put the same in the run_slurm.sh if necessary) num_workers: 12 - # - max_epoch: 100 + # Maximum number of epoch to train our model (Will stop after, start counting from 0) + max_epoch: 500 ##### Logger - # Name of the project (Will be the name of the folder where the model checkpoint will be saved) - project_name: "P5_WT_test" + # Name of the project (Will be the name of the run and checkpoint) + project_name: "02_Arlotta_not_norm" ##### Early Stopping # The validation metric you wish to monitor for early stop. monitor: "val_loss" - # The number of epochs during which the validation metric may not improve before early shutdown is triggered. - # 10% of the number of epoch to beggin with - patience: 10 + # The number of epochs during which the validation metric (monitor) may not improve before early shutdown is triggered. (To prevent overtraining) + patience: 20 - ##### Model Checkpoint - # + # Epoch file from wandb to chosse for the cell assignation + epoch: "epoch=58-step=80004.ckpt" + + ##### Cells annotation + # Threshold of the difference between the first and second score of the prediction to say if a cell is unknown or not + threshold: 0.1 + # Seurat object with sims cell_assignation + rds_assignation: filtered_assigned_seurat_object.rds + # Matrix with cell predtion after filtration with the threshold + pred_filtered: data_matrix_prediction_filtered.csv # Differentially expressed features analyses diffexp: + # The name of the time point analysis (without WT or KO, ex : E14) + time_point : "E14" # Features of interest features: "SOX10,SOX9,RIPOR2,MITF,DCT,MYC,FLT1,TNFRSF11B,XIRP2,ST3GAL1" # Cells of interest to find differentially expressed features against "Normal" or Wild Type cells @@ -256,6 +268,3 @@ diffexpsubset: genetoremove: "" # Cluster on the seurat object (choose cluster with high number of the marker gene of interest) cluster: "Normal_Melanocytes,M15,M27,A375,MM057,MM087,MM001,MM074" - - - diff --git a/feature_config.csv b/feature_config.csv new file mode 100644 index 0000000..afc355e --- /dev/null +++ b/feature_config.csv @@ -0,0 +1,16 @@ +id,name,read,pattern,sequence,feature_type +HTO-1,HTO-1,R2,^(BC),TTCCTGCCATTACTA,Antibody Capture +HTO-2,HTO-2,R2,^(BC),CCGTACCTCATTGTT,Antibody Capture +HTO-3,HTO-3,R2,^(BC),GGTAGATGTCCTCAG,Antibody Capture +HTO-4,HTO-4,R2,^(BC),TGGTGTCATTCTTGA,Antibody Capture +HTO-5,HTO-5,R2,^(BC),ATGATGAACAGCCAG,Antibody Capture +HTO-6,HTO-6,R2,^(BC),CTCGAACGCTTATCG,Antibody Capture +HTO-7,HTO-7,R2,^(BC),CTTATCACCGCTCAA,Antibody Capture +HTO-8,HTO-8,R2,^(BC),TGACGCCGTTGTTGT,Antibody Capture +HTO-9,HTO-9,R2,^(BC),GCCTAGTATGATCCA,Antibody Capture +HTO-10,HTO-10,R2,^(BC),AGTCACAGTATTCCA,Antibody Capture +HTO-11,HTO-11,R2,^(BC),TTGGCCTTTGTATCG,Antibody Capture +HTO-12,HTO-12,R2,^(BC),AACGCCAGTATGAAC,Antibody Capture +HTO-13,HTO-13,R2,^(BC),GCTTCCGTATATCTG,Antibody Capture +HTO-14,HTO-14,R2,^(BC),AGCAACTCACTCTTT,Antibody Capture +HTO-15,HTO-15,R2,^(BC),CACTAGAGCTTGGAT,Antibody Capture \ No newline at end of file diff --git a/multi_config.csv b/multi_config.csv index ef14fd5..a9ba8ac 100644 --- a/multi_config.csv +++ b/multi_config.csv @@ -1,16 +1,17 @@ [gene-expression] -reference,/scratch/tvannier/ibdm_lenne_scrnaseq_gastruloids/01_Reference/mouse_mm10_optimized_reference_v2 -expect-cells,10000 +reference,/scratch/lchabot/scRNAseq_analysis_workflow/01_Reference/refdata-cellranger-mm39/ +expect-cells,30000 include-introns,true +[feature] +reference,/scratch/lchabot/scRNAseq_analysis_workflow/feature_config.csv + [libraries] fastq_id,fastqs,feature_types -S004647_to_S004650_1,/scratch/tvannier/ibdm_lenne_scrnaseq_gastruloids/00_RawData/S004647_to_S004650_1,Gene Expression -S004647_to_S004650_2,/scratch/tvannier/ibdm_lenne_scrnaseq_gastruloids/00_RawData/S004647_to_S004650_2,Multiplexing Capture +NGc-3-CTX,/scratch/lchabot/scRNAseq_analysis_workflow/00_RawData/NGc_3_CTX,Gene Expression +NGh-HTO-CTX,/scratch/lchabot/scRNAseq_analysis_workflow/00_RawData/NGh_HTO_CTX,Antibody Capture [samples] -sample_id,cmo_ids,description -72h,CMO301,72h -80h,CMO302,80h -86h,CMO303,86h -96h,CMO304,96h \ No newline at end of file +sample_id,cmo_ids +E14_WT,HTO1 +E14_KO,HTO3 \ No newline at end of file diff --git a/run_slurm.sh b/run_slurm.sh index d7cd545..d93aadd 100644 --- a/run_slurm.sh +++ b/run_slurm.sh @@ -1,12 +1,12 @@ #!/bin/sh -#SBATCH -J Snakemake_run + +#SBATCH -J P30_WT_umap #SBATCH -p skylake -#SBATCH -N 1 -#SBATCH -n 12 +#SBATCH --ntasks-per-node 18 #SBATCH -A b324 -#SBATCH -t 2:00:00 +#SBATCH -t 01:00:00 #SBATCH -o ./%N.%x.out -#SBATCH -e ./%N.%x.err +#SBATCH -e ./%N.%x.errs # This script needs to be started from the run directory @@ -16,10 +16,15 @@ module load userspace/all module load python3/3.6.3 module load singularity/3.5.1 pip install snakemake==6.3.0 -pip install pandas +# module load cuda/11.6 # ================================================ # Run the workflow # ================================================ -snakemake --snakefile Snakefile --use-singularity --use-conda --conda-frontend conda --conda-not-block-search-path-envvars --singularity-args="-B /scratch/$SLURM_JOB_USER/ibdm_lenne_scrnaseq_gastruloids/" --cores 12 \ No newline at end of file +# For the worflow in general +snakemake --unlock +snakemake --snakefile Snakefile --use-singularity --use-conda --conda-frontend conda --conda-not-block-search-path-envvars --singularity-args="-B /scratch/$SLURM_JOB_USER/scRNAseq_analysis_workflow/" --cores 16 + +# ### For sims +# snakemake --snakefile Snakefile --use-singularity --singularity-args="-B /scratch/$SLURM_JOB_USER/scRNAseq_analysis_workflow/" --cores 18 \ No newline at end of file diff --git a/sample.tsv b/sample.tsv index 8064522..8bf6467 100644 --- a/sample.tsv +++ b/sample.tsv @@ -1,5 +1,2 @@ -name id rawsample sample lane fastq1 fastq2 barcode -2 72h S004647_to_S004650 S004647 L001 /00_RawData/2_S1_L001_R1_001.fastq.gz /00_RawData/2_S1_L001_R2_001.fastq.gz /00_RawData/2_S1_L001_I1_001.fastq.gz -3 80h S004647_to_S004650 S004648 L001 /00_RawData/3_S1_L001_R1_001.fastq.gz /00_RawData/3_S1_L001_R2_001.fastq.gz /00_RawData/3_S1_L001_I1_001.fastq.gz -6 86h S004647_to_S004650 S004649 L001 /00_RawData/6_S1_L001_R1_001.fastq.gz /00_RawData/6_S1_L001_R2_001.fastq.gz /00_RawData/6_S1_L001_I1_001.fastq.gz -7 96h S004647_to_S004650 S004650 L001 /00_RawData/7_S1_L001_R1_001.fastq.gz /00_RawData/7_S1_L001_R2_001.fastq.gz /00_RawData/7_S1_L001_I1_001.fastq.gz \ No newline at end of file +name id rawsample sample lane multiplex hto + P8_P16_WT TRUE HTO-11,HTO-15 \ No newline at end of file