diff --git a/Arabidopsis/Lasky2016/FullData.rds b/Arabidopsis/Lasky2016/FullData.rds
new file mode 100644
index 0000000..7d0873f
Binary files /dev/null and b/Arabidopsis/Lasky2016/FullData.rds differ
diff --git a/Arabidopsis/Lasky2016/Lasky2016Processing.R b/Arabidopsis/Lasky2016/Lasky2016Processing.R
new file mode 100644
index 0000000..d076032
--- /dev/null
+++ b/Arabidopsis/Lasky2016/Lasky2016Processing.R
@@ -0,0 +1,84 @@
+# Lasky2016Processing.R
+#
+# Builds FullData.rds: accessions that have phenotypes, home-site climate
+# data and SNP calls, with SNPs recoded to 0/2 and filtered on minor allele
+# frequency. Input and output paths are hard-coded to the original author's
+# machine (~/Downloads/SOO) and must be adjusted to run elsewhere.
+
+# Load inputs ----
+# Ecotype IDs get an "X" prefix before being matched against the SNP table's
+# column names further down.
+AccClim <- read.csv("~/Downloads/SOO/Lasky2012/AccessionClimateData.csv")
+AccClim$ecotype_id <- paste0("X", AccClim$ecotype_id)
+
+CommClim <- read.csv("~/Downloads/SOO/Lasky2012/CommonGard_Clim.csv")
+Pheno <- read.csv("~/Downloads/SOO/Fournier-Level.csv")
+Pheno$ecotype_id <- paste0("X", Pheno$ecotype_id)
+SNPs <- read.csv("~/Downloads/SOO/call_method_75/call_method_75_TAIR9.csv",
+                 header = TRUE, skip = 1)
+
+# Keep shared accessions ----
+# The trailing counts are the original author's notes; they disagree with each
+# other (114 vs 137) and should be re-checked against dim() below.
+AccClim <- AccClim[AccClim$ecotype_id %in% Pheno$ecotype_id, ] # 114
+Pheno <- Pheno[Pheno$ecotype_id %in% AccClim$ecotype_id, ] # 137 accessions
+
+# Columns 1-2 of the SNP table are kept as marker information; the remaining
+# columns are selected by ecotype ID.
+SeqIndx <- which(colnames(SNPs) %in% Pheno$ecotype_id)
+SNPs <- SNPs[c(1, 2, SeqIndx)]
+
+# Order the phenotypes and climate data by the SNP table's ecotype columns
+EcoIds <- colnames(SNPs)[-(1:2)]
+Pheno <- Pheno[match(EcoIds, Pheno$ecotype_id), ]
+AccClim <- AccClim[match(EcoIds, AccClim$ecotype_id), ]
+
+# Fail loudly if the three tables are not row/column aligned.
+stopifnot(
+  identical(as.character(Pheno$ecotype_id), EcoIds),
+  identical(as.character(AccClim$ecotype_id), EcoIds)
+)
+
+dim(Pheno); dim(AccClim); dim(SNPs)
+
+# Recode SNPs ----
+SNPinfo <- SNPs[1:2]
+SNPs <- t(SNPs[3:ncol(SNPs)]) # accessions in rows, SNPs in columns
+
+# Recode allele calls as 0/2: the major (most frequent) allele of each SNP is
+# coded 2 and every other call stays 0.
+# which.max() picks exactly one major allele. Taking all alleles that equal
+# max() and min() instead made both sets identical for a SNP split exactly
+# 50/50, so the whole column was coded 0 and the SNP was then removed by the
+# MAF filter even though its MAF is 0.5.
+SNPmat <- matrix(0, ncol = ncol(SNPs), nrow = nrow(SNPs))
+for (i in seq_len(ncol(SNPs))) {
+  tmpCnts <- table(SNPs[, i])
+  MajAllele <- names(tmpCnts)[which.max(tmpCnts)]
+  SNPmat[SNPs[, i] %in% MajAllele, i] <- 2
+}
+colnames(SNPmat) <- colnames(SNPs)
+row.names(SNPmat) <- row.names(SNPs)
+
+# Filter low-MAF SNPs (MAF < 0.1) ----
+freq <- colMeans(SNPmat) / 2
+maf <- pmin(freq, 1 - freq)
+LowMaf <- maf < 0.1
+maf.index <- which(LowMaf)
+length(maf.index)
+
+# Subset with a logical mask: x[, -integer(0)] selects nothing, so negative
+# indexing would drop every SNP when no SNP falls below the threshold.
+SNPmat <- SNPmat[, !LowMaf, drop = FALSE] # original note: 137 x 200682
+SNPinfo <- SNPinfo[!LowMaf, ] # original note: 200682 x 2
+
+dim(SNPmat); dim(SNPinfo) # original note: 114 168407
+
+# Save ----
+FullData <- list(SNPmat = SNPmat,
+                 MAP = SNPinfo,
+                 AccessionClimData = AccClim,
+                 CommonGardenClimData = CommClim,
+                 Phenotypes = Pheno)
+
+saveRDS(FullData, "~/Downloads/SOO/FullData.rds")
diff --git a/Arabidopsis/Lasky2016/README.md b/Arabidopsis/Lasky2016/README.md
new file mode 100644
index 0000000..a5b1f87
--- /dev/null
+++ b/Arabidopsis/Lasky2016/README.md
@@ -0,0 +1,4 @@
+# Lasky 2016
+This is a dataset from Jesse Lasky's 2016 [paper](https://onlinelibrary.wiley.com/doi/abs/10.1111/1755-0998.12714) that describes an approach to synthesize data from common garden and genome–environment associations to infer SNP effects for local adaptation. The paper uses data from [Fournier-Level et al 2011](https://science.sciencemag.org/content/334/6052/86), [Hancock et al (2011)](https://science.sciencemag.org/content/334/6052/83), and [Horton et al (2012)](https://www.nature.com/articles/ng.1042). I have taken these data and selected accessions that have phenotypic information, climate data at their collection sites, and genotypic data. The .Rds file contains all these data.
+SNPs were obtained from [here](http://bergelson.uchicago.edu/wp-content/uploads/2015/04/call_method_75.tar.gz)
+Phenotypic and climate data were accessed from [here](https://onlinelibrary.wiley.com/doi/abs/10.1111/1755-0998.12714)
diff --git a/README.md b/README.md
index d52945a..e792ec5 100644
--- a/README.md
+++ b/README.md
@@ -15,3 +15,5 @@ The directories and contents are listed below.
 
 * NGR
   * Olatoye_2018
+* Arabidopsis
+  * Lasky 2016