diff --git a/r-scripts/.RData b/r-scripts/.RData
new file mode 100644
index 0000000..13df746
Binary files /dev/null and b/r-scripts/.RData differ
diff --git a/r-scripts/.Rhistory b/r-scripts/.Rhistory
new file mode 100644
index 0000000..57b244e
--- /dev/null
+++ b/r-scripts/.Rhistory
@@ -0,0 +1,512 @@
+prep_word2vec(origin='features.txt',destination = 'vectors.txt',lowercase = T)
+model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+word2vec = read.vectors("feature_vectors.bin")
+View(word2vec)
+head(word2vec)
+dim(word2vec)
+word2vecDf=as.data.frame(word2vec)
+word2vecDf
+head(word2vecDf)
+colnames(word2vecDf)
+word2vecDf.index
+word2vec[1]
+word2vec[0]
+View(word2vecDf)
+?prep_word2vec
+model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+word2vec = read.vectors("feature_vectors.bin")
+word2vecDf=as.data.frame(word2vec)
+View(word2vecDf)
+a=unique(all)
+a=as.data.frame(unique(all))
+View(a)
+word2vecDf[0]
+word2vec[1:67]
+word2vec[1:67]
+word2vec[1]
+word2vec[1,1]
+model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0,sample=0.001,binary=1)
+?word2vec
+?train_word2vec
+model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=5)
+length(all)
+all=tolower(all)
+all=gsub("[[:punct:]]", "", all)
+all
+write(all,'features.txt')
+prep_word2vec(origin='features.txt',destination = 'vectors.txt',lowercase = T)
+model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=5)
+write(all,'features.txt')
+model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=5)
+model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=5)
+write(all,'features.txt')
+model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+concat_features=so_data %>%
+select(IDE,HaveWorkedLanguage,Country,HaveWorkedDatabase,HaveWorkedFramework,HaveWorkedPlatform,WantWorkLanguage,WantWorkFramework,WantWorkDatabase,WantWorkPlatform) %>%
+filter(Country=='Pakistan') %>%
+select(-Country) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(IDE=paste(IDE,";",sep="")) %>%
+mutate(HaveWorkedLanguage=paste(HaveWorkedLanguage,";",sep="")) %>%
+mutate(HaveWorkedDatabase=paste(HaveWorkedDatabase,";",sep="")) %>%
+mutate(HaveWorkedFramework=paste(HaveWorkedFramework,";",sep="")) %>%
+mutate(WantWorkLanguage=paste(WantWorkLanguage,";",sep="")) %>%
+mutate(WantWorkDatabase=paste(WantWorkDatabase,";",sep="")) %>%
+mutate(WantWorkFramework=paste(WantWorkFramework,";",sep="")) %>%
+mutate(concat_have=paste(IDE,HaveWorkedLanguage,HaveWorkedDatabase,HaveWorkedFramework,HaveWorkedPlatform)) %>%
+mutate(concat_have=gsub(" ","",concat_have)) %>%
+mutate(concat_want=paste(IDE,WantWorkLanguage,WantWorkDatabase,WantWorkFramework,WantWorkPlatform)) %>%
+mutate(concat_want=gsub(" ","",concat_want))
+vectorAllWant=concat_features %>%
+pull(concat_want)
+vectorAllHave=concat_features %>%
+pull(concat_have)
+embeddingAllWant=unlist(strsplit(as.character(vectorAllWant),';'))
+embeddingAllHave=unlist(strsplit(as.character(vectorAllHave),';'))
+all=c(embeddingAllWant,embeddingAllHave)
+doc=Corpus(VectorSource(all))
+dtm <- TermDocumentMatrix(doc)
+m <- as.matrix(dtm)
+v <- sort(rowSums(m),decreasing=TRUE)
+d <- data.frame(word = names(v),freq=v)
+set.seed(1234)
+wordcloud(words = d$word, freq = d$freq, min.freq = 1,
+max.words=200, random.order=FALSE, rot.per=0.35,
+colors=brewer.pal(8, "Dark2"))
+write(all,'features.txt')
+model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+?tcm
+install.packages('text2vec')
+?create_tcm
+library(text2vec)
+?create_tcm
+v = create_vocabulary(all)
+View(v)
+vectorizer = vocab_vectorizer(v, grow_dtm = F, skip_grams_window = 5)
+?vocab_vectorizer
+vectorizer = vocab_vectorizer(v, grow_dtm = F, skip_grams_window = 5)
+vectorizer = vocab_vectorizer(v)
+vectorizer
+tcm=create_tcm(all,vectorizer)
+?create_tcm
+it = itoken(all)
+it
+tcm=create_tcm(it,vectorizer)
+?itoken
+all
+unlist(all)
+as.data.frame(all)
+tmp=as.data.frame(all)
+View(tmp)
+it = itoken(tmp$all)
+View(tmp)
+tmp = tmp %>% mutate(all=as.character(all))
+it = itoken(tmp$all)
+it
+v = create_vocabulary(it)
+View(v)
+vectorizer = vocab_vectorizer(v)
+tcm=create_tcm(it,vectorizer)
+tcm
+fit <- glove(tcm = tcm,
+word_vectors_size = 50,
+x_max = 10, learning_rate = 0.2,
+num_iters = 15)
+fit <- GloVe(tcm = tcm,
+word_vectors_size = 50,
+x_max = 10, learning_rate = 0.2,
+num_iters = 15)
+fit <- glove(tcm = tcm,
+word_vectors_size = 50,
+x_max = 10, learning_rate = 0.2,
+num_iters = 15)
+vectorizer = vocab_vectorizer(v, grow_dtm = F, skip_grams_window = 5)
+vectorizer = vocab_vectorizer(v)
+vectorizer
+word_array
+clear
+tmp=as.data.frame(all)
+tmp = tmp %>% mutate(all=as.character(all)) %>% tolower(.)
+tmp
+it = itoken(tmp$all)
+tmp=as.data.frame(all)
+tmp = tmp %>% mutate(all=as.character(all)) %>% mutate(all=tolower(all))
+it = itoken(tmp$all)
+v = create_vocabulary(it)
+v
+vectorizer = vocab_vectorizer(v)
+tcm=create_tcm(it,vectorizer)
+v = create_vocabulary(it)  prune_vocabulary(term_count_min = 5)
+v = create_vocabulary(it) %>%  prune_vocabulary(term_count_min = 5)
+vectorizer = vocab_vectorizer(v)
+tcm=create_tcm(it,vectorizer)
+vectorizer = vocab_vectorizer(v,skip_grams_window = 5)
+?crate_tcm
+?create_tcm
+tcm=create_tcm(it,vectorizer,skip_grams_window=5)
+tcm=create_tcm(it,vectorizer,skip_grams_window=5L)
+?create_tcm
+tcm
+word2vec
+word2vec[1:67]
+word2vec[1:66]
+?plot
+??Rtsne
+Rtsne(word2vec)
+reduction <- Rtsne(as.matrix(word2vec), dims = 2, initial_dims = 50,
+perplexity = 300, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = F,
+is_distance = F, Y_init = NULL)
+word2vec
+word2vecDf=as.data.frame(word2vec)
+reduction <- Rtsne(as.matrix(word2vecdf), dims = 2, initial_dims = 50,
+perplexity = 300, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = F,
+is_distance = F, Y_init = NULL)
+reduction <- Rtsne(as.matrix(word2vec), dims = 2, initial_dims = 300,
+perplexity = 300, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = F,
+is_distance = F, Y_init = NULL)
+reduction <- Rtsne(as.matrix(word2vec), dims = 2, initial_dims = 50,
+perplexity = 21, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = F,
+is_distance = F, Y_init = NULL)
+reduction
+df <- as.data.frame(reduction$Y)
+View(df)
+rows <- rownames(word2vec)
+rows
+rownames(df) <- rows
+View(df)
+View(df)
+df % filter(rownames(df) == '</s>')
+df %>% filter(rownames(df) == '</s>')
+df %>% filter(rownames(df) != '</s>')
+df=df %>% filter(rownames(df) != '</s>')
+View(df)
+rows
+rows[2:]
+rows[2:66]
+rows[2:67]
+rows[2:68]
+rows[2:67]
+rownames(df) <- rows[2:67]
+View(df)
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model ", ref_name," using t_SNE"))
+ggsave(paste0(ref_name, ".jpeg"), path = path, width = 24,
+height = 18, dpi = 100)
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+ggsave(paste0(ref_name, ".jpeg"), path = path, width = 24,
+height = 18, dpi = 100)
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+ggsave(paste0("without_lower", ".jpeg"), path = path, width = 24,
+height = 18, dpi = 100)
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+prep_word2vec(origin='features.txt',destination = 'vectors.txt',lowercase = T)
+model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+word2vec = read.vectors("feature_vectors.bin")
+reduction <- Rtsne(as.matrix(word2vec), dims = 2, initial_dims = 50,
+perplexity = 21, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = F,
+is_distance = F, Y_init = NULL)
+df <- as.data.frame(reduction$Y)
+rows <- rownames(word2vec)
+rownames(df) <- rows
+df=df %>% filter(rownames(df) != '</s>')
+rownames(df) <- rows[2:67]
+# Create t-SNE plot and save as jpeg
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+model
+model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+word2vec = read.vectors("feature_vectors.bin")
+word2vec
+reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50,
+perplexity = 21, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = F,
+is_distance = F, Y_init = NULL)
+df <- as.data.frame(reduction$Y)
+rows <- rownames(word2vec)
+rownames(df) <- rows
+df=df %>% filter(rownames(df) != '</s>')
+rownames(df) <- rows[2:67]
+rows[2:65]
+rows[2:66]
+df=df %>% filter(rownames(df) != '</s>')
+rownames(df) <- rows[2:65]
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50,
+perplexity = 21, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = F,
+is_distance = F, Y_init = NULL)
+df <- as.data.frame(reduction$Y)
+rows <- rownames(word2vec)
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+rows <- rownames(word2vec)
+rownames(df) <- rows
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+word2vec %>% closest_to('vim')
+model
+model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+model
+rownames(model)
+rownames(model)[2:65]
+model[2:65]
+model[:,2:65]
+model[,2:65]
+model[2:65,]
+model
+model[2:65,]
+word2vec=model[2:65,]  # remove at end of string
+word2vec %>% closest_to('vim')
+word2vec %>% closest_to('aws')
+reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50,
+perplexity = 21, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = F,
+is_distance = F, Y_init = NULL)
+df <- as.data.frame(reduction$Y)
+rows <- rownames(word2vec)
+rownames(df) <- rows
+#df=df %>% filter(rownames(df) != '</s>')
+#rownames(df) <- rows[2:65]
+# Create t-SNE plot and save as jpeg
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+save(ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")),'a.png')
+ggsave('s_removed.png')
+word2vec = read.vectors("feature_vectors.bin")
+reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50,
+perplexity = 21, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = F,
+is_distance = F, Y_init = NULL)
+df <- as.data.frame(reduction$Y)
+rows <- rownames(word2vec)
+rownames(df) <- rows
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+ggsave('s_removed.png')
+word2vec=model[2:65,]  # remove at end of string
+# Create t-SNE plot and save as jpeg
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+ggsave('s_removed.png')
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+prep_word2vec(origin='features.txt',destination = 'vectors.txt',lowercase = T)
+model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+word2vec=model[2:65,]  # remove at end of string
+prep_word2vec(origin='features.txt',destination = 'vectors.txt',lowercase = T)
+model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+word2vec=model[2:65,]  # remove at end of string
+word2vec
+reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50,
+perplexity = 21, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = F,
+is_distance = F, Y_init = NULL)
+df <- as.data.frame(reduction$Y)
+rows <- rownames(word2vec)
+rownames(df) <- rows
+#df=df %>% filter(rownames(df) != '</s>')
+#rownames(df) <- rows[2:65]
+# Create t-SNE plot and save as jpeg
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+ggsave('s_removed.png')
+word2vec = read.vectors("feature_vectors.bin")
+word2vec
+reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50,
+perplexity = 21, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = F,
+is_distance = F, Y_init = NULL)
+df <- as.data.frame(reduction$Y)
+rows <- rownames(word2vec)
+rownames(df) <- rows
+#df=df %>% filter(rownames(df) != '</s>')
+#rownames(df) <- rows[2:65]
+# Create t-SNE plot and save as jpeg
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+ggsave('s_not_removed.png')
+wordcloud(words = d$word, freq = d$freq, min.freq = 1,
+max.words=200, random.order=FALSE, rot.per=0.35,
+colors=brewer.pal(8, "Dark2"))
+prep_word2vec(origin='features.txt',destination = 'vectors.txt',lowercase = T)
+model = train_word2vec("vectors.txt",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+word2vec=model[2:65,]  # remove at end of string
+?Rtsne
+reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50,
+perplexity = 21, theta = 0, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = T,
+is_distance = F, Y_init = NULL)
+df <- as.data.frame(reduction$Y)
+rows <- rownames(word2vec)
+rownames(df) <- rows
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50,
+perplexity = 21, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = T,
+is_distance = F, Y_init = NULL)
+df <- as.data.frame(reduction$Y)
+rows <- rownames(word2vec)
+rownames(df) <- rows
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+ggsave('wordembeddings.png')
+wordcloud(words = d$word, freq = d$freq, min.freq = 1,
+max.words=200, random.order=FALSE, rot.per=0.35,
+colors=brewer.pal(8, "Dark2"))
+write( embeddingAllHave,'have_features.txt')
+prep_word2vec(origin='have_features.txt',destination = 'vectors.txt',lowercase = T)
+model = train_word2vec("vectors.txt",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+word2vec=model[2:65,]  # remove at end of string
+#tsne (dimensionality reduction)
+reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50,
+perplexity = 21, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = T,
+is_distance = F, Y_init = NULL)
+df <- as.data.frame(reduction$Y)
+rows <- rownames(word2vec)
+rownames(df) <- rows
+#df=df %>% filter(rownames(df) != '</s>')
+#rownames(df) <- rows[2:65]
+# Create t-SNE plot and save as jpeg
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+ggsave('wordembeddings_have.png')
diff --git a/r-scripts/ExploratoryAnalysis.R b/r-scripts/ExploratoryAnalysis.R
new file mode 100644
index 0000000..cc96d08
--- /dev/null
+++ b/r-scripts/ExploratoryAnalysis.R
@@ -0,0 +1,342 @@
+library(dplyr)
+library(ggplot2)
+library(stringr)
+library(jsonlite)
+library(data.table)
+library(reshape2)
+library(tidyr)
+
+# Job satisfaction by developer type: Pakistan compared with other major IT countries (tested)
+
+jobDf <- so_data %>%
+  select(DeveloperType, JobSatisfaction, Country) %>%
+  filter(Country %in% c("Pakistan", "China", "United States", "India")) %>%
+  mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+  unnest(DeveloperType) %>% # one row per (respondent, developer type)
+  na.omit() %>%
+  mutate(DeveloperType = trimws(DeveloperType)) %>%
+  group_by(DeveloperType, Country) %>%
+  summarise(JobSatisfaction = sum(JobSatisfaction)) # replaces deprecated summarise_all(funs(sum))
+
+# NOTE(review): `value` is the summed JobSatisfaction score per group, not a respondent count.
+setDT(jobDf)
+jobDf_melt <- melt(jobDf)
+ggplot(jobDf_melt, aes(x = DeveloperType, y = value, fill = Country)) + # bare column names inside aes()
+  theme(axis.text.x = element_text(face = "bold", color = "#993333", size = 8, angle = 90)) +
+  ggtitle("Job Satisfaction of Developers") +
+  geom_bar(stat = "identity") +
+  xlab("Developer type") +
+  ylab("Number of Respondents")
+
+# Serialise the aggregated table to JSON.
+exportJson2 <- jobDf %>% toJSON(pretty = TRUE)
+
+write(exportJson2, "Job_Satisfaction_Of_Each_Developer_Type_In_4_Big_IT-Countires.json")
+
+
+
+# Relation between "have worked" and "want to work" languages among developers in Pakistan (tested)
+
+
+# Tally ";"-separated survey answers for respondents from Pakistan.
+#
+# data:   survey data frame with a `Country` column.
+# column: name (string) of the multi-answer column to tally.
+# Returns a two-column data frame (answer, frequency); callers rename the columns.
+count_pakistan_answers <- function(data, column) {
+  in_pakistan <- !is.na(data$Country) & data$Country == "Pakistan"
+  answers <- as.character(data[[column]][in_pakistan])
+  # drop missing answers, then strip spaces so "a; b" splits into "a" and "b"
+  answers <- gsub(" ", "", answers[!is.na(answers)])
+  answers %>%
+    str_c(collapse = ";") %>%
+    str_split(";") %>%
+    unlist() %>%
+    table() %>%
+    as.data.frame()
+}
+
+haveWorked <- count_pakistan_answers(so_data, "HaveWorkedLanguage")
+haveWant <- count_pakistan_answers(so_data, "WantWorkLanguage")
+
+colnames(haveWorked) <- c("language", "worked")
+colnames(haveWant) <- c("language", "want")
+# Keep only the languages that appear in both tallies.
+language_join <- inner_join(haveWant, haveWorked, by = "language")
+
+# The red y = x line separates languages more wanted than used (above) from the rest (below).
+# (The original `intercept = 45` drew y = x + 45 instead of a 45-degree reference line.)
+language_join %>%
+  ggplot(aes(y = want, x = worked, color = language)) +
+  geom_point() +
+  ggrepel::geom_label_repel(aes(label = language)) +
+  theme(legend.position = "none") +
+  geom_abline(slope = 1, intercept = 0, color = "red") +
+  labs(title = "Have worked and want to work(Language) in Pakistan")
+
+# Export the joined tallies as JSON.
+exportJson2 <- language_join %>% toJSON(pretty = TRUE)
+
+write(exportJson2, "Relaton_between_haveWorked_and_WantToWork_with_Languages_in_pakistan.json")
+
+
+# Relation between job satisfaction and working years in Pakistan (tested)
+
+career_satisfied <- so_data %>%
+  select(YearsCodedJob, JobSatisfaction, Country) %>%
+  filter(Country == "Pakistan") %>%
+  mutate(YearsCodedJob = as.character(YearsCodedJob)) %>%
+  na.omit() %>%
+  # fold "Less than a year" into the "1 year" label
+  mutate(YearsCodedJob = ifelse(YearsCodedJob == "Less than a year", "1 year", YearsCodedJob))
+
+# Keep the first two characters of each label, trimmed and parsed as an integer ("1 year" -> 1).
+# Plain assignment replaces `%<>%`: magrittr is not attached, and dplyr only re-exports `%>%`.
+career_satisfied$YearsCodedJob <- career_satisfied$YearsCodedJob %>%
+  str_sub(1, 2) %>%
+  str_trim() %>%
+  as.integer()
+
+# Mean job satisfaction per years-of-experience bucket.
+# `df` is computed once and reused for both the plot and the export.
+df <- career_satisfied %>%
+  arrange(YearsCodedJob) %>%
+  group_by(YearsCodedJob) %>%
+  summarise(JobSatisfaction = mean(JobSatisfaction))
+
+ggplot(df, aes(x = YearsCodedJob, y = JobSatisfaction)) +
+  geom_point() +
+  geom_smooth(level = 0, method = "loess") +
+  labs(title = "The relation between jobsatisfaction and working years in Pakistan")
+
+# Export the same per-bucket means as JSON.
+exportJson <- df %>% toJSON(pretty = TRUE)
+write(exportJson, "Pak_JobSatisfaction.json")
+  
+  
+
+# Relation between Learning New Tech and working years in Pakistan (doubtful)
+
+learningnew_tech <- so_data %>%
+  select(YearsCodedJob, LearningNewTech, Country) %>%
+  filter(Country == "Pakistan") %>%
+  mutate(YearsCodedJob = as.character(YearsCodedJob)) %>%
+  na.omit() %>%
+  mutate(YearsCodedJob = ifelse(YearsCodedJob == "Less than a year", "1 year", YearsCodedJob)) %>%
+  # 1 when the respondent agrees to any degree, 0 otherwise
+  mutate(LearningNewTech = ifelse(LearningNewTech %in% c("Agree", "Strongly agree", "Somewhat agree"), 1, 0))
+
+# Plain assignment replaces `%<>%`: magrittr is not attached, and dplyr only re-exports `%>%`.
+learningnew_tech$YearsCodedJob <- learningnew_tech$YearsCodedJob %>%
+  str_sub(1, 2) %>%
+  str_trim() %>%
+  as.integer()
+
+learningnew_tech %>%
+  arrange(YearsCodedJob) %>%
+  group_by(YearsCodedJob) %>%
+  summarise(learning_new_tech = mean(LearningNewTech)) %>%
+  ggplot(aes(x = YearsCodedJob, y = learning_new_tech)) +
+  geom_point() +
+  geom_smooth(level = 0, method = "loess") +
+  labs(title = "The relation between Learning New Tech  and working years in Pakistan")
+
+# NOTE(review): the export holds the SUM (count of agreeing respondents); the plot above shows the mean.
+df2 <- learningnew_tech %>%
+  arrange(YearsCodedJob) %>%
+  group_by(YearsCodedJob) %>%
+  summarise(learning_new_tech = sum(LearningNewTech))
+
+exportJson2 <- df2 %>% toJSON(pretty = TRUE)
+write(exportJson2, "Pak_LearningNewTech.json")
+  
+# Most popular languages among students in Pakistan (tested)
+
+language_students <- so_data %>%
+  filter(Country == "Pakistan") %>%
+  filter(Professional == "Student") %>%
+  select(Professional, HaveWorkedLanguage) %>%
+  na.omit() %>%
+  mutate_if(is.factor, as.character) %>%
+  mutate(HaveWorkedLanguage = gsub(" ", "", HaveWorkedLanguage)) %>%
+  pull(HaveWorkedLanguage) %>%
+  # join every answer into one ";"-separated string, then split it into individual languages
+  str_c(collapse = ";") %>%
+  str_split(";") %>%
+  unlist() %>%
+  table() %>%
+  as.data.frame()
+
+colnames(language_students) <- c("language", "count")
+language_students <- language_students %>%
+  mutate(language = as.character(language))
+
+# The x axis shows languages; the original label "Developer type" was a copy-paste leftover.
+ggplot(language_students, aes(x = language, y = count)) +
+  geom_bar(stat = "identity") +
+  xlab("Language") +
+  ylab("Number of Students") +
+  labs(title = "Popular Languages among Students in Pakistan ")
+
+exportJson2 <- language_students %>% toJSON(pretty = TRUE)
+write(exportJson2, "Languages_used_by_students_in_pakistan.json")
+    
+    
+# Languages most used by female developers in Pakistan (tested)
+
+language <- so_data %>%
+  filter(grepl("Female", Gender)) %>%
+  filter(Country == "Pakistan") %>%
+  select(Gender, HaveWorkedLanguage) %>%
+  na.omit() %>%
+  mutate_if(is.factor, as.character) %>%
+  mutate(HaveWorkedLanguage = gsub(" ", "", HaveWorkedLanguage)) %>%
+  pull(HaveWorkedLanguage) %>%
+  str_c(collapse = ";") %>%
+  str_split(";") %>%
+  unlist() %>%
+  table() %>%
+  as.data.frame()
+
+colnames(language) <- c("language", "count")
+
+# Already one row per language with its count; the dangling group_by()/summarise() was dropped.
+female_language <- language %>%
+  mutate(language = as.character(language))
+
+# Pin the factor levels to the current row order so the plot keeps that order.
+female_language$language <- factor(female_language$language, levels = female_language$language)
+
+exportJson2 <- female_language %>% toJSON(pretty = TRUE)
+
+write(exportJson2, "Lamguages_used_by_females_in_pakistan.json")
+
+ggplot(female_language, aes(x = language, y = count, fill = language)) +
+  geom_bar(width = 0.85, stat = "identity") +
+  coord_polar(theta = "y") +
+  xlab("") + ylab("") +
+  ylim(c(0, 30)) +
+  geom_text(data = female_language, hjust = 1, size = 3, aes(x = language, y = 0, label = language)) +
+  theme(legend.position = "right", axis.text.y = element_blank(), axis.ticks = element_blank()) +
+  labs(title = "Most used languages by females in Pakistan")
+
+
+# Unemployment of each developer type in Pakistan and other major IT countries (tested)
+
+# NOTE: express unemployment as a % of Total_Count; the number of developers per type varies by country.
+
+# Respondents per (Country, DeveloperType); a respondent listing several types counts once per type.
+count_dev <- so_data %>%
+  select(Country, DeveloperType) %>%
+  filter(Country %in% c("Pakistan", "China", "United States", "India")) %>%
+  na.omit() %>%
+  mutate_if(is.factor, as.character) %>%
+  mutate(DeveloperType = gsub(" ", "", DeveloperType)) %>%
+  mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+  unnest(DeveloperType) %>%
+  mutate(DeveloperType = trimws(DeveloperType)) %>%
+  # no gather() here: it folded both columns into key/value pairs and destroyed the Country column
+  group_by(Country, DeveloperType) %>%
+  filter(DeveloperType != "Other") %>%
+  summarise(Total_Count = n())
+
+# Same tally restricted to respondents who are not employed (looking for work or not).
+unemployment_dev <- so_data %>%
+  select(Country, DeveloperType, EmploymentStatus) %>%
+  filter(
+    Country %in% c("Pakistan", "China", "United States", "India"),
+    EmploymentStatus %in% c("Not employed, and not looking for work", "Not employed, but looking for work")
+  ) %>%
+  na.omit() %>%
+  mutate_if(is.factor, as.character) %>%
+  mutate(DeveloperType = gsub(" ", "", DeveloperType)) %>%
+  mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+  unnest(DeveloperType) %>%
+  mutate(DeveloperType = trimws(DeveloperType)) %>%
+  select(-EmploymentStatus) %>%
+  group_by(Country, DeveloperType) %>%
+  filter(DeveloperType != "Other") %>%
+  summarise(Unemployment_count = n())
+
+unemployment <- count_dev %>%
+  left_join(unemployment_dev, by = c("Country", "DeveloperType"))
+
+# Groups with no unemployed respondents have no match in the join; count them as 0.
+unemployment[is.na(unemployment)] <- 0
+
+# The data frame is prepared; it just needs to be plotted here.
+setDT(unemployment)
+unemployment_melt <- melt(unemployment)
+ggplot(unemployment_melt, aes(x = DeveloperType, y = value, fill = variable)) +
+  theme(axis.text.x = element_text(face = "bold", color = "#993333", size = 8, angle = 90)) +
+  ggtitle("Unemployment among Developers across the world") +
+  geom_bar(stat = "identity") +
+  xlab("Developer type") +
+  ylab("Number of Respondents")
+  
+  
+  
+  
+  
+  
+  
+
+  # Gender pay gap in the countries with the most respondents (test it)
+
+  # The 20 countries with the most respondents (all genders counted).
+  top_countries <- so_data %>%
+    group_by(Country) %>%
+    summarise(Count = n()) %>%
+    arrange(desc(Count)) %>%
+    head(20)
+  top_20C <- top_countries %>% pull(Country)
+
+  # Respondents answering exactly "Male" / exactly "Female" in each of those countries.
+  country_mf <- so_data %>%
+    filter(Country %in% top_20C) %>%
+    group_by(Country) %>%
+    summarise(
+      Males = sum(!is.na(Gender) & Gender == "Male"),
+      Females = sum(!is.na(Gender) & Gender == "Female")
+    ) %>%
+    mutate(Total = as.integer(Males) + as.integer(Females))
+
+  # Mean salary per (Gender, Country) for Male/Female respondents of the top-20 countries.
+  pay_gap <- so_data %>%
+    select(Country, Gender, Salary) %>%
+    na.omit() %>%
+    filter(Country %in% top_20C, Gender %in% c("Male", "Female")) %>%
+    mutate(Country = as.character(Country), Gender = as.character(Gender)) %>%
+    group_by(Gender, Country) %>%
+    summarise(Salary = mean(Salary)) # replaces deprecated summarise_all(funs(mean))
+
+  # Absolute difference between the gender means of each country.
+  pay_gapDt <- as.data.table(pay_gap)
+  setkey(pay_gapDt, Country) # sort by country so a country's rows are adjacent
+  pay_gapDt <- pay_gapDt %>% select(Country, Salary)
+  # Per country: the first row gets NA, each later row the absolute difference to the row before it.
+  pay_gapDt[, Pay_Gap := c(NA, abs(diff(Salary))), by = Country]
+  pay_gapDt <- pay_gapDt %>%
+    select(Country, Pay_Gap) %>%
+    na.omit() %>%
+    arrange(desc(Pay_Gap))
+
+  genderPayGap <- pay_gapDt %>%
+    inner_join(country_mf, by = "Country") %>%
+    mutate(Ratio_of_female_devs_percentage = as.double(Females / Total * 100))
+
+  exportJson2 <- genderPayGap %>%
+    select(Country, Males, Females, Ratio_of_female_devs_percentage) %>%
+    toJSON(pretty = TRUE)
+  write(exportJson2, "%_Female_dev_in_countries_with_highest_no_of_respondents.json")
+
+  exportJson2 <- genderPayGap %>%
+    select(Country, Males, Females, Pay_Gap) %>%
+    toJSON(pretty = TRUE)
+  write(exportJson2, "Avg_pay_gap_in_countries_with_highest_no_of_respondents.json")
+
+  # TODO: draw two maps here:
+  #   one showing gender equality in the work force,
+  #   the other showing the gender pay gap.
+      
+    
diff --git a/r-scripts/stackoverflow_json_files/%_Female_dev_in_countries_with_highest_no_of_respondents.json b/r-scripts/stackoverflow_json_files/%_Female_dev_in_countries_with_highest_no_of_respondents.json
new file mode 100644
index 0000000..f5b3c66
--- /dev/null
+++ b/r-scripts/stackoverflow_json_files/%_Female_dev_in_countries_with_highest_no_of_respondents.json
@@ -0,0 +1,122 @@
+[
+  {
+    "Country": "Australia",
+    "Males": 634,
+    "Females": 40,
+    "Ratio_of_female_devs_percentage": 5.9347
+  },
+  {
+    "Country": "Israel",
+    "Males": 346,
+    "Females": 40,
+    "Ratio_of_female_devs_percentage": 10.3627
+  },
+  {
+    "Country": "Netherlands",
+    "Males": 598,
+    "Females": 29,
+    "Ratio_of_female_devs_percentage": 4.6252
+  },
+  {
+    "Country": "Canada",
+    "Males": 1446,
+    "Females": 150,
+    "Ratio_of_female_devs_percentage": 9.3985
+  },
+  {
+    "Country": "Brazil",
+    "Males": 491,
+    "Females": 25,
+    "Ratio_of_female_devs_percentage": 4.845
+  },
+  {
+    "Country": "Romania",
+    "Males": 362,
+    "Females": 29,
+    "Ratio_of_female_devs_percentage": 7.4169
+  },
+  {
+    "Country": "United States",
+    "Males": 7447,
+    "Females": 847,
+    "Ratio_of_female_devs_percentage": 10.2122
+  },
+  {
+    "Country": "United Kingdom",
+    "Males": 2954,
+    "Females": 221,
+    "Ratio_of_female_devs_percentage": 6.9606
+  },
+  {
+    "Country": "Spain",
+    "Males": 518,
+    "Females": 41,
+    "Ratio_of_female_devs_percentage": 7.3345
+  },
+  {
+    "Country": "France",
+    "Males": 1094,
+    "Females": 64,
+    "Ratio_of_female_devs_percentage": 5.5268
+  },
+  {
+    "Country": "Poland",
+    "Males": 806,
+    "Females": 98,
+    "Ratio_of_female_devs_percentage": 10.8407
+  },
+  {
+    "Country": "India",
+    "Males": 2422,
+    "Females": 230,
+    "Ratio_of_female_devs_percentage": 8.6727
+  },
+  {
+    "Country": "Switzerland",
+    "Males": 398,
+    "Females": 13,
+    "Ratio_of_female_devs_percentage": 3.163
+  },
+  {
+    "Country": "Italy",
+    "Males": 515,
+    "Females": 16,
+    "Ratio_of_female_devs_percentage": 3.0132
+  },
+  {
+    "Country": "Russian Federation",
+    "Males": 542,
+    "Females": 31,
+    "Ratio_of_female_devs_percentage": 5.4101
+  },
+  {
+    "Country": "Pakistan",
+    "Males": 231,
+    "Females": 15,
+    "Ratio_of_female_devs_percentage": 6.0976
+  },
+  {
+    "Country": "Iran",
+    "Males": 260,
+    "Females": 13,
+    "Ratio_of_female_devs_percentage": 4.7619
+  },
+  {
+    "Country": "Germany",
+    "Males": 2740,
+    "Females": 153,
+    "Ratio_of_female_devs_percentage": 5.2886
+  },
+  {
+    "Country": "Austria",
+    "Males": 322,
+    "Females": 23,
+    "Ratio_of_female_devs_percentage": 6.6667
+  },
+  {
+    "Country": "Sweden",
+    "Males": 418,
+    "Females": 18,
+    "Ratio_of_female_devs_percentage": 4.1284
+  }
+]
diff --git a/r-scripts/stackoverflow_json_files/Avg_pay_gap_in_countries_with_highest_no_of_respondents.json b/r-scripts/stackoverflow_json_files/Avg_pay_gap_in_countries_with_highest_no_of_respondents.json
new file mode 100644
index 0000000..ff06a22
--- /dev/null
+++ b/r-scripts/stackoverflow_json_files/Avg_pay_gap_in_countries_with_highest_no_of_respondents.json
@@ -0,0 +1,122 @@
+[
+  {
+    "Country": "Australia",
+    "Males": 634,
+    "Females": 40,
+    "Pay_Gap": 10737.1317
+  },
+  {
+    "Country": "Israel",
+    "Males": 346,
+    "Females": 40,
+    "Pay_Gap": 9251.1895
+  },
+  {
+    "Country": "Netherlands",
+    "Males": 598,
+    "Females": 29,
+    "Pay_Gap": 9246.5287
+  },
+  {
+    "Country": "Canada",
+    "Males": 1446,
+    "Females": 150,
+    "Pay_Gap": 8815.6772
+  },
+  {
+    "Country": "Brazil",
+    "Males": 491,
+    "Females": 25,
+    "Pay_Gap": 8303.0236
+  },
+  {
+    "Country": "Romania",
+    "Males": 362,
+    "Females": 29,
+    "Pay_Gap": 7984.4905
+  },
+  {
+    "Country": "United States",
+    "Males": 7447,
+    "Females": 847,
+    "Pay_Gap": 7472.3785
+  },
+  {
+    "Country": "United Kingdom",
+    "Males": 2954,
+    "Females": 221,
+    "Pay_Gap": 6837.036
+  },
+  {
+    "Country": "Spain",
+    "Males": 518,
+    "Females": 41,
+    "Pay_Gap": 6231.8437
+  },
+  {
+    "Country": "France",
+    "Males": 1094,
+    "Females": 64,
+    "Pay_Gap": 5134.0087
+  },
+  {
+    "Country": "Poland",
+    "Males": 806,
+    "Females": 98,
+    "Pay_Gap": 4981.3313
+  },
+  {
+    "Country": "India",
+    "Males": 2422,
+    "Females": 230,
+    "Pay_Gap": 4882.4146
+  },
+  {
+    "Country": "Switzerland",
+    "Males": 398,
+    "Females": 13,
+    "Pay_Gap": 4543.6383
+  },
+  {
+    "Country": "Italy",
+    "Males": 515,
+    "Females": 16,
+    "Pay_Gap": 4349.3793
+  },
+  {
+    "Country": "Russian Federation",
+    "Males": 542,
+    "Females": 31,
+    "Pay_Gap": 2943.8532
+  },
+  {
+    "Country": "Pakistan",
+    "Males": 231,
+    "Females": 15,
+    "Pay_Gap": 2832.3967
+  },
+  {
+    "Country": "Iran",
+    "Males": 260,
+    "Females": 13,
+    "Pay_Gap": 2130.0627
+  },
+  {
+    "Country": "Germany",
+    "Males": 2740,
+    "Females": 153,
+    "Pay_Gap": 2116.6578
+  },
+  {
+    "Country": "Austria",
+    "Males": 322,
+    "Females": 23,
+    "Pay_Gap": 1529.8644
+  },
+  {
+    "Country": "Sweden",
+    "Males": 418,
+    "Females": 18,
+    "Pay_Gap": 1270.0568
+  }
+]
diff --git a/r-scripts/stackoverflow_json_files/Job_Satisfaction_Of_Each_Developer_Type_In_4_Big_IT-Countires.json b/r-scripts/stackoverflow_json_files/Job_Satisfaction_Of_Each_Developer_Type_In_4_Big_IT-Countires.json
new file mode 100644
index 0000000..b80226a
--- /dev/null
+++ b/r-scripts/stackoverflow_json_files/Job_Satisfaction_Of_Each_Developer_Type_In_4_Big_IT-Countires.json
@@ -0,0 +1,282 @@
+[
+  {
+    "DeveloperType": "Database administrator",
+    "Country": "China",
+    "JobSatisfaction": 95
+  },
+  {
+    "DeveloperType": "Database administrator",
+    "Country": "India",
+    "JobSatisfaction": 2391
+  },
+  {
+    "DeveloperType": "Database administrator",
+    "Country": "Pakistan",
+    "JobSatisfaction": 359
+  },
+  {
+    "DeveloperType": "Database administrator",
+    "Country": "United States",
+    "JobSatisfaction": 8258
+  },
+  {
+    "DeveloperType": "Data scientist",
+    "Country": "China",
+    "JobSatisfaction": 101
+  },
+  {
+    "DeveloperType": "Data scientist",
+    "Country": "India",
+    "JobSatisfaction": 1452
+  },
+  {
+    "DeveloperType": "Data scientist",
+    "Country": "Pakistan",
+    "JobSatisfaction": 160
+  },
+  {
+    "DeveloperType": "Data scientist",
+    "Country": "United States",
+    "JobSatisfaction": 4955
+  },
+  {
+    "DeveloperType": "Desktop applications developer",
+    "Country": "China",
+    "JobSatisfaction": 223
+  },
+  {
+    "DeveloperType": "Desktop applications developer",
+    "Country": "India",
+    "JobSatisfaction": 3731
+  },
+  {
+    "DeveloperType": "Desktop applications developer",
+    "Country": "Pakistan",
+    "JobSatisfaction": 570
+  },
+  {
+    "DeveloperType": "Desktop applications developer",
+    "Country": "United States",
+    "JobSatisfaction": 15897
+  },
+  {
+    "DeveloperType": "Developer with a statistics or mathematics background",
+    "Country": "China",
+    "JobSatisfaction": 114
+  },
+  {
+    "DeveloperType": "Developer with a statistics or mathematics background",
+    "Country": "India",
+    "JobSatisfaction": 2145
+  },
+  {
+    "DeveloperType": "Developer with a statistics or mathematics background",
+    "Country": "Pakistan",
+    "JobSatisfaction": 250
+  },
+  {
+    "DeveloperType": "Developer with a statistics or mathematics background",
+    "Country": "United States",
+    "JobSatisfaction": 6799
+  },
+  {
+    "DeveloperType": "DevOps specialist",
+    "Country": "China",
+    "JobSatisfaction": 117
+  },
+  {
+    "DeveloperType": "DevOps specialist",
+    "Country": "India",
+    "JobSatisfaction": 1250
+  },
+  {
+    "DeveloperType": "DevOps specialist",
+    "Country": "Pakistan",
+    "JobSatisfaction": 116
+  },
+  {
+    "DeveloperType": "DevOps specialist",
+    "Country": "United States",
+    "JobSatisfaction": 7407
+  },
+  {
+    "DeveloperType": "Embedded applications/devices developer",
+    "Country": "China",
+    "JobSatisfaction": 93
+  },
+  {
+    "DeveloperType": "Embedded applications/devices developer",
+    "Country": "India",
+    "JobSatisfaction": 1087
+  },
+  {
+    "DeveloperType": "Embedded applications/devices developer",
+    "Country": "Pakistan",
+    "JobSatisfaction": 77
+  },
+  {
+    "DeveloperType": "Embedded applications/devices developer",
+    "Country": "United States",
+    "JobSatisfaction": 5054
+  },
+  {
+    "DeveloperType": "Graphic designer",
+    "Country": "China",
+    "JobSatisfaction": 22
+  },
+  {
+    "DeveloperType": "Graphic designer",
+    "Country": "India",
+    "JobSatisfaction": 859
+  },
+  {
+    "DeveloperType": "Graphic designer",
+    "Country": "Pakistan",
+    "JobSatisfaction": 179
+  },
+  {
+    "DeveloperType": "Graphic designer",
+    "Country": "United States",
+    "JobSatisfaction": 1938
+  },
+  {
+    "DeveloperType": "Graphics programming",
+    "Country": "China",
+    "JobSatisfaction": 66
+  },
+  {
+    "DeveloperType": "Graphics programming",
+    "Country": "India",
+    "JobSatisfaction": 484
+  },
+  {
+    "DeveloperType": "Graphics programming",
+    "Country": "Pakistan",
+    "JobSatisfaction": 111
+  },
+  {
+    "DeveloperType": "Graphics programming",
+    "Country": "United States",
+    "JobSatisfaction": 2321
+  },
+  {
+    "DeveloperType": "Machine learning specialist",
+    "Country": "China",
+    "JobSatisfaction": 40
+  },
+  {
+    "DeveloperType": "Machine learning specialist",
+    "Country": "India",
+    "JobSatisfaction": 711
+  },
+  {
+    "DeveloperType": "Machine learning specialist",
+    "Country": "Pakistan",
+    "JobSatisfaction": 81
+  },
+  {
+    "DeveloperType": "Machine learning specialist",
+    "Country": "United States",
+    "JobSatisfaction": 2162
+  },
+  {
+    "DeveloperType": "Mobile developer",
+    "Country": "China",
+    "JobSatisfaction": 206
+  },
+  {
+    "DeveloperType": "Mobile developer",
+    "Country": "India",
+    "JobSatisfaction": 6600
+  },
+  {
+    "DeveloperType": "Mobile developer",
+    "Country": "Pakistan",
+    "JobSatisfaction": 775
+  },
+  {
+    "DeveloperType": "Mobile developer",
+    "Country": "United States",
+    "JobSatisfaction": 11007
+  },
+  {
+    "DeveloperType": "Other",
+    "Country": "China",
+    "JobSatisfaction": 40
+  },
+  {
+    "DeveloperType": "Other",
+    "Country": "India",
+    "JobSatisfaction": 933
+  },
+  {
+    "DeveloperType": "Other",
+    "Country": "Pakistan",
+    "JobSatisfaction": 62
+  },
+  {
+    "DeveloperType": "Other",
+    "Country": "United States",
+    "JobSatisfaction": 6196
+  },
+  {
+    "DeveloperType": "Quality assurance engineer",
+    "Country": "China",
+    "JobSatisfaction": 23
+  },
+  {
+    "DeveloperType": "Quality assurance engineer",
+    "Country": "India",
+    "JobSatisfaction": 679
+  },
+  {
+    "DeveloperType": "Quality assurance engineer",
+    "Country": "Pakistan",
+    "JobSatisfaction": 105
+  },
+  {
+    "DeveloperType": "Quality assurance engineer",
+    "Country": "United States",
+    "JobSatisfaction": 2362
+  },
+  {
+    "DeveloperType": "Systems administrator",
+    "Country": "China",
+    "JobSatisfaction": 98
+  },
+  {
+    "DeveloperType": "Systems administrator",
+    "Country": "India",
+    "JobSatisfaction": 1053
+  },
+  {
+    "DeveloperType": "Systems administrator",
+    "Country": "Pakistan",
+    "JobSatisfaction": 131
+  },
+  {
+    "DeveloperType": "Systems administrator",
+    "Country": "United States",
+    "JobSatisfaction": 6582
+  },
+  {
+    "DeveloperType": "Web developer",
+    "Country": "China",
+    "JobSatisfaction": 702
+  },
+  {
+    "DeveloperType": "Web developer",
+    "Country": "India",
+    "JobSatisfaction": 14604
+  },
+  {
+    "DeveloperType": "Web developer",
+    "Country": "Pakistan",
+    "JobSatisfaction": 1516
+  },
+  {
+    "DeveloperType": "Web developer",
+    "Country": "United States",
+    "JobSatisfaction": 41856
+  }
+]
diff --git a/r-scripts/stackoverflow_json_files/Languages_used_by_females_in_pakistan.json b/r-scripts/stackoverflow_json_files/Languages_used_by_females_in_pakistan.json
new file mode 100644
index 0000000..84ba6fb
--- /dev/null
+++ b/r-scripts/stackoverflow_json_files/Languages_used_by_females_in_pakistan.json
@@ -0,0 +1,66 @@
+[
+  {
+    "language": "Assembly",
+    "count": 2
+  },
+  {
+    "language": "C",
+    "count": 5
+  },
+  {
+    "language": "C#",
+    "count": 8
+  },
+  {
+    "language": "C++",
+    "count": 7
+  },
+  {
+    "language": "Java",
+    "count": 8
+  },
+  {
+    "language": "JavaScript",
+    "count": 10
+  },
+  {
+    "language": "Matlab",
+    "count": 3
+  },
+  {
+    "language": "Objective-C",
+    "count": 1
+  },
+  {
+    "language": "PHP",
+    "count": 7
+  },
+  {
+    "language": "Python",
+    "count": 3
+  },
+  {
+    "language": "Scala",
+    "count": 1
+  },
+  {
+    "language": "SQL",
+    "count": 9
+  },
+  {
+    "language": "Swift",
+    "count": 1
+  },
+  {
+    "language": "VBA",
+    "count": 1
+  },
+  {
+    "language": "VB.NET",
+    "count": 4
+  },
+  {
+    "language": "VisualBasic6",
+    "count": 3
+  }
+]
diff --git a/r-scripts/stackoverflow_json_files/Languages_used_by_students_in_pakistan.json b/r-scripts/stackoverflow_json_files/Languages_used_by_students_in_pakistan.json
new file mode 100644
index 0000000..0cccb02
--- /dev/null
+++ b/r-scripts/stackoverflow_json_files/Languages_used_by_students_in_pakistan.json
@@ -0,0 +1,66 @@
+[
+  {
+    "language": "Assembly",
+    "count": 8
+  },
+  {
+    "language": "C",
+    "count": 22
+  },
+  {
+    "language": "C#",
+    "count": 20
+  },
+  {
+    "language": "C++",
+    "count": 28
+  },
+  {
+    "language": "Haskell",
+    "count": 1
+  },
+  {
+    "language": "Java",
+    "count": 27
+  },
+  {
+    "language": "JavaScript",
+    "count": 24
+  },
+  {
+    "language": "Matlab",
+    "count": 7
+  },
+  {
+    "language": "PHP",
+    "count": 21
+  },
+  {
+    "language": "Python",
+    "count": 15
+  },
+  {
+    "language": "R",
+    "count": 3
+  },
+  {
+    "language": "Ruby",
+    "count": 3
+  },
+  {
+    "language": "SQL",
+    "count": 19
+  },
+  {
+    "language": "TypeScript",
+    "count": 1
+  },
+  {
+    "language": "VB.NET",
+    "count": 3
+  },
+  {
+    "language": "VisualBasic6",
+    "count": 2
+  }
+]
diff --git a/r-scripts/stackoverflow_json_files/Relaton_between_haveWorked_and_WantToWork_with_Languages_in_pakistan.json b/r-scripts/stackoverflow_json_files/Relaton_between_haveWorked_and_WantToWork_with_Languages_in_pakistan.json
new file mode 100644
index 0000000..df9b809
--- /dev/null
+++ b/r-scripts/stackoverflow_json_files/Relaton_between_haveWorked_and_WantToWork_with_Languages_in_pakistan.json
@@ -0,0 +1,152 @@
+[
+  {
+    "language": "Assembly",
+    "want": 8,
+    "worked": 29
+  },
+  {
+    "language": "C",
+    "want": 27,
+    "worked": 58
+  },
+  {
+    "language": "C#",
+    "want": 87,
+    "worked": 109
+  },
+  {
+    "language": "C++",
+    "want": 36,
+    "worked": 75
+  },
+  {
+    "language": "CoffeeScript",
+    "want": 9,
+    "worked": 6
+  },
+  {
+    "language": "CommonLisp",
+    "want": 1,
+    "worked": 1
+  },
+  {
+    "language": "Dart",
+    "want": 5,
+    "worked": 1
+  },
+  {
+    "language": "Elixir",
+    "want": 8,
+    "worked": 1
+  },
+  {
+    "language": "Go",
+    "want": 28,
+    "worked": 1
+  },
+  {
+    "language": "Groovy",
+    "want": 4,
+    "worked": 5
+  },
+  {
+    "language": "Haskell",
+    "want": 4,
+    "worked": 1
+  },
+  {
+    "language": "Java",
+    "want": 105,
+    "worked": 111
+  },
+  {
+    "language": "JavaScript",
+    "want": 124,
+    "worked": 161
+  },
+  {
+    "language": "Julia",
+    "want": 1,
+    "worked": 2
+  },
+  {
+    "language": "Lua",
+    "want": 4,
+    "worked": 4
+  },
+  {
+    "language": "Matlab",
+    "want": 13,
+    "worked": 16
+  },
+  {
+    "language": "Objective-C",
+    "want": 33,
+    "worked": 21
+  },
+  {
+    "language": "Perl",
+    "want": 6,
+    "worked": 4
+  },
+  {
+    "language": "PHP",
+    "want": 72,
+    "worked": 133
+  },
+  {
+    "language": "Python",
+    "want": 87,
+    "worked": 47
+  },
+  {
+    "language": "R",
+    "want": 16,
+    "worked": 5
+  },
+  {
+    "language": "Ruby",
+    "want": 45,
+    "worked": 18
+  },
+  {
+    "language": "Scala",
+    "want": 11,
+    "worked": 5
+  },
+  {
+    "language": "Smalltalk",
+    "want": 2,
+    "worked": 1
+  },
+  {
+    "language": "SQL",
+    "want": 81,
+    "worked": 134
+  },
+  {
+    "language": "Swift",
+    "want": 45,
+    "worked": 14
+  },
+  {
+    "language": "TypeScript",
+    "want": 37,
+    "worked": 11
+  },
+  {
+    "language": "VBA",
+    "want": 1,
+    "worked": 3
+  },
+  {
+    "language": "VB.NET",
+    "want": 11,
+    "worked": 22
+  },
+  {
+    "language": "VisualBasic6",
+    "want": 6,
+    "worked": 10
+  }
+]
diff --git a/r-scripts/stackoverflow_json_files/word_cloud_technologies.png b/r-scripts/stackoverflow_json_files/word_cloud_technologies.png
new file mode 100644
index 0000000..15aa79f
Binary files /dev/null and b/r-scripts/stackoverflow_json_files/word_cloud_technologies.png differ
diff --git a/r-scripts/wordembeddings.R b/r-scripts/wordembeddings.R
index 255e5b0..d02341b 100644
--- a/r-scripts/wordembeddings.R
+++ b/r-scripts/wordembeddings.R
@@ -1,6 +1,62 @@
+library(wordcloud)
+library(RColorBrewer)
+library(tm)
+library(Rtsne)
+library(text2vec)
+
+# ## gender and developer type 
+# 
+# genderDev=so_data %>% 
+#   select(DeveloperType,Country) %>%
+#   filter(Country=='Pakistan') %>%
+#   na.omit() %>%
+#   mutate(gender_developer=gsub(" ","",DeveloperType)) %>%
+#   pull(gender_developer)
+#   
+# write(genderDev,'gender_dev.txt')
+# prep_word2vec(origin='gender_dev.txt',destination = 'gender_dev_vectors.txt',lowercase = T)
+# model = train_word2vec("gender_dev_vectors.txt",vectors=100,threads=4,window=5,iter=1000,negative_samples=0,force = T)
+# word2vec=model[2:nrow(model),]  # remove at end of string
+
+
+#tsne (dimensionality reduction)
+# reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50,
+#                    perplexity = 2, theta = 0.05, check_duplicates = F,
+#                    pca = F, max_iter = 1000, verbose = T,
+#                    is_distance = F, Y_init = NULL)
+# df <- as.data.frame(reduction$Y)
+# rows <- rownames(word2vec)
+#rownames(df) <- rows
+#df=df %>% filter(rownames(df) != '</s>')
+#rownames(df) <- rows[2:65]
+
+# Create t-SNE plot and save as jpeg
+# ggplot(df) +
+#   geom_point(aes(x = V1, y = V2), color = "red") +
+#   geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+#   xlab("Dimension 1") +
+#   ylab("Dimension 2 ") +
+#   # geom_text(fontface = 2, alpha = .8) +
+#   theme_bw(base_size = 12) +
+#   theme(legend.position = "none") +
+#   ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+# 
+# pca = prcomp(word2vec,scale=T) 
+# plot(pca$x, t='n', main="pca")
+# text(pca$x, labels=rownames(df))
+# x=pca$x[,1]
+# y=pca$x[,2]
+# 
+# hc = hclust(dist(cbind(x,y)), method = 'ward.D')
+# plot(hc, axes=F,xlab='', ylab='',sub ='', main='Comp 1/2')
+# rect.hclust(hc, k=5, border='red')
+
+######################################
+  
+
+
 concat_features=so_data %>% 
   select(IDE,HaveWorkedLanguage,Country,HaveWorkedDatabase,HaveWorkedFramework,HaveWorkedPlatform,WantWorkLanguage,WantWorkFramework,WantWorkDatabase,WantWorkPlatform) %>%
-  filter(Country=='Pakistan') %>%
   select(-Country) %>%
   na.omit(.) %>%
   mutate_if(is.factor,as.character) %>%
@@ -24,9 +80,79 @@ concat_features=so_data %>%
   
   embeddingAllWant=unlist(strsplit(as.character(vectorAllWant),';'))
   embeddingAllHave=unlist(strsplit(as.character(vectorAllHave),';'))
+  
+  all <- c(embeddingAllWant, embeddingAllHave)
+  doc <- Corpus(VectorSource(all))
+  # tm discards terms shorter than 3 characters by default, which would drop names such as C, R, Go, C#.
+  dtm <- TermDocumentMatrix(doc, control = list(wordLengths = c(1, Inf)))
+  m <- as.matrix(dtm)
+  v <- sort(rowSums(m), decreasing = TRUE)  # total occurrences of each term, most frequent first
+  d <- data.frame(word = names(v), freq = v)
+  set.seed(1234)  # fixed seed so the cloud layout is reproducible
+  wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words = 200,
+            random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
+  
+  
+  
+  write(all,'features.txt')
+  write(embeddingAllHave,'features_have.txt')
+  # Open question: should the "want" and "have" features be combined into one vector before
+  # building the term-document matrix? The feature vectors are character arrays; they are
+  # embedded with word2vec and then reduced to 2-D with t-SNE.
+  
+  # Candidate ranges for a word2vec hyper-parameter search; the training call in this script passes literals instead.
+  vectors=seq(100,500)
+  min_count=seq(5,10)
+  negative_samples=seq(5,15)
+  iter=seq(100,1000)
+  window=seq(1,20)
+  
+  
+  ##################################
+  
+  prep_word2vec(origin = 'features_have.txt', destination = 'vectors.txt', lowercase = TRUE)
+  model <- train_word2vec("vectors.txt", vectors = 300, threads = 4, window = 5, iter = 1000,
+                          negative_samples = 0, min_count = 10, force = TRUE)
+  word2vec <- model[2:nrow(model), ]  # skip row 1 (presumably the '</s>' end-of-sentence token)
+  # t-SNE: project the embedding down to two dimensions
+  reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50, perplexity = 10, theta = 0.05,
+                     check_duplicates = FALSE, pca = FALSE, max_iter = 1000, verbose = TRUE,
+                     is_distance = FALSE, Y_init = NULL)
+  df <- as.data.frame(reduction$Y)
+  rows <- rownames(word2vec)
+  rownames(df) <- rows  # label each 2-D point with its term
+  #df=df %>% filter(rownames(df) != '</s>')
+  #rownames(df) <- rows[2:65]
+ 
+  # t-SNE scatter plot of the embedding; written to PNG by the ggsave() call at the end of the script
+  ggplot(df) +
+    geom_point(aes(x = V1, y = V2), color = "red") +
+    geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+    xlab("Dimension 1") +
+    ylab("Dimension 2 ") +
+    # geom_text(fontface = 2, alpha = .8) +
+    theme_bw(base_size = 12) +
+    theme(legend.position = "none") +
+    ggtitle("2D reduction of Word Embedding Model on stack over flow data using t_SNE")
+  
+  
+  pca <- prcomp(word2vec, scale. = TRUE)  # PCA with each embedding dimension scaled to unit variance
+  plot(pca$x, type = 'n', main = "pca")   # empty canvas; the labels are drawn by text()
+  text(pca$x, labels = rownames(df))
+  x <- pca$x[, 1]
+  y <- pca$x[, 2]
+  
 
-# embedding is a character array of all the features. We will vectorize them using word2vec
-# and tsne will be used for dimensionality reduction 
-
- #word2vec=read.binary.vectors(filename='/home/mustufain/Downloads/GoogleNews-vectors-negative300.bin')
- 
\ No newline at end of file
+  # Hierarchical clustering (Ward) on the first two principal components
+  hc <- hclust(dist(cbind(x, y)), method = 'ward.D')
+  plot(hc, axes = FALSE, xlab = '', ylab = '', sub = '', main = 'Comp 1/2')
+  rect.hclust(hc, k = 3, border = 'red')
+
+  # Hierarchical clustering on the 2-D t-SNE coordinates. xt/yt were not assigned anywhere in the
+  # visible script, so use the V1/V2 columns of df, whose row names keep the dendrogram labelled.
+  hc <- hclust(dist(df[, c("V1", "V2")]), method = 'ward.D')
+  plot(hc, axes = FALSE, xlab = '', ylab = '', sub = '', main = 'Comp 1/2')
+  rect.hclust(hc, k = 3, border = 'red')
+
+  # ggsave() without a plot argument writes the most recent ggplot (the t-SNE scatter), not the dendrograms
+  ggsave('wordembeddings.png')
\ No newline at end of file
diff --git a/r-shiny-app/.RData b/r-shiny-app/.RData
new file mode 100644
index 0000000..51cced9
Binary files /dev/null and b/r-shiny-app/.RData differ
diff --git a/r-shiny-app/.Rhistory b/r-shiny-app/.Rhistory
new file mode 100644
index 0000000..8c1d3cf
--- /dev/null
+++ b/r-shiny-app/.Rhistory
@@ -0,0 +1,512 @@
+gather(Country, DeveloperType) %>%
+group_by(Country,DeveloperType) %>%
+filter(DeveloperType!='Other')
+so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType))
+so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India' | EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work') %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+gather(Country, DeveloperType) %>%
+group_by(Country,DeveloperType) %>%
+filter(DeveloperType!='Other')
+so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India' | EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work') %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType))
+so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter(Country=='Pakistan' | EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work') %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType))
+count_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter(Country=='Pakistan' | EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work') %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType))
+count_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India' | EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work') %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType))
+count_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+gather(Country, DeveloperType) %>%
+group_by(Country,DeveloperType) %>%
+filter(DeveloperType!='Other') %>%
+summarise(Count= n())
+count_dev<-so_data%>%select(Country,DeveloperType) %>%
+filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+gather(Country, DeveloperType) %>%
+group_by(Country,DeveloperType) %>%
+filter(DeveloperType!='Other') %>%
+summarise(Count= n())
+View(count_dev)
+so_data%>%select(Country,DeveloperType) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType))
+so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType))
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType))
+View(unemployment_dev)
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.))
+View(unemployment_dev)
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>%
+gather(Country,DeveloperType) %>%
+group_by(Country,DeveloperType) %>%
+filter(DeveloperType!='Other') %>%
+summarise(Count= n())
+View(unemployment_dev)
+so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>%
+gather(Country,EmploymentStatus,DeveloperType)
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>%
+gather(Country,EmploymentStatus,DeveloperType) %>%
+group_by(Country,DeveloperType) %>%
+filter(DeveloperType!='Other') %>%
+summarise(Count= n())
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>%
+gather(Country,EmploymentStatus,DeveloperType) %>%
+group_by(Country,DeveloperType,DeveloperType) %>%
+filter(DeveloperType!='Other') %>%
+summarise(Count= n())
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>%
+gather(Country,EmploymentStatus,DeveloperType) %>% head()
+View(unemployment_dev)
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>%
+gather(Country,Val,EmploymentStatus:DeveloperType) %>% head()
+View(unemployment_dev)
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.))
+View(unemployment_dev)
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>%
+select(-EmploymentStatus)
+group_by(Country,DeveloperType) %>%
+filter(DeveloperType!='Other') %>%
+summarise(Count= n())
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>%
+select(-EmploymentStatus) %>%
+group_by(Country,DeveloperType) %>%
+filter(DeveloperType!='Other') %>%
+summarise(Count= n())
+View(unemployment_dev)
+View(count_dev)
+count_dev %>%
+inner_join(id=c('Country','DeveloperType'),unemployment_dev)
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>%
+select(-EmploymentStatus) %>%
+group_by(Country,DeveloperType) %>%
+filter(DeveloperType!='Other') %>%
+summarise(Count= n())
+unemployment %>% count_dev %>%
+inner_join(id=c('Country','DeveloperType'),unemployment_dev)
+count_dev %>%
+inner_join(id=c('Country','DeveloperType'),unemployment_dev)
+count_dev
+unemployment_dev
+unemployment <- count_dev %>%
+inner_join(id=c('Country','DeveloperType'),unemployment_dev)
+View(unemployment)
+count_dev
+unemployment_dev
+count_dev %>%
+inner_join(id=c('Country','DeveloperType'),unemployment_dev)
+unemployment <- count_dev %>%
+lef_join(id=c('Country','DeveloperType'),unemployment_dev)
+count_dev %>%
+left_join(id=c('Country','DeveloperType'),unemployment_dev)
+count_dev %>%
+right_join(id=c('Country','DeveloperType'),unemployment_dev)
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>%
+select(-EmploymentStatus) %>%
+group_by(Country,DeveloperType) %>%
+filter(DeveloperType!='Other') %>%
+summarise(freq= n())
+count_dev %>%
+inner_join(id=c('Country','DeveloperType'),unemployment_dev)
+unemployment <- count_dev %>%
+inner_join(id=c('Country','DeveloperType'),unemployment_dev)
+View(unemployment)
+count_dev<-so_data%>%select(Country,DeveloperType) %>%
+filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+gather(Country, DeveloperType) %>%
+group_by(Country,DeveloperType) %>%
+filter(DeveloperType!='Other') %>%
+summarise(Total_Count= n())
+unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>%
+filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>%
+na.omit(.) %>%
+mutate_if(is.factor,as.character) %>%
+mutate(DeveloperType=gsub(" ","",DeveloperType)) %>%
+mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+unnest(DeveloperType) %>%
+mutate(DeveloperType=trimws(DeveloperType)) %>%
+mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>%
+select(-EmploymentStatus) %>%
+group_by(Country,DeveloperType) %>%
+filter(DeveloperType!='Other') %>%
+summarise(Unemployment_count= n())
+unemployment <- count_dev %>%
+inner_join(id=c('Country','DeveloperType'),unemployment_dev)
+View(unemployment)
+count_dev
+unemployment_dev
+count_dev %>%
+left_join(id=c('Country','DeveloperType'),unemployment_dev)
+?na.replace
+unemployment <- count_dev %>%
+left_join(id=c('Country','DeveloperType'),unemployment_dev) %>%
+is.na[.]<-0
+?is.na
+View(unemployment)
+unemployment <- count_dev %>%
+left_join(id=c('Country','DeveloperType'),unemployment_dev)
+View(unemployment)
+View(unemployment)
+unemployment <- count_dev %>%
+left_join(id=c('Country','DeveloperType'),unemployment_dev) %>%
+is.na(.)<-0
+View(unemployment)
+unemployment <- count_dev %>%
+left_join(id=c('Country','DeveloperType'),unemployment_dev) %>%
+is.na(.)<-0
+?is.na
+is.na(unemployment)<-0
+is.na[unemployment]<-0
+unemployment[is.na(unemployment)] <- 0
+View(unemployment)
+View(test)
+pay_gap <- so_data %>%
+select(Country,Gender,Salary) %>%
+na.omit(.) %>%
+filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>%
+mutate(Country=as.character(Country)) %>%
+mutate(Gender=as.character(Gender))  %>%
+group_by(Gender,Country) %>% summarise_all(funs(mean))
+pay_gapDt<-as.data.table(pay_gap)
+setkey(pay_gapDt,Country)
+pay_gapDt<-pay_gapDt %>% select(Country,Salary)
+pay_gapDt[ , Pay_Gap := c(NA, abs(diff(Salary))), by = Country]
+pay_gapDt <- pay_gapDt %>% select(Country,Pay_Gap) %>% na.omit(.) %>% arrange(desc(Pay_Gap))
+genderPayGap <- pay_gapDt %>% inner_join(country_mf)
+View(genderPayGap)
+View(genderPayGap)
+pay_gap <- so_data %>%
+select(Country,Gender,Salary) %>%
+na.omit(.) %>%
+filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>%
+mutate(Country=as.character(Country)) %>%
+mutate(Gender=as.character(Gender))
+ay_gap <- so_data %>%
+select(Country,Gender,Salary) %>%
+na.omit(.) %>%
+filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>%
+mutate(Country=as.character(Country)) %>%
+mutate(Gender=as.character(Gender))  %>%
+group_by(Gender,Country) %>% summarise_all(funs(mean))
+so_data %>%
+select(Country,Gender,Salary) %>%
+na.omit(.) %>%
+filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>%
+mutate(Country=as.character(Country)) %>%
+mutate(Gender=as.character(Gender))
+View(pay_gap)
+pay_gap <- so_data %>%
+select(Country,Gender,Salary) %>%
+na.omit(.) %>%
+filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>%
+mutate(Country=as.character(Country)) %>%
+mutate(Gender=as.character(Gender))  %>%
+group_by(Gender,Country) %>% summarise_all(funs(mean))
+View(pay_gap)
+pay_gap <- so_data %>%
+select(Country,Gender,Salary) %>%
+na.omit(.) %>%
+filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>%
+mutate(Country=as.character(Country)) %>%
+mutate(Gender=as.character(Gender))  %>%
+group_by(Gender,Country) %>% summarise_all(funs(median))
+ay_gapDt<-as.data.table(pay_gap)
+setkey(pay_gapDt,Country)
+pay_gapDt<-pay_gapDt %>% select(Country,Salary)
+pay_gapDt[ , Pay_Gap := c(NA, abs(diff(Salary))), by = Country]
+pay_gapDt <- pay_gapDt %>% select(Country,Pay_Gap) %>% na.omit(.) %>% arrange(desc(Pay_Gap))
+pay_gapDt<-as.data.table(pay_gap)
+setkey(pay_gapDt,Country)
+pay_gapDt<-pay_gapDt %>% select(Country,Salary)
+pay_gapDt[ , Pay_Gap := c(NA, abs(diff(Salary))), by = Country]
+pay_gapDt <- pay_gapDt %>% select(Country,Pay_Gap) %>% na.omit(.) %>% arrange(desc(Pay_Gap))
+genderPayGap <- pay_gapDt %>% inner_join(country_mf)
+View(genderPayGap)
+pay_gap <- so_data %>%
+select(Country,Gender,Salary) %>%
+na.omit(.) %>%
+filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>%
+mutate(Country=as.character(Country)) %>%
+mutate(Gender=as.character(Gender))  %>%
+group_by(Gender,Country) %>% summarise_all(funs(mean))
+pay_gapDt<-as.data.table(pay_gap)
+setkey(pay_gapDt,Country)
+pay_gapDt<-pay_gapDt %>% select(Country,Salary)
+pay_gapDt[ , Pay_Gap := c(NA, abs(diff(Salary))), by = Country]
+pay_gapDt <- pay_gapDt %>% select(Country,Pay_Gap) %>% na.omit(.) %>% arrange(desc(Pay_Gap))
+genderPayGap <- pay_gapDt %>% inner_join(country_mf)
+View(genderPayGap)
+so_data %>%
+filter(Gender=='Male' | Gender=='Female') %>%
+group_by(Country) %>%
+summarise(Count=n()) %>%
+arrange(desc(Count)) %>%
+head(20) -> top_countries
+top_countries
+so_data %>%
+filter(Country %in% top_countries$Country) %>%
+group_by(Country)
+summarise(Males=sum(!is.na(Gender) & Gender == "Male"),
+Females=sum(!is.na(Gender) & Gender == "Female"),
+Total=n()) -> country_mf
+top_countries
+so_data %>%
+filter(Country %in% top_countries$Country) %>%
+group_by(Country)
+summarise(Males=sum(!is.na(Gender) & Gender == "Male"),
+Females=sum(!is.na(Gender) & Gender == "Female"),
+Total=n()) -> country_mf
+so_data %>%
+group_by(Country) %>%
+summarise(Count=n()) %>%
+arrange(desc(Count)) %>%
+head(20) -> top_countries
+so_data %>%
+filter(Country %in% top_countries$Country) %>%
+group_by(Country)
+summarise(Males=sum(!is.na(Gender) & Gender == "Male"),
+Females=sum(!is.na(Gender) & Gender == "Female"),
+Total=n()) -> country_mf
+pay_gap <- so_data %>%
+select(Country,Gender,Salary) %>%
+na.omit(.) %>%
+filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>%
+mutate(Country=as.character(Country)) %>%
+mutate(Gender=as.character(Gender))  %>%
+group_by(Gender,Country) %>% summarise_all(funs(mean))
+pay_gapDt<-as.data.table(pay_gap)
+setkey(pay_gapDt,Country)
+pay_gapDt<-pay_gapDt %>% select(Country,Salary)
+pay_gapDt[ , Pay_Gap := c(NA, abs(diff(Salary))), by = Country]
+pay_gapDt <- pay_gapDt %>% select(Country,Pay_Gap) %>% na.omit(.) %>% arrange(desc(Pay_Gap))
+genderPayGap <- pay_gapDt %>% inner_join(country_mf)
+View(genderPayGap)
+so_data %>% filter(Country=='Pakistan' & (Gender=='Male' | Gender=='Female')) %>% n()
+so_data %>% filter(Country=='Pakistan' & (Gender=='Male' | Gender=='Female')) %>% count(.)
+genderPayGap <- pay_gapDt %>% mutate(Total=as.integer(Males) + as.inetegr(Females))
+pay_gap <- so_data %>%
+select(Country,Gender,Salary) %>%
+na.omit(.) %>%
+filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>%
+mutate(Country=as.character(Country)) %>%
+mutate(Gender=as.character(Gender))  %>%
+group_by(Gender,Country) %>% summarise_all(funs(mean)) %>%
+mutate(Total=as.integer(Males) + as.inetegr(Females))
+so_data %>%
+select(Country,Gender,Salary) %>%
+na.omit(.) %>%
+filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>%
+mutate(Country=as.character(Country)) %>%
+mutate(Gender=as.character(Gender))  %>%
+group_by(Gender,Country) %>% summarise_all(funs(mean)
+)
+pay_gapDt
+country_mf
+country_mf <-  so_data %>%
+filter(Country %in% top_countries$Country) %>%
+group_by(Country)
+summarise(Males=sum(!is.na(Gender) & Gender == "Male"),
+Females=sum(!is.na(Gender) & Gender == "Female")) %>%
+mutate(Total=as.integer(Males) + as.integer(females))
+country_mf <-  so_data %>%
+filter(Country %in% top_countries$Country) %>%
+group_by(Country) %>%
+summarise(Males=sum(!is.na(Gender) & Gender == "Male"),
+Females=sum(!is.na(Gender) & Gender == "Female")) %>%
+mutate(Total=as.integer(Males) + as.integer(females))
+country_mf <-  so_data %>%
+filter(Country %in% top_countries$Country) %>%
+group_by(Country) %>%
+summarise(Males=sum(!is.na(Gender) & Gender == "Male"),
+Females=sum(!is.na(Gender) & Gender == "Female")) %>%
+mutate(Total=as.integer(Males) + as.integer(Females))
+country_mf
+pay_gap <- so_data %>%
+select(Country,Gender,Salary) %>%
+na.omit(.) %>%
+filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>%
+mutate(Country=as.character(Country)) %>%
+mutate(Gender=as.character(Gender))  %>%
+group_by(Gender,Country) %>% summarise_all(funs(mean))
+pay_gapDt<-as.data.table(pay_gap)
+setkey(pay_gapDt,Country)
+pay_gapDt<-pay_gapDt %>% select(Country,Salary)
+pay_gapDt[ , Pay_Gap := c(NA, abs(diff(Salary))), by = Country]
+pay_gapDt <- pay_gapDt %>% select(Country,Pay_Gap) %>% na.omit(.) %>% arrange(desc(Pay_Gap))
+genderPayGap <- pay_gapDt %>% inner_join(country_mf)
+View(genderPayGap)
+genderPayGap <- pay_gapDt %>% inner_join(country_mf) %>% mutate(gender_equality_%=Female/Total*100)
+genderPayGap <- pay_gapDt %>% inner_join(country_mf) %>% mutate(gender_equality_%=Females/Total*100)
+genderPayGap <- pay_gapDt %>% inner_join(country_mf) %>% mutate(gender_equality_%=as.double(Females/Total*100))
+genderPayGap <- pay_gapDt %>% inner_join(country_mf) %>%
+mutate(gender_equality=as.double(Females/Total*100))
+View(genderPayGap)