diff --git a/r-scripts/.RData b/r-scripts/.RData new file mode 100644 index 0000000..13df746 Binary files /dev/null and b/r-scripts/.RData differ diff --git a/r-scripts/.Rhistory b/r-scripts/.Rhistory new file mode 100644 index 0000000..57b244e --- /dev/null +++ b/r-scripts/.Rhistory @@ -0,0 +1,512 @@ +prep_word2vec(origin='features.txt',destination = 'vectors.txt',lowercase = T) +model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0) +word2vec = read.vectors("feature_vectors.bin") +View(word2vec) +head(word2vec) +dim(word2vec) +word2vecDf=as.data.frame(word2vec) +word2vecDf +head(word2vecDf) +colnames(word2vecDf) +word2vecDf.index +word2vec[1] +word2vec[0] +View(word2vecDf) +?prep_word2vec +model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0) +word2vec = read.vectors("feature_vectors.bin") +word2vecDf=as.data.frame(word2vec) +View(word2vecDf) +a=unique(all) +a=as.data.frame(unique(all)) +View(a) +word2vecDf[0] +word2vec[1:67] +word2vec[1:67] +word2vec[1] +word2vec[1,1] +model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0,sample=0.001,binary=1) +?word2vec +?train_word2vec +model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=5) +length(all) +all=tolower(all) +all=gsub("[[:punct:]]", "", all) +all +write(all,'features.txt') +prep_word2vec(origin='features.txt',destination = 'vectors.txt',lowercase = T) +model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=5) +write(all,'features.txt') +model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=5) +model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=5) +write(all,'features.txt') +model = 
train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0) +concat_features=so_data %>% +select(IDE,HaveWorkedLanguage,Country,HaveWorkedDatabase,HaveWorkedFramework,HaveWorkedPlatform,WantWorkLanguage,WantWorkFramework,WantWorkDatabase,WantWorkPlatform) %>% +filter(Country=='Pakistan') %>% +select(-Country) %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(IDE=paste(IDE,";",sep="")) %>% +mutate(HaveWorkedLanguage=paste(HaveWorkedLanguage,";",sep="")) %>% +mutate(HaveWorkedDatabase=paste(HaveWorkedDatabase,";",sep="")) %>% +mutate(HaveWorkedFramework=paste(HaveWorkedFramework,";",sep="")) %>% +mutate(WantWorkLanguage=paste(WantWorkLanguage,";",sep="")) %>% +mutate(WantWorkDatabase=paste(WantWorkDatabase,";",sep="")) %>% +mutate(WantWorkFramework=paste(WantWorkFramework,";",sep="")) %>% +mutate(concat_have=paste(IDE,HaveWorkedLanguage,HaveWorkedDatabase,HaveWorkedFramework,HaveWorkedPlatform)) %>% +mutate(concat_have=gsub(" ","",concat_have)) %>% +mutate(concat_want=paste(IDE,WantWorkLanguage,WantWorkDatabase,WantWorkFramework,WantWorkPlatform)) %>% +mutate(concat_want=gsub(" ","",concat_want)) +vectorAllWant=concat_features %>% +pull(concat_want) +vectorAllHave=concat_features %>% +pull(concat_have) +embeddingAllWant=unlist(strsplit(as.character(vectorAllWant),';')) +embeddingAllHave=unlist(strsplit(as.character(vectorAllHave),';')) +all=c(embeddingAllWant,embeddingAllHave) +doc=Corpus(VectorSource(all)) +dtm <- TermDocumentMatrix(doc) +m <- as.matrix(dtm) +v <- sort(rowSums(m),decreasing=TRUE) +d <- data.frame(word = names(v),freq=v) +set.seed(1234) +wordcloud(words = d$word, freq = d$freq, min.freq = 1, +max.words=200, random.order=FALSE, rot.per=0.35, +colors=brewer.pal(8, "Dark2")) +write(all,'features.txt') +model = train_word2vec("features.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0) +?tcm +install.packages('text2vec') +?create_tcm +library(text2vec) 
+?create_tcm +v = create_vocabulary(all) +View(v) +vectorizer = vocab_vectorizer(v, grow_dtm = F, skip_grams_window = 5) +?vocab_vectorizer +vectorizer = vocab_vectorizer(v, grow_dtm = F, skip_grams_window = 5) +vectorizer = vocab_vectorizer(v) +vectorizer +tcm=create_tcm(all,vectorizer) +?create_tcm +it = itoken(all) +it +tcm=create_tcm(it,vectorizer) +?itoken +all +unlist(all) +as.data.frame(all) +tmp=as.data.frame(all) +View(tmp) +it = itoken(tmp$all) +View(tmp) +tmp = tmp %>% mutate(all=as.character(all)) +it = itoken(tmp$all) +it +v = create_vocabulary(it) +View(v) +vectorizer = vocab_vectorizer(v) +tcm=create_tcm(it,vectorizer) +tcm +fit <- glove(tcm = tcm, +word_vectors_size = 50, +x_max = 10, learning_rate = 0.2, +num_iters = 15) +fit <- GloVe(tcm = tcm, +word_vectors_size = 50, +x_max = 10, learning_rate = 0.2, +num_iters = 15) +fit <- glove(tcm = tcm, +word_vectors_size = 50, +x_max = 10, learning_rate = 0.2, +num_iters = 15) +vectorizer = vocab_vectorizer(v, grow_dtm = F, skip_grams_window = 5) +vectorizer = vocab_vectorizer(v) +vectorizer +word_array +clear +tmp=as.data.frame(all) +tmp = tmp %>% mutate(all=as.character(all)) %>% tolower(.) 
+tmp +it = itoken(tmp$all) +tmp=as.data.frame(all) +tmp = tmp %>% mutate(all=as.character(all)) %>% mutate(all=tolower(all)) +it = itoken(tmp$all) +v = create_vocabulary(it) +v +vectorizer = vocab_vectorizer(v) +tcm=create_tcm(it,vectorizer) +v = create_vocabulary(it) prune_vocabulary(term_count_min = 5) +v = create_vocabulary(it) %>% prune_vocabulary(term_count_min = 5) +vectorizer = vocab_vectorizer(v) +tcm=create_tcm(it,vectorizer) +vectorizer = vocab_vectorizer(v,skip_grams_window = 5) +?crate_tcm +?create_tcm +tcm=create_tcm(it,vectorizer,skip_grams_window=5) +tcm=create_tcm(it,vectorizer,skip_grams_window=5L) +?create_tcm +tcm +word2vec +word2vec[1:67] +word2vec[1:66] +?plot +??Rtsne +Rtsne(word2vec) +reduction <- Rtsne(as.matrix(word2vec), dims = 2, initial_dims = 50, +perplexity = 300, theta = 0.5, check_duplicates = F, +pca = F, max_iter = 1000, verbose = F, +is_distance = F, Y_init = NULL) +word2vec +word2vecDf=as.data.frame(word2vec) +reduction <- Rtsne(as.matrix(word2vecdf), dims = 2, initial_dims = 50, +perplexity = 300, theta = 0.5, check_duplicates = F, +pca = F, max_iter = 1000, verbose = F, +is_distance = F, Y_init = NULL) +reduction <- Rtsne(as.matrix(word2vec), dims = 2, initial_dims = 300, +perplexity = 300, theta = 0.5, check_duplicates = F, +pca = F, max_iter = 1000, verbose = F, +is_distance = F, Y_init = NULL) +reduction <- Rtsne(as.matrix(word2vec), dims = 2, initial_dims = 50, +perplexity = 21, theta = 0.5, check_duplicates = F, +pca = F, max_iter = 1000, verbose = F, +is_distance = F, Y_init = NULL) +reduction +df <- as.data.frame(reduction$Y) +View(df) +rows <- rownames(word2vec) +rows +rownames(df) <- rows +View(df) +View(df) +df % filter(rownames(df) == '') +df %>% filter(rownames(df) == '') +df %>% filter(rownames(df) != '') +df=df %>% filter(rownames(df) != '') +View(df) +rows +rows[2:] +rows[2:66] +rows[2:67] +rows[2:68] +rows[2:67] +rownames(df) <- rows[2:67] +View(df) +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + 
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model ", ref_name," using t_SNE")) +ggsave(paste0(ref_name, ".jpeg"), path = path, width = 24, +height = 18, dpi = 100) +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +ggsave(paste0(ref_name, ".jpeg"), path = path, width = 24, +height = 18, dpi = 100) +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +ggsave(paste0("without_lower", ".jpeg"), path = path, width = 24, +height = 18, dpi = 100) +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +prep_word2vec(origin='features.txt',destination = 'vectors.txt',lowercase = T) +model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0) +word2vec = read.vectors("feature_vectors.bin") +reduction <- Rtsne(as.matrix(word2vec), dims = 
2, initial_dims = 50, +perplexity = 21, theta = 0.5, check_duplicates = F, +pca = F, max_iter = 1000, verbose = F, +is_distance = F, Y_init = NULL) +df <- as.data.frame(reduction$Y) +rows <- rownames(word2vec) +rownames(df) <- rows +df=df %>% filter(rownames(df) != '') +rownames(df) <- rows[2:67] +# Create t-SNE plot and save as jpeg +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +model +model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0) +word2vec = read.vectors("feature_vectors.bin") +word2vec +reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50, +perplexity = 21, theta = 0.5, check_duplicates = F, +pca = F, max_iter = 1000, verbose = F, +is_distance = F, Y_init = NULL) +df <- as.data.frame(reduction$Y) +rows <- rownames(word2vec) +rownames(df) <- rows +df=df %>% filter(rownames(df) != '') +rownames(df) <- rows[2:67] +rows[2:65] +rows[2:66] +df=df %>% filter(rownames(df) != '') +rownames(df) <- rows[2:65] +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50, +perplexity = 21, theta = 0.5, check_duplicates = F, +pca = F, max_iter = 1000, verbose = F, +is_distance = F, Y_init = NULL) +df <- as.data.frame(reduction$Y) +rows <- rownames(word2vec) +ggplot(df) + +geom_point(aes(x = V1, y = 
V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +rows <- rownames(word2vec) +rownames(df) <- rows +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +word2vec %>% closest_to('vim') +model +model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0) +model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0) +model +rownames(model) +rownames(model)[2:65] +model[2:65] +model[:,2:65] +model[,2:65] +model[2:65,] +model +model[2:65,] +word2vec=model[2:65,] # remove at end of string +word2vec %>% closest_to('vim') +word2vec %>% closest_to('aws') +reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50, +perplexity = 21, theta = 0.5, check_duplicates = F, +pca = F, max_iter = 1000, verbose = F, +is_distance = F, Y_init = NULL) +df <- as.data.frame(reduction$Y) +rows <- rownames(word2vec) +rownames(df) <- rows +#df=df %>% filter(rownames(df) != '') +#rownames(df) <- rows[2:65] +# Create t-SNE plot and save as jpeg +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack 
over flow data of pakistan using t_SNE")) +save(ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")),'a.png') +ggsave('s_removed.png') +word2vec = read.vectors("feature_vectors.bin") +reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50, +perplexity = 21, theta = 0.5, check_duplicates = F, +pca = F, max_iter = 1000, verbose = F, +is_distance = F, Y_init = NULL) +df <- as.data.frame(reduction$Y) +rows <- rownames(word2vec) +rownames(df) <- rows +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +ggsave('s_removed.png') +word2vec=model[2:65,] # remove at end of string +# Create t-SNE plot and save as jpeg +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +ggsave('s_removed.png') +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word 
Embedding Model on stack over flow data of pakistan using t_SNE")) +prep_word2vec(origin='features.txt',destination = 'vectors.txt',lowercase = T) +model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0) +word2vec=model[2:65,] # remove at end of string +prep_word2vec(origin='features.txt',destination = 'vectors.txt',lowercase = T) +model = train_word2vec("vectors.txt","feature_vectors.bin",vectors=300,threads=4,window=5,iter=5,negative_samples=0) +word2vec=model[2:65,] # remove at end of string +word2vec +reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50, +perplexity = 21, theta = 0.5, check_duplicates = F, +pca = F, max_iter = 1000, verbose = F, +is_distance = F, Y_init = NULL) +df <- as.data.frame(reduction$Y) +rows <- rownames(word2vec) +rownames(df) <- rows +#df=df %>% filter(rownames(df) != '') +#rownames(df) <- rows[2:65] +# Create t-SNE plot and save as jpeg +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +ggsave('s_removed.png') +word2vec = read.vectors("feature_vectors.bin") +word2vec +reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50, +perplexity = 21, theta = 0.5, check_duplicates = F, +pca = F, max_iter = 1000, verbose = F, +is_distance = F, Y_init = NULL) +df <- as.data.frame(reduction$Y) +rows <- rownames(word2vec) +rownames(df) <- rows +#df=df %>% filter(rownames(df) != '') +#rownames(df) <- rows[2:65] +# Create t-SNE plot and save as jpeg +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + 
+theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +ggsave('s_not_removed.png') +wordcloud(words = d$word, freq = d$freq, min.freq = 1, +max.words=200, random.order=FALSE, rot.per=0.35, +colors=brewer.pal(8, "Dark2")) +prep_word2vec(origin='features.txt',destination = 'vectors.txt',lowercase = T) +model = train_word2vec("vectors.txt",vectors=300,threads=4,window=5,iter=5,negative_samples=0) +word2vec=model[2:65,] # remove at end of string +?Rtsne +reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50, +perplexity = 21, theta = 0, check_duplicates = F, +pca = F, max_iter = 1000, verbose = T, +is_distance = F, Y_init = NULL) +df <- as.data.frame(reduction$Y) +rows <- rownames(word2vec) +rownames(df) <- rows +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50, +perplexity = 21, theta = 0.5, check_duplicates = F, +pca = F, max_iter = 1000, verbose = T, +is_distance = F, Y_init = NULL) +df <- as.data.frame(reduction$Y) +rows <- rownames(word2vec) +rownames(df) <- rows +ggplot(df) + +geom_point(aes(x = V1, y = V2), color = "red") + +geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +xlab("Dimension 1") + +ylab("Dimension 2 ") + +# geom_text(fontface = 2, alpha = .8) + +theme_bw(base_size = 12) + +theme(legend.position = "none") + +ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +ggsave('wordembeddings.png') +wordcloud(words = d$word, freq = d$freq, min.freq = 1, +max.words=200, random.order=FALSE, 
rot.per=0.35,
+colors=brewer.pal(8, "Dark2"))
+write( embeddingAllHave,'have_features.txt')
+prep_word2vec(origin='have_features.txt',destination = 'vectors.txt',lowercase = T)
+model = train_word2vec("vectors.txt",vectors=300,threads=4,window=5,iter=5,negative_samples=0)
+word2vec=model[2:65,] # remove at end of string
+#tsne (dimensionality reduction)
+reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50,
+perplexity = 21, theta = 0.5, check_duplicates = F,
+pca = F, max_iter = 1000, verbose = T,
+is_distance = F, Y_init = NULL)
+df <- as.data.frame(reduction$Y)
+rows <- rownames(word2vec)
+rownames(df) <- rows
+#df=df %>% filter(rownames(df) != '')
+#rownames(df) <- rows[2:65]
+# Create t-SNE plot and save as jpeg
+ggplot(df) +
+geom_point(aes(x = V1, y = V2), color = "red") +
+geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) +
+xlab("Dimension 1") +
+ylab("Dimension 2 ") +
+# geom_text(fontface = 2, alpha = .8) +
+theme_bw(base_size = 12) +
+theme(legend.position = "none") +
+ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE"))
+ggsave('wordembeddings_have.png')
diff --git a/r-scripts/ExploratoryAnalysis.R b/r-scripts/ExploratoryAnalysis.R
new file mode 100644
index 0000000..cc96d08
--- /dev/null
+++ b/r-scripts/ExploratoryAnalysis.R
@@ -0,0 +1,342 @@
+library(dplyr)
+library(ggplot2)
+library(stringr)
+library(jsonlite)
+library(data.table)
+library(reshape2)
+library(tidyr)
+
+# NOTE(review): `so_data` is never created in this script; it has to exist in
+# the session already (presumably the Stack Overflow developer-survey data
+# frame -- confirm against the loading code).
+
+# Job satisfaction by developer type: Pakistan compared with China, India and
+# the United States (tested).
+#
+# DeveloperType holds several ";"-separated answers per respondent. Each answer
+# is moved to its own row, then JobSatisfaction is summed per developer type
+# and country.
+
+jobDf <- so_data %>%
+  select(DeveloperType, JobSatisfaction, Country) %>%
+  filter(Country %in% c("Pakistan", "China", "United States", "India")) %>%
+  mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+  unnest(DeveloperType) %>%
+  na.omit() %>%
+  mutate(DeveloperType = trimws(DeveloperType)) %>%
+  group_by(DeveloperType, Country) %>%
+  # JobSatisfaction is the only non-grouping column left, so naming it gives
+  # the same result as summarise_all(funs(sum)) without the deprecated funs().
+  summarise(JobSatisfaction = sum(JobSatisfaction))
+
+
+setDT(jobDf)
+jobDf_melt <- melt(jobDf)
+# Columns are referenced by name inside aes(); `jobDf_melt$...` bypasses the
+# data argument and breaks as soon as the plot data is filtered or faceted.
+ggplot(data = jobDf_melt, aes(x = DeveloperType, y = value, fill = Country)) +
+  theme(axis.text.x = element_text(face = "bold", color = "#993333", size = 8, angle = 90)) +
+  ggtitle("Job Satisfaction of Developers") +
+  geom_bar(stat = "identity") +
+  xlab("Developer type") +
+  # The bars show summed satisfaction scores, not a head count.
+  ylab("Total job satisfaction score")
+
+exportJson2 <- jobDf %>% toJSON(pretty = TRUE)
+
+write(exportJson2, "Job_Satisfaction_Of_Each_Developer_Type_In_4_Big_IT-Countires.json")
+
+
+
+# Relation between the languages developers in Pakistan have worked with and
+# the languages they want to work with (tested).
+#
+# Both pipelines do the same thing for one column: strip blanks, glue every
+# respondent's ";"-separated answer into one string, split it back into single
+# languages and tabulate how often each language occurs.
+
+haveWorked <- so_data %>%
+  select(HaveWorkedLanguage, Country) %>%
+  filter(Country == "Pakistan") %>%
+  select(-Country) %>%
+  na.omit() %>%
+  mutate_if(is.factor, as.character) %>%
+  mutate(HaveWorkedLanguage = gsub(" ", "", HaveWorkedLanguage)) %>%
+  pull(HaveWorkedLanguage) %>%
+  str_c(collapse = ";") %>%
+  str_split(";") %>%
+  unlist() %>%
+  table() %>%
+  as.data.frame()
+
+haveWant <- so_data %>%
+  select(WantWorkLanguage, Country) %>%
+  filter(Country == "Pakistan") %>%
+  select(-Country) %>%
+  na.omit() %>%
+  mutate_if(is.factor, as.character) %>%
+  mutate(WantWorkLanguage = gsub(" ", "", WantWorkLanguage)) %>%
+  pull(WantWorkLanguage) %>%
+  str_c(collapse = ";") %>%
+  str_split(";") %>%
+  unlist() %>%
+  table() %>%
+  as.data.frame()
+
+
+colnames(haveWorked) <- c("language", "worked")
+colnames(haveWant) <- c("language", "want")
+# Only languages present in both tables are kept.
+language_join <- inner_join(haveWant, haveWorked, by = "language")
+language_join %>%
+  ggplot(aes(y = want, x = worked, color = language)) +
+  geom_point() +
+  ggrepel::geom_label_repel(aes(label = language)) +
+  # "none" (lower case) is the value ggplot2 documents for hiding the legend.
+  theme(legend.position = "none") +
+  # 45-degree reference line (want == worked). intercept = 45 drew the line
+  # y = x + 45 instead of the diagonal.
+  geom_abline(intercept = 0, slope = 1, color = "red") +
+  labs(title = "Have worked and want to work(Language) in Pakistan")
+
+exportJson2 <- language_join %>% toJSON(pretty = TRUE)
+
+write(exportJson2, "Relaton_between_haveWorked_and_WantToWork_with_Languages_in_pakistan.json")
+
+
+# Relation between job satisfaction and working years in Pakistan (tested)
+
+career_satisfied <- so_data %>%
+  select(YearsCodedJob, JobSatisfaction, Country) %>%
+  filter(Country == "Pakistan") %>%
+  mutate(YearsCodedJob = as.character(YearsCodedJob)) %>%
+  na.omit() %>%
+  # NOTE(review): "Less than a year" is mapped to 1 and therefore shares a
+  # bucket with every other answer whose text starts with "1 " -- confirm that
+  # this is intended.
+  mutate(YearsCodedJob = ifelse(YearsCodedJob == "Less than a year", "1 year", YearsCodedJob)) %>%
+  # Keep the leading number of the answer (its first two characters). This used
+  # to be a separate `%<>%` step, but no library() call in this script attaches
+  # magrittr, so `%<>%` is not available.
+  mutate(YearsCodedJob = as.integer(str_trim(str_sub(YearsCodedJob, 1, 2))))
+
+# Mean satisfaction per number of working years, computed once and shared by
+# the plot and the JSON export.
+df <- career_satisfied %>%
+  arrange(YearsCodedJob) %>%
+  group_by(YearsCodedJob) %>%
+  summarise(JobSatisfaction = mean(JobSatisfaction))
+
+df %>%
+  ggplot(aes(x = YearsCodedJob, y = JobSatisfaction)) +
+  geom_point() +
+  geom_smooth(level = 0, method = "loess") +
+  labs(title = "The relation between jobsatisfaction and working years in Pakistan")
+
+exportJson <- df %>% toJSON(pretty = TRUE)
+write(exportJson, "Pak_JobSatisfaction.json")
+
+
+
+# Relation between Learning New Tech and working years in Pakistan (doubtful)
+
+learningnew_tech <- so_data %>%
+  select(YearsCodedJob, LearningNewTech, Country) %>%
+  filter(Country == "Pakistan") %>%
+  mutate(YearsCodedJob = as.character(YearsCodedJob)) %>%
+  na.omit() %>%
+  mutate(YearsCodedJob = ifelse(YearsCodedJob == "Less than a year", "1 year", YearsCodedJob)) %>%
+  # 1 = any form of agreement, 0 = everything else.
+  mutate(LearningNewTech = ifelse(
+    LearningNewTech %in% c("Agree", "Strongly agree", "Somewhat agree"), 1, 0
+  )) %>%
+  # Same leading-number extraction as above, again without `%<>%`.
+  mutate(YearsCodedJob = as.integer(str_trim(str_sub(YearsCodedJob, 1, 2))))
+
+
+# The plot shows the share of respondents who agree (mean of the 0/1 flag).
+learningnew_tech %>%
+  arrange(YearsCodedJob) %>%
+  group_by(YearsCodedJob) %>%
+  summarise(learning_new_tech = mean(LearningNewTech)) %>%
+  ggplot(aes(x = YearsCodedJob, y = learning_new_tech)) +
+  geom_point() +
+  geom_smooth(level = 0, method = "loess") +
+  labs(title = "The relation between Learning New Tech and working years in Pakistan")
+
+# NOTE(review): the export holds the *number* of agreeing respondents (sum)
+# while the plot above shows their *share* (mean). Left unchanged so the JSON
+# stays as it was; confirm which of the two is wanted.
+df2 <- learningnew_tech %>%
+  arrange(YearsCodedJob) %>%
+  group_by(YearsCodedJob) %>%
+  summarise(learning_new_tech = sum(LearningNewTech))
+
+exportJson2 <- df2 %>% toJSON(pretty = TRUE)
+write(exportJson2, "Pak_LearningNewTech.json")
+
+# Most popular languages among students in Pakistan (tested)
+
+language_students <- so_data %>%
+  filter(Country == "Pakistan") %>%
+  filter(Professional == "Student") %>%
+  select(Professional, HaveWorkedLanguage) %>%
+  na.omit() %>%
+  mutate_if(is.factor, as.character) %>%
+  mutate(HaveWorkedLanguage = gsub(" ", "", HaveWorkedLanguage)) %>%
+  pull(HaveWorkedLanguage) %>%
+  str_c(collapse = ";") %>%
+  str_split(";") %>%
+  unlist() %>%
+  table() %>%
+  as.data.frame()
+
+colnames(language_students) <- c("language", "count")
+language_students <- language_students %>%
+  mutate(language = as.character(language))
+
+
+ggplot(data = language_students, aes(x = language, y = count)) +
+  geom_bar(stat = "identity") +
+  # The x axis lists languages; the label used to say "Developer type".
+  xlab("Language") +
+  ylab("Number of Students") +
+  labs(title = "Popular Languages among Students in Pakistan ")
+
+exportJson2 <- language_students %>% toJSON(pretty = TRUE)
+write(exportJson2, "Languages_used_by_students_in_pakistan.json")
+
+
+# Female friendly languages in Pakistan (tested)
+
+language <- so_data %>%
+  filter(grepl("Female", Gender)) %>%
+  filter(Country == "Pakistan") %>%
+  select(Gender, HaveWorkedLanguage) %>%
+  na.omit() %>%
+  mutate_if(is.factor, as.character) %>%
+  mutate(HaveWorkedLanguage = gsub(" ", "", HaveWorkedLanguage)) %>%
+  pull(HaveWorkedLanguage) %>%
+  str_c(collapse = ";") %>%
+  str_split(";") %>%
+  unlist() %>%
+  table() %>%
+  as.data.frame()
+
+colnames(language) <- c("language", "count")
+
+# A detached `group_by(language) %>% summarise(Total = round(n()))` statement
+# used to follow here (the pipe after mutate() was missing), so its result was
+# discarded. Wiring it into the pipeline would drop the `count` column that the
+# export and the plot below use, so the statement is removed instead.
+female_language <- language %>%
+  mutate(language = as.character(language))
+
+# Convert to factor to retain the current row order in the plot.
+female_language$language <- factor(female_language$language, levels = female_language$language)
+
+exportJson2 <- female_language %>% toJSON(pretty = TRUE)
+
+write(exportJson2, "Lamguages_used_by_females_in_pakistan.json")
+
+# NOTE(review): ylim(c(0, 30)) is hard-coded; a language counted more than 30
+# times would presumably fall outside the scale and not be drawn -- confirm.
+ggplot(female_language, aes(x = language, y = count, fill = language)) +
+  geom_bar(width = 0.85, stat = "identity") +
+  coord_polar(theta = "y") +
+  xlab("") + ylab("") +
+  ylim(c(0, 30)) +
+  geom_text(data = female_language, hjust = 1, size = 3, aes(x = language, y = 0, label = language)) +
+  theme(legend.position = "right", axis.text.y = element_blank(), axis.ticks = element_blank()) +
+  labs(title = "Most used languages by females in Pakistan")
+
+
+# Unemployment of each developer type in Pakistan, China, India and the
+# United States (tested)
+
+# Note: express the unemployment count as a percentage, since the number of
+# respondents per developer type differs between countries.
+
+# Total number of answers per country and developer type.
+count_dev <- so_data %>%
+  select(Country, DeveloperType) %>%
+  filter(Country %in% c("Pakistan", "China", "United States", "India")) %>%
+  na.omit() %>%
+  mutate_if(is.factor, as.character) %>%
+  mutate(DeveloperType = gsub(" ", "", DeveloperType)) %>%
+  mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+  unnest(DeveloperType) %>%
+  mutate(DeveloperType = trimws(DeveloperType)) %>%
+  # A gather(Country, DeveloperType) step used to sit here. With no columns
+  # selected it stacks *both* columns into key/value pairs, so Country ended up
+  # holding the column names and the join below could never match a row.
+  group_by(Country, DeveloperType) %>%
+  filter(DeveloperType != "Other") %>%
+  summarise(Total_Count = n())
+
+
+# Same count, restricted to respondents who are not employed.
+unemployment_dev <- so_data %>%
+  select(Country, DeveloperType, EmploymentStatus) %>%
+  filter(
+    Country %in% c("Pakistan", "China", "United States", "India"),
+    EmploymentStatus %in% c(
+      "Not employed, and not looking for work",
+      "Not employed, but looking for work"
+    )
+  ) %>%
+  na.omit() %>%
+  mutate_if(is.factor, as.character) %>%
+  mutate(DeveloperType = gsub(" ", "", DeveloperType)) %>%
+  mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>%
+  unnest(DeveloperType) %>%
+  mutate(DeveloperType = trimws(DeveloperType)) %>%
+  # EmploymentStatus has done its job in the filter; the recode that used to
+  # precede this select() was discarded immediately and has been dropped.
+  select(-EmploymentStatus) %>%
+  group_by(Country, DeveloperType) %>%
+  filter(DeveloperType != "Other") %>%
+  summarise(Unemployment_count = n())
+
+# `by` is the join-key argument; the `id = ...` that was passed before is not
+# an argument of left_join().
+unemployment <- count_dev %>%
+  left_join(unemployment_dev, by = c("Country", "DeveloperType")) %>%
+  ungroup() %>%
+  # Combinations without any unemployed respondent have no match: count them as 0.
+  mutate(Unemployment_count = coalesce(Unemployment_count, 0L))
+
+
+
+
+# The data frame is prepared; it only needs to be plotted here.
+
+setDT(unemployment)
+unemployment_melt <- melt(unemployment)
+
+ggplot(data = unemployment_melt, aes(x = DeveloperType, y = value, fill = variable)) +
+  theme(axis.text.x = element_text(face = "bold", color = "#993333", size = 8, angle = 90)) +
+  ggtitle("Unemployment among Developers across the world") +
+  geom_bar(stat = "identity") +
+  xlab("Developer type") +
+  ylab("Number of Respondents")
+
+
+
+
+
+
+
+
+# Find the gender pay gap in the countries with the most respondents (test it)
+
+# Number of respondents per country (all rows, regardless of gender); the 20
+# largest countries are kept.
+top_countries <- so_data %>%
+  group_by(Country) %>%
+  summarise(Count = n()) %>%
+  arrange(desc(Count)) %>%
+  head(20)
+
+top_20C <- top_countries %>% pull(Country)
+# Number of male / female respondents in each of the top countries. Only the
+# exact answers "Male" and "Female" are counted; Total is their sum.
+
+
+country_mf <- so_data %>%
+  filter(Country %in% top_countries$Country) %>%
+  group_by(Country) %>%
+  summarise(
+    Males = sum(!is.na(Gender) & Gender == "Male"),
+    Females = sum(!is.na(Gender) & Gender == "Female")
+  ) %>%
+  mutate(Total = Males + Females)
+
+# Mean salary per gender within the top 20 countries (male and female
+# respondents with a reported salary only).
+
+
+
+pay_gap <- so_data %>%
+  select(Country, Gender, Salary) %>%
+  na.omit() %>%
+  filter(Country %in% top_20C & Gender %in% c("Male", "Female")) %>%
+  mutate(Country = as.character(Country), Gender = as.character(Gender)) %>%
+  group_by(Gender, Country) %>%
+  # Salary is the only non-grouping column, so this equals the former
+  # summarise_all(funs(mean)) without the deprecated funs().
+  summarise(Salary = mean(Salary))
+
+# Absolute difference between the two gender means of each country. A country
+# with a single gender row yields an empty diff() and therefore no output row,
+# which replaces the former c(NA, diff()) + na.omit() detour and avoids using
+# `:=` on a table that had been passed through dplyr::select().
+pay_gapDt <- as.data.table(pay_gap)
+pay_gapDt <- pay_gapDt[, .(Pay_Gap = abs(diff(Salary))), by = Country]
+setorder(pay_gapDt, -Pay_Gap)
+
+# inner_join() keeps the row order of its left input, so the exports below stay
+# sorted by descending pay gap.
+genderPayGap <- pay_gapDt %>%
+  inner_join(country_mf, by = "Country") %>%
+  mutate(Ratio_of_female_devs_percentage = as.double(Females / Total * 100))
+
+exportJson2 <- genderPayGap %>%
+  select(Country, Males, Females, Ratio_of_female_devs_percentage) %>%
+  toJSON(pretty = TRUE)
+
+
+write(exportJson2, "%_Female_dev_in_countries_with_highest_no_of_respondents.json")
+
+exportJson2 <- genderPayGap %>%
+  select(Country, Males, Females, Pay_Gap) %>%
+  toJSON(pretty = TRUE)
+
+write(exportJson2, "Avg_pay_gap_in_countries_with_highest_no_of_respondents.json")
+
+# draw two maps here
+# one showing gender equality in the work force
+# other showing gender pay gap
+
+
diff --git a/r-scripts/stackoverflow_json_files/%_Female_dev_in_countries_with_highest_no_of_respondents.json b/r-scripts/stackoverflow_json_files/%_Female_dev_in_countries_with_highest_no_of_respondents.json
new file mode 100644
index 0000000..f5b3c66
--- /dev/null
+++ 
b/r-scripts/stackoverflow_json_files/%_Female_dev_in_countries_with_highest_no_of_respondents.json @@ -0,0 +1,122 @@ +[ + { + "Country": "Australia", + "Males": 634, + "Females": 40, + "Ratio_of_female_devs_percentage": 5.9347 + }, + { + "Country": "Israel", + "Males": 346, + "Females": 40, + "Ratio_of_female_devs_percentage": 10.3627 + }, + { + "Country": "Netherlands", + "Males": 598, + "Females": 29, + "Ratio_of_female_devs_percentage": 4.6252 + }, + { + "Country": "Canada", + "Males": 1446, + "Females": 150, + "Ratio_of_female_devs_percentage": 9.3985 + }, + { + "Country": "Brazil", + "Males": 491, + "Females": 25, + "Ratio_of_female_devs_percentage": 4.845 + }, + { + "Country": "Romania", + "Males": 362, + "Females": 29, + "Ratio_of_female_devs_percentage": 7.4169 + }, + { + "Country": "United States", + "Males": 7447, + "Females": 847, + "Ratio_of_female_devs_percentage": 10.2122 + }, + { + "Country": "United Kingdom", + "Males": 2954, + "Females": 221, + "Ratio_of_female_devs_percentage": 6.9606 + }, + { + "Country": "Spain", + "Males": 518, + "Females": 41, + "Ratio_of_female_devs_percentage": 7.3345 + }, + { + "Country": "France", + "Males": 1094, + "Females": 64, + "Ratio_of_female_devs_percentage": 5.5268 + }, + { + "Country": "Poland", + "Males": 806, + "Females": 98, + "Ratio_of_female_devs_percentage": 10.8407 + }, + { + "Country": "India", + "Males": 2422, + "Females": 230, + "Ratio_of_female_devs_percentage": 8.6727 + }, + { + "Country": "Switzerland", + "Males": 398, + "Females": 13, + "Ratio_of_female_devs_percentage": 3.163 + }, + { + "Country": "Italy", + "Males": 515, + "Females": 16, + "Ratio_of_female_devs_percentage": 3.0132 + }, + { + "Country": "Russian Federation", + "Males": 542, + "Females": 31, + "Ratio_of_female_devs_percentage": 5.4101 + }, + { + "Country": "Pakistan", + "Males": 231, + "Females": 15, + "Ratio_of_female_devs_percentage": 6.0976 + }, + { + "Country": "Iran", + "Males": 260, + "Females": 13, + 
"Ratio_of_female_devs_percentage": 4.7619 + }, + { + "Country": "Germany", + "Males": 2740, + "Females": 153, + "Ratio_of_female_devs_percentage": 5.2886 + }, + { + "Country": "Austria", + "Males": 322, + "Females": 23, + "Ratio_of_female_devs_percentage": 6.6667 + }, + { + "Country": "Sweden", + "Males": 418, + "Females": 18, + "Ratio_of_female_devs_percentage": 4.1284 + } +] diff --git a/r-scripts/stackoverflow_json_files/Avg_pay_gap_in_countries_with_highest_no_of_respondents.json b/r-scripts/stackoverflow_json_files/Avg_pay_gap_in_countries_with_highest_no_of_respondents.json new file mode 100644 index 0000000..ff06a22 --- /dev/null +++ b/r-scripts/stackoverflow_json_files/Avg_pay_gap_in_countries_with_highest_no_of_respondents.json @@ -0,0 +1,122 @@ +[ + { + "Country": "Australia", + "Males": 634, + "Females": 40, + "Pay_Gap": 10737.1317 + }, + { + "Country": "Israel", + "Males": 346, + "Females": 40, + "Pay_Gap": 9251.1895 + }, + { + "Country": "Netherlands", + "Males": 598, + "Females": 29, + "Pay_Gap": 9246.5287 + }, + { + "Country": "Canada", + "Males": 1446, + "Females": 150, + "Pay_Gap": 8815.6772 + }, + { + "Country": "Brazil", + "Males": 491, + "Females": 25, + "Pay_Gap": 8303.0236 + }, + { + "Country": "Romania", + "Males": 362, + "Females": 29, + "Pay_Gap": 7984.4905 + }, + { + "Country": "United States", + "Males": 7447, + "Females": 847, + "Pay_Gap": 7472.3785 + }, + { + "Country": "United Kingdom", + "Males": 2954, + "Females": 221, + "Pay_Gap": 6837.036 + }, + { + "Country": "Spain", + "Males": 518, + "Females": 41, + "Pay_Gap": 6231.8437 + }, + { + "Country": "France", + "Males": 1094, + "Females": 64, + "Pay_Gap": 5134.0087 + }, + { + "Country": "Poland", + "Males": 806, + "Females": 98, + "Pay_Gap": 4981.3313 + }, + { + "Country": "India", + "Males": 2422, + "Females": 230, + "Pay_Gap": 4882.4146 + }, + { + "Country": "Switzerland", + "Males": 398, + "Females": 13, + "Pay_Gap": 4543.6383 + }, + { + "Country": "Italy", + "Males": 515, + 
"Females": 16, + "Pay_Gap": 4349.3793 + }, + { + "Country": "Russian Federation", + "Males": 542, + "Females": 31, + "Pay_Gap": 2943.8532 + }, + { + "Country": "Pakistan", + "Males": 231, + "Females": 15, + "Pay_Gap": 2832.3967 + }, + { + "Country": "Iran", + "Males": 260, + "Females": 13, + "Pay_Gap": 2130.0627 + }, + { + "Country": "Germany", + "Males": 2740, + "Females": 153, + "Pay_Gap": 2116.6578 + }, + { + "Country": "Austria", + "Males": 322, + "Females": 23, + "Pay_Gap": 1529.8644 + }, + { + "Country": "Sweden", + "Males": 418, + "Females": 18, + "Pay_Gap": 1270.0568 + } +] diff --git a/r-scripts/stackoverflow_json_files/Job_Satisfaction_Of_Each_Developer_Type_In_4_Big_IT-Countires.json b/r-scripts/stackoverflow_json_files/Job_Satisfaction_Of_Each_Developer_Type_In_4_Big_IT-Countires.json new file mode 100644 index 0000000..b80226a --- /dev/null +++ b/r-scripts/stackoverflow_json_files/Job_Satisfaction_Of_Each_Developer_Type_In_4_Big_IT-Countires.json @@ -0,0 +1,282 @@ +[ + { + "DeveloperType": "Database administrator", + "Country": "China", + "JobSatisfaction": 95 + }, + { + "DeveloperType": "Database administrator", + "Country": "India", + "JobSatisfaction": 2391 + }, + { + "DeveloperType": "Database administrator", + "Country": "Pakistan", + "JobSatisfaction": 359 + }, + { + "DeveloperType": "Database administrator", + "Country": "United States", + "JobSatisfaction": 8258 + }, + { + "DeveloperType": "Data scientist", + "Country": "China", + "JobSatisfaction": 101 + }, + { + "DeveloperType": "Data scientist", + "Country": "India", + "JobSatisfaction": 1452 + }, + { + "DeveloperType": "Data scientist", + "Country": "Pakistan", + "JobSatisfaction": 160 + }, + { + "DeveloperType": "Data scientist", + "Country": "United States", + "JobSatisfaction": 4955 + }, + { + "DeveloperType": "Desktop applications developer", + "Country": "China", + "JobSatisfaction": 223 + }, + { + "DeveloperType": "Desktop applications developer", + "Country": "India", + 
"JobSatisfaction": 3731 + }, + { + "DeveloperType": "Desktop applications developer", + "Country": "Pakistan", + "JobSatisfaction": 570 + }, + { + "DeveloperType": "Desktop applications developer", + "Country": "United States", + "JobSatisfaction": 15897 + }, + { + "DeveloperType": "Developer with a statistics or mathematics background", + "Country": "China", + "JobSatisfaction": 114 + }, + { + "DeveloperType": "Developer with a statistics or mathematics background", + "Country": "India", + "JobSatisfaction": 2145 + }, + { + "DeveloperType": "Developer with a statistics or mathematics background", + "Country": "Pakistan", + "JobSatisfaction": 250 + }, + { + "DeveloperType": "Developer with a statistics or mathematics background", + "Country": "United States", + "JobSatisfaction": 6799 + }, + { + "DeveloperType": "DevOps specialist", + "Country": "China", + "JobSatisfaction": 117 + }, + { + "DeveloperType": "DevOps specialist", + "Country": "India", + "JobSatisfaction": 1250 + }, + { + "DeveloperType": "DevOps specialist", + "Country": "Pakistan", + "JobSatisfaction": 116 + }, + { + "DeveloperType": "DevOps specialist", + "Country": "United States", + "JobSatisfaction": 7407 + }, + { + "DeveloperType": "Embedded applications/devices developer", + "Country": "China", + "JobSatisfaction": 93 + }, + { + "DeveloperType": "Embedded applications/devices developer", + "Country": "India", + "JobSatisfaction": 1087 + }, + { + "DeveloperType": "Embedded applications/devices developer", + "Country": "Pakistan", + "JobSatisfaction": 77 + }, + { + "DeveloperType": "Embedded applications/devices developer", + "Country": "United States", + "JobSatisfaction": 5054 + }, + { + "DeveloperType": "Graphic designer", + "Country": "China", + "JobSatisfaction": 22 + }, + { + "DeveloperType": "Graphic designer", + "Country": "India", + "JobSatisfaction": 859 + }, + { + "DeveloperType": "Graphic designer", + "Country": "Pakistan", + "JobSatisfaction": 179 + }, + { + "DeveloperType": "Graphic 
designer", + "Country": "United States", + "JobSatisfaction": 1938 + }, + { + "DeveloperType": "Graphics programming", + "Country": "China", + "JobSatisfaction": 66 + }, + { + "DeveloperType": "Graphics programming", + "Country": "India", + "JobSatisfaction": 484 + }, + { + "DeveloperType": "Graphics programming", + "Country": "Pakistan", + "JobSatisfaction": 111 + }, + { + "DeveloperType": "Graphics programming", + "Country": "United States", + "JobSatisfaction": 2321 + }, + { + "DeveloperType": "Machine learning specialist", + "Country": "China", + "JobSatisfaction": 40 + }, + { + "DeveloperType": "Machine learning specialist", + "Country": "India", + "JobSatisfaction": 711 + }, + { + "DeveloperType": "Machine learning specialist", + "Country": "Pakistan", + "JobSatisfaction": 81 + }, + { + "DeveloperType": "Machine learning specialist", + "Country": "United States", + "JobSatisfaction": 2162 + }, + { + "DeveloperType": "Mobile developer", + "Country": "China", + "JobSatisfaction": 206 + }, + { + "DeveloperType": "Mobile developer", + "Country": "India", + "JobSatisfaction": 6600 + }, + { + "DeveloperType": "Mobile developer", + "Country": "Pakistan", + "JobSatisfaction": 775 + }, + { + "DeveloperType": "Mobile developer", + "Country": "United States", + "JobSatisfaction": 11007 + }, + { + "DeveloperType": "Other", + "Country": "China", + "JobSatisfaction": 40 + }, + { + "DeveloperType": "Other", + "Country": "India", + "JobSatisfaction": 933 + }, + { + "DeveloperType": "Other", + "Country": "Pakistan", + "JobSatisfaction": 62 + }, + { + "DeveloperType": "Other", + "Country": "United States", + "JobSatisfaction": 6196 + }, + { + "DeveloperType": "Quality assurance engineer", + "Country": "China", + "JobSatisfaction": 23 + }, + { + "DeveloperType": "Quality assurance engineer", + "Country": "India", + "JobSatisfaction": 679 + }, + { + "DeveloperType": "Quality assurance engineer", + "Country": "Pakistan", + "JobSatisfaction": 105 + }, + { + "DeveloperType": 
"Quality assurance engineer", + "Country": "United States", + "JobSatisfaction": 2362 + }, + { + "DeveloperType": "Systems administrator", + "Country": "China", + "JobSatisfaction": 98 + }, + { + "DeveloperType": "Systems administrator", + "Country": "India", + "JobSatisfaction": 1053 + }, + { + "DeveloperType": "Systems administrator", + "Country": "Pakistan", + "JobSatisfaction": 131 + }, + { + "DeveloperType": "Systems administrator", + "Country": "United States", + "JobSatisfaction": 6582 + }, + { + "DeveloperType": "Web developer", + "Country": "China", + "JobSatisfaction": 702 + }, + { + "DeveloperType": "Web developer", + "Country": "India", + "JobSatisfaction": 14604 + }, + { + "DeveloperType": "Web developer", + "Country": "Pakistan", + "JobSatisfaction": 1516 + }, + { + "DeveloperType": "Web developer", + "Country": "United States", + "JobSatisfaction": 41856 + } +] diff --git a/r-scripts/stackoverflow_json_files/Languages_used_by_females_in_pakistan.json b/r-scripts/stackoverflow_json_files/Languages_used_by_females_in_pakistan.json new file mode 100644 index 0000000..84ba6fb --- /dev/null +++ b/r-scripts/stackoverflow_json_files/Languages_used_by_females_in_pakistan.json @@ -0,0 +1,66 @@ +[ + { + "language": "Assembly", + "count": 2 + }, + { + "language": "C", + "count": 5 + }, + { + "language": "C#", + "count": 8 + }, + { + "language": "C++", + "count": 7 + }, + { + "language": "Java", + "count": 8 + }, + { + "language": "JavaScript", + "count": 10 + }, + { + "language": "Matlab", + "count": 3 + }, + { + "language": "Objective-C", + "count": 1 + }, + { + "language": "PHP", + "count": 7 + }, + { + "language": "Python", + "count": 3 + }, + { + "language": "Scala", + "count": 1 + }, + { + "language": "SQL", + "count": 9 + }, + { + "language": "Swift", + "count": 1 + }, + { + "language": "VBA", + "count": 1 + }, + { + "language": "VB.NET", + "count": 4 + }, + { + "language": "VisualBasic6", + "count": 3 + } +] diff --git 
a/r-scripts/stackoverflow_json_files/Languages_used_by_students_in_pakistan.json b/r-scripts/stackoverflow_json_files/Languages_used_by_students_in_pakistan.json new file mode 100644 index 0000000..0cccb02 --- /dev/null +++ b/r-scripts/stackoverflow_json_files/Languages_used_by_students_in_pakistan.json @@ -0,0 +1,66 @@ +[ + { + "language": "Assembly", + "count": 8 + }, + { + "language": "C", + "count": 22 + }, + { + "language": "C#", + "count": 20 + }, + { + "language": "C++", + "count": 28 + }, + { + "language": "Haskell", + "count": 1 + }, + { + "language": "Java", + "count": 27 + }, + { + "language": "JavaScript", + "count": 24 + }, + { + "language": "Matlab", + "count": 7 + }, + { + "language": "PHP", + "count": 21 + }, + { + "language": "Python", + "count": 15 + }, + { + "language": "R", + "count": 3 + }, + { + "language": "Ruby", + "count": 3 + }, + { + "language": "SQL", + "count": 19 + }, + { + "language": "TypeScript", + "count": 1 + }, + { + "language": "VB.NET", + "count": 3 + }, + { + "language": "VisualBasic6", + "count": 2 + } +] diff --git a/r-scripts/stackoverflow_json_files/Relaton_between_haveWorked_and_WantToWork_with_Languages_in_pakistan.json b/r-scripts/stackoverflow_json_files/Relaton_between_haveWorked_and_WantToWork_with_Languages_in_pakistan.json new file mode 100644 index 0000000..df9b809 --- /dev/null +++ b/r-scripts/stackoverflow_json_files/Relaton_between_haveWorked_and_WantToWork_with_Languages_in_pakistan.json @@ -0,0 +1,152 @@ +[ + { + "language": "Assembly", + "want": 8, + "worked": 29 + }, + { + "language": "C", + "want": 27, + "worked": 58 + }, + { + "language": "C#", + "want": 87, + "worked": 109 + }, + { + "language": "C++", + "want": 36, + "worked": 75 + }, + { + "language": "CoffeeScript", + "want": 9, + "worked": 6 + }, + { + "language": "CommonLisp", + "want": 1, + "worked": 1 + }, + { + "language": "Dart", + "want": 5, + "worked": 1 + }, + { + "language": "Elixir", + "want": 8, + "worked": 1 + }, + { + "language": "Go", + 
"want": 28, + "worked": 1 + }, + { + "language": "Groovy", + "want": 4, + "worked": 5 + }, + { + "language": "Haskell", + "want": 4, + "worked": 1 + }, + { + "language": "Java", + "want": 105, + "worked": 111 + }, + { + "language": "JavaScript", + "want": 124, + "worked": 161 + }, + { + "language": "Julia", + "want": 1, + "worked": 2 + }, + { + "language": "Lua", + "want": 4, + "worked": 4 + }, + { + "language": "Matlab", + "want": 13, + "worked": 16 + }, + { + "language": "Objective-C", + "want": 33, + "worked": 21 + }, + { + "language": "Perl", + "want": 6, + "worked": 4 + }, + { + "language": "PHP", + "want": 72, + "worked": 133 + }, + { + "language": "Python", + "want": 87, + "worked": 47 + }, + { + "language": "R", + "want": 16, + "worked": 5 + }, + { + "language": "Ruby", + "want": 45, + "worked": 18 + }, + { + "language": "Scala", + "want": 11, + "worked": 5 + }, + { + "language": "Smalltalk", + "want": 2, + "worked": 1 + }, + { + "language": "SQL", + "want": 81, + "worked": 134 + }, + { + "language": "Swift", + "want": 45, + "worked": 14 + }, + { + "language": "TypeScript", + "want": 37, + "worked": 11 + }, + { + "language": "VBA", + "want": 1, + "worked": 3 + }, + { + "language": "VB.NET", + "want": 11, + "worked": 22 + }, + { + "language": "VisualBasic6", + "want": 6, + "worked": 10 + } +] diff --git a/r-scripts/stackoverflow_json_files/word_cloud_technologies.png b/r-scripts/stackoverflow_json_files/word_cloud_technologies.png new file mode 100644 index 0000000..15aa79f Binary files /dev/null and b/r-scripts/stackoverflow_json_files/word_cloud_technologies.png differ diff --git a/r-scripts/wordembeddings.R b/r-scripts/wordembeddings.R index 255e5b0..d02341b 100644 --- a/r-scripts/wordembeddings.R +++ b/r-scripts/wordembeddings.R @@ -1,6 +1,62 @@ +library(wordcloud) +library(RColorBrewer) +library(tm) +library(Rtsne) +library(text2vec) + +# ## gender and developer type +# +# genderDev=so_data %>% +# select(DeveloperType,Country) %>% +# 
filter(Country=='Pakistan') %>% +# na.omit() %>% +# mutate(gender_developer=gsub(" ","",DeveloperType)) %>% +# pull(gender_developer) +# +# write(genderDev,'gender_dev.txt') +# prep_word2vec(origin='gender_dev.txt',destination = 'gender_dev_vectors.txt',lowercase = T) +# model = train_word2vec("gender_dev_vectors.txt",vectors=100,threads=4,window=5,iter=1000,negative_samples=0,force = T) +# word2vec=model[2:nrow(model),] # remove at end of string + + +#tsne (dimensionality reduction) +# reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50, +# perplexity = 2, theta = 0.05, check_duplicates = F, +# pca = F, max_iter = 1000, verbose = T, +# is_distance = F, Y_init = NULL) +# df <- as.data.frame(reduction$Y) +# rows <- rownames(word2vec) +#rownames(df) <- rows +#df=df %>% filter(rownames(df) != '') +#rownames(df) <- rows[2:65] + +# Create t-SNE plot and save as jpeg +# ggplot(df) + +# geom_point(aes(x = V1, y = V2), color = "red") + +# geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + +# xlab("Dimension 1") + +# ylab("Dimension 2 ") + +# # geom_text(fontface = 2, alpha = .8) + +# theme_bw(base_size = 12) + +# theme(legend.position = "none") + +# ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) +# +# pca = prcomp(word2vec,scale=T) +# plot(pca$x, t='n', main="pca") +# text(pca$x, labels=rownames(df)) +# x=pca$x[,1] +# y=pca$x[,2] +# +# hc = hclust(dist(cbind(x,y)), method = 'ward.D') +# plot(hc, axes=F,xlab='', ylab='',sub ='', main='Comp 1/2') +# rect.hclust(hc, k=5, border='red') + +###################################### + + + concat_features=so_data %>% select(IDE,HaveWorkedLanguage,Country,HaveWorkedDatabase,HaveWorkedFramework,HaveWorkedPlatform,WantWorkLanguage,WantWorkFramework,WantWorkDatabase,WantWorkPlatform) %>% - filter(Country=='Pakistan') %>% select(-Country) %>% na.omit(.) 
%>% mutate_if(is.factor,as.character) %>% @@ -24,9 +80,79 @@ concat_features=so_data %>% embeddingAllWant=unlist(strsplit(as.character(vectorAllWant),';')) embeddingAllHave=unlist(strsplit(as.character(vectorAllHave),';')) + + all=c(embeddingAllWant,embeddingAllHave) + doc=Corpus(VectorSource(all)) + dtm <- TermDocumentMatrix(doc) + m <- as.matrix(dtm) + v <- sort(rowSums(m),decreasing=TRUE) + d <- data.frame(word = names(v),freq=v) + set.seed(1234) + wordcloud(words = d$word, freq = d$freq, min.freq = 1, + max.words=200, random.order=FALSE, rot.per=0.35, + colors=brewer.pal(8, "Dark2")) + + + + write(all,'features.txt') + write(embeddingAllHave,'features_have.txt') + # question (should we include all and have as one vector and then form term document matrix ?) + # embedding is a character array of all the features. We will vectorize them using word2vec + # and tsne will be used for dimensionality reduction + + # word2vec hyper parameter tuning + vectors=seq(100,500) + min_count=seq(5,10) + negative_samples=seq(5,15) + iter=seq(100,1000) + window=seq(1,20) + + + ################################## + + prep_word2vec(origin='features_have.txt',destination = 'vectors.txt',lowercase = T) + model = train_word2vec("vectors.txt",vectors=300,threads=4,window=5,iter=1000,negative_samples=0,min_count = 10,force = T) + word2vec=model[2:nrow(model),] # remove at end of string + #tsne (dimensionality reduction) + reduction <- Rtsne(word2vec, dims = 2, initial_dims = 50, + perplexity = 10, theta = 0.05, check_duplicates = F, + pca = F, max_iter = 1000, verbose = T, + is_distance = F, Y_init = NULL) + df <- as.data.frame(reduction$Y) + rows <- rownames(word2vec) + rownames(df) <- rows + #df=df %>% filter(rownames(df) != '') + #rownames(df) <- rows[2:65] + + # Create t-SNE plot and save as jpeg + ggplot(df) + + geom_point(aes(x = V1, y = V2), color = "red") + + geom_text_repel(aes(x = V1, y = V2, label = rownames(df))) + + xlab("Dimension 1") + + ylab("Dimension 2 ") + + # 
geom_text(fontface = 2, alpha = .8) + + theme_bw(base_size = 12) + + theme(legend.position = "none") + + ggtitle(paste0("2D reduction of Word Embedding Model on stack over flow data of pakistan using t_SNE")) + + + pca = prcomp(word2vec,scale=T) + plot(pca$x, t='n', main="pca") + text(pca$x, labels=rownames(df)) + x=pca$x[,1] + y=pca$x[,2] + -# embedding is a character array of all the features. We will vectorize them using word2vec -# and tsne will be used for dimensionality reduction - - #word2vec=read.binary.vectors(filename='/home/mustufain/Downloads/GoogleNews-vectors-negative300.bin') - \ No newline at end of file + # hierarchical clustering using pca + hc = hclust(dist(cbind(x,y)), method = 'ward.D') + plot(hc, axes=F,xlab='', ylab='',sub ='', main='Comp 1/2') + rect.hclust(hc, k=3, border='red') + + # hierachical clustering using tsne + + hc = hclust(dist(cbind(xt,yt)), method = 'ward.D') + plot(hc, axes=F,xlab='', ylab='',sub ='', main='Comp 1/2') + rect.hclust(hc, k=3, border='red') + + + ggsave('wordembeddings.png') \ No newline at end of file diff --git a/r-shiny-app/.RData b/r-shiny-app/.RData new file mode 100644 index 0000000..51cced9 Binary files /dev/null and b/r-shiny-app/.RData differ diff --git a/r-shiny-app/.Rhistory b/r-shiny-app/.Rhistory new file mode 100644 index 0000000..8c1d3cf --- /dev/null +++ b/r-shiny-app/.Rhistory @@ -0,0 +1,512 @@ +gather(Country, DeveloperType) %>% +group_by(Country,DeveloperType) %>% +filter(DeveloperType!='Other') +so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') %>% +na.omit(.) 
%>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) +so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India' | EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work') %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +gather(Country, DeveloperType) %>% +group_by(Country,DeveloperType) %>% +filter(DeveloperType!='Other') +so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India' | EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work') %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) +so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter(Country=='Pakistan' | EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work') %>% +na.omit(.) 
%>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) +count_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter(Country=='Pakistan' | EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work') %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) +count_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India' | EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work') %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) +count_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +gather(Country, DeveloperType) %>% +group_by(Country,DeveloperType) %>% +filter(DeveloperType!='Other') %>% +summarise(Count= n()) +count_dev<-so_data%>%select(Country,DeveloperType) %>% +filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') %>% +na.omit(.) 
%>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +gather(Country, DeveloperType) %>% +group_by(Country,DeveloperType) %>% +filter(DeveloperType!='Other') %>% +summarise(Count= n()) +View(count_dev) +so_data%>%select(Country,DeveloperType) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) +so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) 
%>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) +View(unemployment_dev) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) +View(unemployment_dev) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) 
%>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>% +gather(Country,DeveloperType) %>% +group_by(Country,DeveloperType) %>% +filter(DeveloperType!='Other') %>% +summarise(Count= n()) +View(unemployment_dev) +so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>% +gather(Country,EmploymentStatus,DeveloperType) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) 
%>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>% +gather(Country,EmploymentStatus,DeveloperType) %>% +group_by(Country,DeveloperType) %>% +filter(DeveloperType!='Other') %>% +summarise(Count= n()) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>% +gather(Country,EmploymentStatus,DeveloperType) %>% +group_by(Country,DeveloperType,DeveloperType) %>% +filter(DeveloperType!='Other') %>% +summarise(Count= n()) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) 
%>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>% +gather(Country,EmploymentStatus,DeveloperType) %>% head() +View(unemployment_dev) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>% +gather(Country,Val,EmploymentStatus:DeveloperType) %>% head() +View(unemployment_dev) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) 
%>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) +View(unemployment_dev) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>% +select(-EmploymentStatus) +group_by(Country,DeveloperType) %>% +filter(DeveloperType!='Other') %>% +summarise(Count= n()) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) 
%>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>% +select(-EmploymentStatus) %>% +group_by(Country,DeveloperType) %>% +filter(DeveloperType!='Other') %>% +summarise(Count= n()) +View(unemployment_dev) +View(count_dev) +count_dev %>% +inner_join(id=c('Country','DeveloperType'),unemployment_dev) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>% +select(-EmploymentStatus) %>% +group_by(Country,DeveloperType) %>% +filter(DeveloperType!='Other') %>% +summarise(Count= n()) +unemployment %>% count_dev %>% +inner_join(id=c('Country','DeveloperType'),unemployment_dev) +count_dev %>% +inner_join(id=c('Country','DeveloperType'),unemployment_dev) +count_dev +unemployment_dev +unemployment <- count_dev %>% +inner_join(id=c('Country','DeveloperType'),unemployment_dev) +View(unemployment) +count_dev +unemployment_dev +count_dev %>% +inner_join(id=c('Country','DeveloperType'),unemployment_dev) +unemployment <- count_dev %>% 
+lef_join(id=c('Country','DeveloperType'),unemployment_dev) +count_dev %>% +left_join(id=c('Country','DeveloperType'),unemployment_dev) +count_dev %>% +right_join(id=c('Country','DeveloperType'),unemployment_dev) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>% +select(-EmploymentStatus) %>% +group_by(Country,DeveloperType) %>% +filter(DeveloperType!='Other') %>% +summarise(freq= n()) +count_dev %>% +inner_join(id=c('Country','DeveloperType'),unemployment_dev) +unemployment <- count_dev %>% +inner_join(id=c('Country','DeveloperType'),unemployment_dev) +View(unemployment) +count_dev<-so_data%>%select(Country,DeveloperType) %>% +filter(Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') %>% +na.omit(.) 
%>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +gather(Country, DeveloperType) %>% +group_by(Country,DeveloperType) %>% +filter(DeveloperType!='Other') %>% +summarise(Total_Count= n()) +unemployment_dev<-so_data%>%select(Country,DeveloperType,EmploymentStatus) %>% +filter((Country=='Pakistan' | Country=='China' | Country == 'United States' | Country == 'India') & (EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work')) %>% +na.omit(.) %>% +mutate_if(is.factor,as.character) %>% +mutate(DeveloperType=gsub(" ","",DeveloperType)) %>% +mutate(DeveloperType = strsplit(as.character(DeveloperType), ";")) %>% +unnest(DeveloperType) %>% +mutate(DeveloperType=trimws(DeveloperType)) %>% +mutate(EmploymentStatus=ifelse(EmploymentStatus=='Not employed, and not looking for work' | EmploymentStatus=='Not employed, but looking for work',yes='unemployed',.)) %>% +select(-EmploymentStatus) %>% +group_by(Country,DeveloperType) %>% +filter(DeveloperType!='Other') %>% +summarise(Unemployment_count= n()) +unemployment <- count_dev %>% +inner_join(id=c('Country','DeveloperType'),unemployment_dev) +View(unemployment) +count_dev +unemployment_dev +count_dev %>% +left_join(id=c('Country','DeveloperType'),unemployment_dev) +?na.replace +unemployment <- count_dev %>% +left_join(id=c('Country','DeveloperType'),unemployment_dev) %>% +is.na[.]<-0 +?is.na +View(unemployment) +unemployment <- count_dev %>% +left_join(id=c('Country','DeveloperType'),unemployment_dev) +View(unemployment) +View(unemployment) +unemployment <- count_dev %>% +left_join(id=c('Country','DeveloperType'),unemployment_dev) %>% +is.na(.)<-0 +View(unemployment) +unemployment <- count_dev %>% +left_join(id=c('Country','DeveloperType'),unemployment_dev) %>% +is.na(.)<-0 
+?is.na +is.na(unemployment)<-0 +is.na[unemployment]<-0 +unemployment[is.na(unemployment)] <- 0 +View(unemployment) +View(test) +pay_gap <- so_data %>% +select(Country,Gender,Salary) %>% +na.omit(.) %>% +filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>% +mutate(Country=as.character(Country)) %>% +mutate(Gender=as.character(Gender)) %>% +group_by(Gender,Country) %>% summarise_all(funs(mean)) +pay_gapDt<-as.data.table(pay_gap) +setkey(pay_gapDt,Country) +pay_gapDt<-pay_gapDt %>% select(Country,Salary) +pay_gapDt[ , Pay_Gap := c(NA, abs(diff(Salary))), by = Country] +pay_gapDt <- pay_gapDt %>% select(Country,Pay_Gap) %>% na.omit(.) %>% arrange(desc(Pay_Gap)) +genderPayGap <- pay_gapDt %>% inner_join(country_mf) +View(genderPayGap) +View(genderPayGap) +pay_gap <- so_data %>% +select(Country,Gender,Salary) %>% +na.omit(.) %>% +filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>% +mutate(Country=as.character(Country)) %>% +mutate(Gender=as.character(Gender)) +ay_gap <- so_data %>% +select(Country,Gender,Salary) %>% +na.omit(.) %>% +filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>% +mutate(Country=as.character(Country)) %>% +mutate(Gender=as.character(Gender)) %>% +group_by(Gender,Country) %>% summarise_all(funs(mean)) +so_data %>% +select(Country,Gender,Salary) %>% +na.omit(.) %>% +filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>% +mutate(Country=as.character(Country)) %>% +mutate(Gender=as.character(Gender)) +View(pay_gap) +pay_gap <- so_data %>% +select(Country,Gender,Salary) %>% +na.omit(.) %>% +filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>% +mutate(Country=as.character(Country)) %>% +mutate(Gender=as.character(Gender)) %>% +group_by(Gender,Country) %>% summarise_all(funs(mean)) +View(pay_gap) +pay_gap <- so_data %>% +select(Country,Gender,Salary) %>% +na.omit(.) 
%>% +filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>% +mutate(Country=as.character(Country)) %>% +mutate(Gender=as.character(Gender)) %>% +group_by(Gender,Country) %>% summarise_all(funs(median)) +ay_gapDt<-as.data.table(pay_gap) +setkey(pay_gapDt,Country) +pay_gapDt<-pay_gapDt %>% select(Country,Salary) +pay_gapDt[ , Pay_Gap := c(NA, abs(diff(Salary))), by = Country] +pay_gapDt <- pay_gapDt %>% select(Country,Pay_Gap) %>% na.omit(.) %>% arrange(desc(Pay_Gap)) +pay_gapDt<-as.data.table(pay_gap) +setkey(pay_gapDt,Country) +pay_gapDt<-pay_gapDt %>% select(Country,Salary) +pay_gapDt[ , Pay_Gap := c(NA, abs(diff(Salary))), by = Country] +pay_gapDt <- pay_gapDt %>% select(Country,Pay_Gap) %>% na.omit(.) %>% arrange(desc(Pay_Gap)) +genderPayGap <- pay_gapDt %>% inner_join(country_mf) +View(genderPayGap) +pay_gap <- so_data %>% +select(Country,Gender,Salary) %>% +na.omit(.) %>% +filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>% +mutate(Country=as.character(Country)) %>% +mutate(Gender=as.character(Gender)) %>% +group_by(Gender,Country) %>% summarise_all(funs(mean)) +pay_gapDt<-as.data.table(pay_gap) +setkey(pay_gapDt,Country) +pay_gapDt<-pay_gapDt %>% select(Country,Salary) +pay_gapDt[ , Pay_Gap := c(NA, abs(diff(Salary))), by = Country] +pay_gapDt <- pay_gapDt %>% select(Country,Pay_Gap) %>% na.omit(.) 
%>% arrange(desc(Pay_Gap)) +genderPayGap <- pay_gapDt %>% inner_join(country_mf) +View(genderPayGap) +so_data %>% +filter(Gender=='Male' | Gender=='Female') %>% +group_by(Country) %>% +summarise(Count=n()) %>% +arrange(desc(Count)) %>% +head(20) -> top_countries +top_countries +so_data %>% +filter(Country %in% top_countries$Country) %>% +group_by(Country) +summarise(Males=sum(!is.na(Gender) & Gender == "Male"), +Females=sum(!is.na(Gender) & Gender == "Female"), +Total=n()) -> country_mf +top_countries +so_data %>% +filter(Country %in% top_countries$Country) %>% +group_by(Country) +summarise(Males=sum(!is.na(Gender) & Gender == "Male"), +Females=sum(!is.na(Gender) & Gender == "Female"), +Total=n()) -> country_mf +so_data %>% +group_by(Country) %>% +summarise(Count=n()) %>% +arrange(desc(Count)) %>% +head(20) -> top_countries +so_data %>% +filter(Country %in% top_countries$Country) %>% +group_by(Country) +summarise(Males=sum(!is.na(Gender) & Gender == "Male"), +Females=sum(!is.na(Gender) & Gender == "Female"), +Total=n()) -> country_mf +pay_gap <- so_data %>% +select(Country,Gender,Salary) %>% +na.omit(.) %>% +filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>% +mutate(Country=as.character(Country)) %>% +mutate(Gender=as.character(Gender)) %>% +group_by(Gender,Country) %>% summarise_all(funs(mean)) +pay_gapDt<-as.data.table(pay_gap) +setkey(pay_gapDt,Country) +pay_gapDt<-pay_gapDt %>% select(Country,Salary) +pay_gapDt[ , Pay_Gap := c(NA, abs(diff(Salary))), by = Country] +pay_gapDt <- pay_gapDt %>% select(Country,Pay_Gap) %>% na.omit(.) %>% arrange(desc(Pay_Gap)) +genderPayGap <- pay_gapDt %>% inner_join(country_mf) +View(genderPayGap) +so_data %>% filter(Country=='Pakistan' & (Gender=='Male' | Gender=='Female')) %>% n() +so_data %>% filter(Country=='Pakistan' & (Gender=='Male' | Gender=='Female')) %>% count(.) 
+genderPayGap <- pay_gapDt %>% mutate(Total=as.integer(Males) + as.inetegr(Females)) +pay_gap <- so_data %>% +select(Country,Gender,Salary) %>% +na.omit(.) %>% +filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>% +mutate(Country=as.character(Country)) %>% +mutate(Gender=as.character(Gender)) %>% +group_by(Gender,Country) %>% summarise_all(funs(mean)) %>% +mutate(Total=as.integer(Males) + as.inetegr(Females)) +so_data %>% +select(Country,Gender,Salary) %>% +na.omit(.) %>% +filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>% +mutate(Country=as.character(Country)) %>% +mutate(Gender=as.character(Gender)) %>% +group_by(Gender,Country) %>% summarise_all(funs(mean) +) +pay_gapDt +country_mf +country_mf <- so_data %>% +filter(Country %in% top_countries$Country) %>% +group_by(Country) +summarise(Males=sum(!is.na(Gender) & Gender == "Male"), +Females=sum(!is.na(Gender) & Gender == "Female")) %>% +mutate(Total=as.integer(Males) + as.integer(females)) +country_mf <- so_data %>% +filter(Country %in% top_countries$Country) %>% +group_by(Country) %>% +summarise(Males=sum(!is.na(Gender) & Gender == "Male"), +Females=sum(!is.na(Gender) & Gender == "Female")) %>% +mutate(Total=as.integer(Males) + as.integer(females)) +country_mf <- so_data %>% +filter(Country %in% top_countries$Country) %>% +group_by(Country) %>% +summarise(Males=sum(!is.na(Gender) & Gender == "Male"), +Females=sum(!is.na(Gender) & Gender == "Female")) %>% +mutate(Total=as.integer(Males) + as.integer(Females)) +country_mf +pay_gap <- so_data %>% +select(Country,Gender,Salary) %>% +na.omit(.) 
%>% +filter(Country %in% top_20C &(Gender=='Male' | Gender=='Female')) %>% +mutate(Country=as.character(Country)) %>% +mutate(Gender=as.character(Gender)) %>% +group_by(Gender,Country) %>% summarise_all(funs(mean)) +pay_gapDt<-as.data.table(pay_gap) +setkey(pay_gapDt,Country) +pay_gapDt<-pay_gapDt %>% select(Country,Salary) +pay_gapDt[ , Pay_Gap := c(NA, abs(diff(Salary))), by = Country] +pay_gapDt <- pay_gapDt %>% select(Country,Pay_Gap) %>% na.omit(.) %>% arrange(desc(Pay_Gap)) +genderPayGap <- pay_gapDt %>% inner_join(country_mf) +View(genderPayGap) +genderPayGap <- pay_gapDt %>% inner_join(country_mf) %>% mutate(gender_equality_%=Female/Total*100) +genderPayGap <- pay_gapDt %>% inner_join(country_mf) %>% mutate(gender_equality_%=Females/Total*100) +genderPayGap <- pay_gapDt %>% inner_join(country_mf) %>% mutate(gender_equality_%=as.double(Females/Total*100)) +genderPayGap <- pay_gapDt %>% inner_join(country_mf) %>% +mutate(gender_equality=as.double(Females/Total*100)) +View(genderPayGap)