From 4c222032121ea06048d2078272728c34a2b39bb7 Mon Sep 17 00:00:00 2001
From: Jose Luis Ricon <jose@ricon.xyz>
Date: Thu, 3 May 2018 15:55:00 +0100
Subject: [PATCH] Update lemmatize.R

Performs lemmatisation faster (>10x faster than lemmatize_strings on a
vector of 10000 strings) by lemmatising each unique word only once and
replacing tokens through a hashed named-vector lookup.
---
 R/lemmatize.R | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/R/lemmatize.R b/R/lemmatize.R
index 8d81b4e..c9bfa1b 100644
--- a/R/lemmatize.R
+++ b/R/lemmatize.R
@@ -102,4 +102,33 @@ lemmatize_strings <- function(x, dictionary = lexicon::hash_lemmas, ...) {
     x2$unhold(unlist(lemmatized))
 }
 
+#' Perform fast lemmatisation of a character vector
+#'
+#' Splits each string on spaces, replaces every token found in the
+#' dictionary with its lemma, and re-joins the tokens. Each unique word is
+#' lemmatised only once, which is what makes this faster than calling
+#' \code{lemmatize_strings()} token by token.
+#'
+#' @param text A character vector of strings to lemmatise.
+#' @param dictionary Either \code{NULL}, in which case a dictionary is built
+#'   on the fly with \code{textstem::lemmatize_words()}, or a data frame
+#'   with columns \code{word} and \code{replacement}.
+#' @return A character vector the same length as \code{text}, with every
+#'   dictionary word replaced by its lemma.
+fastLemma <- function(text, dictionary = NULL) {
+  splitted <- stringr::str_split(text, pattern = " ")
+  if (is.null(dictionary)) {
+    # Lemmatise each unique word once instead of once per occurrence.
+    unique_words <- unique(unlist(splitted))
+    lemmatised_words <- textstem::lemmatize_words(unique_words)
+    dictionary <- data.table::data.table(word = unique_words,
+                                         replacement = lemmatised_words)
+    dictionary <- dictionary[word != replacement]
+  }
+  # A named character vector gives hashed name lookup in base R, avoiding
+  # the external 'hashmap' package (archived on CRAN) and the cost of
+  # re-evaluating the key set for every input string.
+  lookup <- stats::setNames(dictionary$replacement, dictionary$word)
+  unlist(lapply(splitted, function(tokens) {
+    hits <- tokens %in% names(lookup)
+    tokens[hits] <- lookup[tokens[hits]]
+    paste0(tokens, collapse = " ")
+  }))
+}