From 4c222032121ea06048d2078272728c34a2b39bb7 Mon Sep 17 00:00:00 2001
From: Jose Luis Ricon <jose@ricon.xyz>
Date: Thu, 3 May 2018 15:55:00 +0100
Subject: [PATCH] Update lemmatize.R

Performs lemmatisation faster (>10x faster than lemmatize_strings on a
vector of 10000 strings) by lemmatising each unique word only once and
replacing tokens through a hashed named-vector lookup.
---
 R/lemmatize.R | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/R/lemmatize.R b/R/lemmatize.R
index 8d81b4e..c9bfa1b 100644
--- a/R/lemmatize.R
+++ b/R/lemmatize.R
@@ -102,4 +102,33 @@ lemmatize_strings <- function(x, dictionary = lexicon::hash_lemmas, ...) {
     x2$unhold(unlist(lemmatized))
 }
 
+#' Perform fast lemmatisation of a character vector
+#'
+#' Splits each string on spaces, replaces every token found in the
+#' dictionary with its lemma, and re-joins the tokens. Each unique word is
+#' lemmatised only once, which is what makes this faster than calling
+#' \code{lemmatize_strings()} token by token.
+#'
+#' @param text A character vector of strings to lemmatise.
+#' @param dictionary Either \code{NULL}, in which case a dictionary is built
+#'   on the fly with \code{textstem::lemmatize_words()}, or a data frame
+#'   with columns \code{word} and \code{replacement}.
+#' @return A character vector the same length as \code{text}, with every
+#'   dictionary word replaced by its lemma.
+fastLemma <- function(text, dictionary = NULL) {
+  splitted <- stringr::str_split(text, pattern = " ")
+  if (is.null(dictionary)) {
+    # Lemmatise each unique word once instead of once per occurrence.
+    unique_words <- unique(unlist(splitted))
+    lemmatised_words <- textstem::lemmatize_words(unique_words)
+    dictionary <- data.table::data.table(word = unique_words,
+                                         replacement = lemmatised_words)
+    dictionary <- dictionary[word != replacement]
+  }
+  # A named character vector gives hashed name lookup in base R, avoiding
+  # the external 'hashmap' package (archived on CRAN) and the cost of
+  # re-evaluating the key set for every input string.
+  lookup <- stats::setNames(dictionary$replacement, dictionary$word)
+  unlist(lapply(splitted, function(tokens) {
+    hits <- tokens %in% names(lookup)
+    tokens[hits] <- lookup[tokens[hits]]
+    paste0(tokens, collapse = " ")
+  }))
+}