From 8e3c0127fd187070817c01b756e22909dd3708de Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Tue, 17 Sep 2024 14:14:02 -0500 Subject: [PATCH] Fix for dev profvis For complicated reasons, we had to change the evaluation strategy in profvis and it no longer modifies the environment that it's run in. However, I think `bench::mark()` is probably better suited for your use case anyway, especially since it comes with a convenient way to check the outputs are equal. --- DESCRIPTION | 8 +-- vignettes/Introduction.Rmd | 123 ++++++++++++++++--------------------- 2 files changed, 58 insertions(+), 73 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6c5b652..b44e0b1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: tidyft Title: Tidy Verbs for Fast Data Operations by Reference Version: 0.4.5 -Authors@R: +Authors@R: person(given = "Tian-Yuan", family = "Huang", role = c("aut", "cre"), @@ -18,13 +18,13 @@ License: MIT + file LICENSE Encoding: UTF-8 LazyData: true RoxygenNote: 7.1.0 -Imports: +Imports: data.table (>= 1.12.8), stringr (>= 1.4.0), fst (>= 0.9.0) -Suggests: +Suggests: + bench, knitr, rmarkdown, - profvis, dplyr VignetteBuilder: knitr diff --git a/vignettes/Introduction.Rmd b/vignettes/Introduction.Rmd index 27ecbbd..71ef733 100644 --- a/vignettes/Introduction.Rmd +++ b/vignettes/Introduction.Rmd @@ -42,7 +42,7 @@ class(b) # convert codes lapply(ls(),get) %>% - lapply(setDT) %>% + lapply(setDT) %>% invisible() # after @@ -80,7 +80,7 @@ ls() # only the ft exists The `as_fst` could save any data.frame as ".fst" file in temporary file and parse it back as fst table. 
Fst table is small in RAM, but if you want to get any part of the data.frame, you can get it in almost no time: ```{r} -ft %>% +ft %>% slice_fst(5555:6666) # get 5555 to 6666 row ``` @@ -91,17 +91,17 @@ Except for `slice_fst`, there are also other functions for subsetting the data, ```{r} sys_time_print({ - res = ft %>% - select_fst(Species,Sepal.Length,Sepal.Width) %>% - rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>% - arrange(group,sl) %>% - filter(sl > 5) %>% - distinct(sl,.keep_all = TRUE) %>% + res = ft %>% + select_fst(Species,Sepal.Length,Sepal.Width) %>% + rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>% + arrange(group,sl) %>% + filter(sl > 5) %>% + distinct(sl,.keep_all = TRUE) %>% summarise(sw = max(sw),by = group) }) res - + ``` This should be pretty fast. Becasue when we use the data in fst table, we never get them until using the "_fst" suffix functions, so the tidyft functions never modify the data in the fst file or fst table. That is to say, we do not have to worry about the modification by reference any more. No copies made, fastest ever. @@ -113,7 +113,6 @@ The fst workflow could also be working with other tools, though less efficient. 
rm(list = ls()) -library(profvis) library(data.table) library(dplyr) library(dtplyr) @@ -128,57 +127,53 @@ dim(dt) as_fst(dt) -> ft # remove the data.frame from RAM rm(dt) - - -profvis({ - - res1 = ft %>% - select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>% - dplyr::select(-Petal.Length) %>% - dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>% - dplyr::arrange(group,sl) %>% - dplyr::filter(sl > 5) %>% - dplyr::distinct(sl,.keep_all = TRUE) %>% - dplyr::group_by(group) %>% - dplyr::summarise(sw = max(sw)) - - res2 = ft %>% - select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>% - lazy_dt() %>% - dplyr::select(-Petal.Length) %>% - dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>% - dplyr::arrange(group,sl) %>% - dplyr::filter(sl > 5) %>% - dplyr::distinct(sl,.keep_all = TRUE) %>% - dplyr::group_by(group) %>% - dplyr::summarise(sw = max(sw)) %>% - as.data.table() - - res3 = ft[,c("Species","Sepal.Length","Sepal.Width","Petal.Length")] %>% + + +bench::mark( + + dplyr = ft %>% + select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>% + dplyr::select(-Petal.Length) %>% + dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>% + dplyr::arrange(group,sl) %>% + dplyr::filter(sl > 5) %>% + dplyr::distinct(sl,.keep_all = TRUE) %>% + dplyr::group_by(group) %>% + dplyr::summarise(sw = max(sw)), + + dtplyr = ft %>% + select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>% + lazy_dt() %>% + dplyr::select(-Petal.Length) %>% + dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>% + dplyr::arrange(group,sl) %>% + dplyr::filter(sl > 5) %>% + dplyr::distinct(sl,.keep_all = TRUE) %>% + dplyr::group_by(group) %>% + dplyr::summarise(sw = max(sw)) %>% + as.data.table(), + + data.table = ft[,c("Species","Sepal.Length","Sepal.Width","Petal.Length")] %>% setDT() %>% - .[,.SD,.SDcols = -"Petal.Length"] %>% + .[,.SD,.SDcols = -"Petal.Length"] %>% setnames(old 
=c("Species","Sepal.Length","Sepal.Width"), - new = c("group","sl","sw")) %>% - setorder(group,sl) %>% - .[sl>5] %>% unique(by = "sl") %>% - .[,.(sw = max(sw)),by = group] - - - res4 = ft %>% - tidyft::select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>% - tidyft::select(-Petal.Length) %>% - tidyft::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>% - tidyft::arrange(group,sl) %>% - tidyft::filter(sl > 5) %>% - tidyft::distinct(sl,.keep_all = TRUE) %>% - tidyft::summarise(sw = max(sw),by = group) - - -}) - -setequal(res1,res2) -setequal(res2,res3) -setequal(res3,res4) + new = c("group","sl","sw")) %>% + setorder(group,sl) %>% + .[sl>5] %>% unique(by = "sl") %>% + .[,.(sw = max(sw)),by = group], + + + tidyft = ft %>% + tidyft::select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>% + tidyft::select(-Petal.Length) %>% + tidyft::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>% + tidyft::arrange(group,sl) %>% + tidyft::filter(sl > 5) %>% + tidyft::distinct(sl,.keep_all = TRUE) %>% + tidyft::summarise(sw = max(sw),by = group), + + check = setequal +) ``` @@ -189,13 +184,3 @@ Because tidyft is based on data.table, therefore, if you always use data.table c ```{r} sessionInfo() ``` - - - - - - - - - -