count

library(tidyverse,warn.conflicts = FALSE)
## -- Attaching packages ---------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.0     √ purrr   0.3.3
## √ tibble  2.1.3     √ dplyr   0.8.5
## √ tidyr   1.0.2     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.5.0
## -- Conflicts ------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(DT)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
ggplot2movies::movies %>%
  group_by(year) %>%
  count(sort = FALSE, name = "num") %>%
  mutate(decade = 10 * (year %/% 10)) %>%
  ungroup() %>%
  select(-year) %>%
  group_by(decade) %>%
  summarise(n = sum(num)) %>%
  ggplot(aes(decade, n)) +
  geom_col(aes(decade, n)) +
  scale_x_continuous(breaks = seq(1890, 2000, 10)) +
  scale_y_continuous(breaks = seq(0, 15000, 2500)) +
  geom_text(aes(label = n),hjust = -0.5) +
  coord_flip()->p

p 

add_count

iris %>% 
  add_count(Species) %>% 
  select(n,everything())
## # A tibble: 150 x 6
##        n Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##    <int>        <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1    50          5.1         3.5          1.4         0.2 setosa 
##  2    50          4.9         3            1.4         0.2 setosa 
##  3    50          4.7         3.2          1.3         0.2 setosa 
##  4    50          4.6         3.1          1.5         0.2 setosa 
##  5    50          5           3.6          1.4         0.2 setosa 
##  6    50          5.4         3.9          1.7         0.4 setosa 
##  7    50          4.6         3.4          1.4         0.3 setosa 
##  8    50          5           3.4          1.5         0.2 setosa 
##  9    50          4.4         2.9          1.4         0.2 setosa 
## 10    50          4.9         3.1          1.5         0.1 setosa 
## # ... with 140 more rows
ggplot2movies::movies %>% 
  add_count(year) %>% 
  select(n,everything()) %>% 
  head()
## # A tibble: 6 x 25
##       n title  year length budget rating votes    r1    r2    r3    r4    r5
##   <int> <chr> <int>  <int>  <int>  <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1   646 $      1971    121     NA    6.4   348   4.5   4.5   4.5   4.5  14.5
## 2   484 $100~  1939     71     NA    6      20   0    14.5   4.5  24.5  14.5
## 3   521 $21 ~  1941      7     NA    8.2     5   0     0     0     0     0  
## 4  1390 $40,~  1996     70     NA    8.2     6  14.5   0     0     0     0  
## 5   619 $50,~  1975     71     NA    3.4    17  24.5   4.5   0    14.5  14.5
## 6  2048 $pent  2000     91     NA    4.3    45   4.5   4.5   4.5  14.5  14.5
## # ... with 13 more variables: r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>,
## #   r10 <dbl>, mpaa <chr>, Action <int>, Animation <int>, Comedy <int>,
## #   Drama <int>, Documentary <int>, Romance <int>, Short <int>

or:

ggplot2movies::movies %>% 
  group_by(year) %>% 
  mutate(n = n()) %>%
  ungroup(year) %>% 
  select(n,everything()) %>% 
  head()
## # A tibble: 6 x 25
##       n title  year length budget rating votes    r1    r2    r3    r4    r5
##   <int> <chr> <int>  <int>  <int>  <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1   646 $      1971    121     NA    6.4   348   4.5   4.5   4.5   4.5  14.5
## 2   484 $100~  1939     71     NA    6      20   0    14.5   4.5  24.5  14.5
## 3   521 $21 ~  1941      7     NA    8.2     5   0     0     0     0     0  
## 4  1390 $40,~  1996     70     NA    8.2     6  14.5   0     0     0     0  
## 5   619 $50,~  1975     71     NA    3.4    17  24.5   4.5   0    14.5  14.5
## 6  2048 $pent  2000     91     NA    4.3    45   4.5   4.5   4.5  14.5  14.5
## # ... with 13 more variables: r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>,
## #   r10 <dbl>, mpaa <chr>, Action <int>, Animation <int>, Comedy <int>,
## #   Drama <int>, Documentary <int>, Romance <int>, Short <int>

summarise

mtcars %>%
  summarise(mean = mean(disp), n = n())
##       mean  n
## 1 230.7219 32
mtcars %>%
  group_by(cyl) %>%
  summarise(mean = mean(disp), n = n())
## # A tibble: 3 x 3
##     cyl  mean     n
##   <dbl> <dbl> <int>
## 1     4  105.    11
## 2     6  183.     7
## 3     8  353.    14
mtcars %>%
  group_by(cyl, vs) %>%
  summarise(cyl_n = n()) %>%
  ungroup()
## # A tibble: 5 x 3
##     cyl    vs cyl_n
##   <dbl> <dbl> <int>
## 1     4     0     1
## 2     4     1    10
## 3     6     0     3
## 4     6     1     4
## 5     8     0    14
mtcars %>%
  group_by(cyl) %>%
  summarise(disp = mean(disp), sd = sd(disp), double_disp = disp * 2)
## # A tibble: 3 x 4
##     cyl  disp    sd double_disp
##   <dbl> <dbl> <dbl>       <dbl>
## 1     4  105.    NA        210.
## 2     6  183.    NA        367.
## 3     8  353.    NA        706.

fct_reorder

iris %>% 
  ggplot(aes(fct_reorder(Species,Sepal.Width),Sepal.Width)) +
  geom_boxplot(aes(fill = Species),outlier.fill = "red") +
  labs(x = "Species") +
  theme(text = element_text(family = "Times New Roman",size = 15))

fct_lump

x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
x %>% table()
## .
##  A  B  C  D  E  F  G  H  I 
## 40 10  5 27  1  1  1  1  1
x %>% fct_lump_n(3) %>% table()
## .
##     A     B     D Other 
##    40    10    27    10
x %>% fct_lump_prop(0.10) %>% table()
## .
##     A     B     D Other 
##    40    10    27    10
x %>% fct_lump_min(5) %>% table()
## .
##     A     B     C     D Other 
##    40    10     5    27     5
x %>% fct_lump_lowfreq() %>% table()
## .
##     A     D Other 
##    40    27    20
x <- factor(letters[rpois(100, 5)])
x
##  [1] f b g i g e e e e e c c b e i d h e g e f b e c g f e h c f e e g b k c f d
## [39] c g e c i g f c f e g b f l d b f d d c g g e b c d f l n d d b c e c a g d
## [77] e e e e d e c e c e c e g c g f h d h f d f c
## Levels: a b c d e f g h i k l n
table(x)
## x
##  a  b  c  d  e  f  g  h  i  k  l  n 
##  1  8 17 12 24 13 13  4  3  1  2  1
table(fct_lump_lowfreq(x))
## 
##  a  b  c  d  e  f  g  h  i  k  l  n 
##  1  8 17 12 24 13 13  4  3  1  2  1
# Use positive values to collapse the rarest
fct_lump_n(x, n = 3)
##  [1] f     Other g     Other g     e     e     e     e     e     c     c    
## [13] Other e     Other Other Other e     g     e     f     Other e     c    
## [25] g     f     e     Other c     f     e     e     g     Other Other c    
## [37] f     Other c     g     e     c     Other g     f     c     f     e    
## [49] g     Other f     Other Other Other f     Other Other c     g     g    
## [61] e     Other c     Other f     Other Other Other Other Other c     e    
## [73] c     Other g     Other e     e     e     e     Other e     c     e    
## [85] c     e     c     e     g     c     g     f     Other Other Other f    
## [97] Other f     c    
## Levels: c e f g Other
fct_lump_prop(x, prop = 0.1)
##  [1] f     Other g     Other g     e     e     e     e     e     c     c    
## [13] Other e     Other d     Other e     g     e     f     Other e     c    
## [25] g     f     e     Other c     f     e     e     g     Other Other c    
## [37] f     d     c     g     e     c     Other g     f     c     f     e    
## [49] g     Other f     Other d     Other f     d     d     c     g     g    
## [61] e     Other c     d     f     Other Other d     d     Other c     e    
## [73] c     Other g     d     e     e     e     e     d     e     c     e    
## [85] c     e     c     e     g     c     g     f     Other d     Other f    
## [97] d     f     c    
## Levels: c d e f g Other
# Use negative values to collapse the most common
fct_lump_n(x, n = -3)
##  [1] Other Other Other Other Other Other Other Other Other Other Other Other
## [13] Other Other Other Other Other Other Other Other Other Other Other Other
## [25] Other Other Other Other Other Other Other Other Other Other k     Other
## [37] Other Other Other Other Other Other Other Other Other Other Other Other
## [49] Other Other Other Other Other Other Other Other Other Other Other Other
## [61] Other Other Other Other Other Other n     Other Other Other Other Other
## [73] Other a     Other Other Other Other Other Other Other Other Other Other
## [85] Other Other Other Other Other Other Other Other Other Other Other Other
## [97] Other Other Other
## Levels: a k n Other
fct_lump_prop(x, prop = -0.1)
##  [1] Other b     Other i     Other Other Other Other Other Other Other Other
## [13] b     Other i     Other h     Other Other Other Other b     Other Other
## [25] Other Other Other h     Other Other Other Other Other b     k     Other
## [37] Other Other Other Other Other Other i     Other Other Other Other Other
## [49] Other b     Other l     Other b     Other Other Other Other Other Other
## [61] Other b     Other Other Other l     n     Other Other b     Other Other
## [73] Other a     Other Other Other Other Other Other Other Other Other Other
## [85] Other Other Other Other Other Other Other Other h     Other h     Other
## [97] Other Other Other
## Levels: a b h i k l n Other

scale_x/y_log10

library(patchwork)
p1 <- ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  theme(text = element_text(family = "Times New Roman"))
  
p1

# Manipulating the default position scales lets you:* change the axis labels
p1 +
  scale_x_continuous("Engine displacement (L)") +
  scale_y_continuous("Highway MPG")->p2

p1 + p2

#  * modify the axis limits
p1 + scale_x_continuous(limits = c(2, 6))->p3
p1 + scale_x_continuous(limits = c(0, 10))->p4

p3 + p4
## Warning: Removed 27 rows containing missing values (geom_point).
#  * choose your own labels
p1 + scale_x_continuous(
  breaks = c(2, 4, 6),
  label = c("two", "four", "six")
)

# You can also override the default linear mapping by using a transformation. There are three shortcuts:
p1 + scale_y_log10()-> p1
p1 + scale_y_sqrt()-> p2
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
p1 + scale_y_reverse() -> p3
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
# Or you can supply a transformation in the `trans` argument:
p1 + scale_y_continuous(trans = scales::reciprocal_trans())-> p4
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
# You can also create your own. See ?scales::trans_new

(p1 + p2) / (p3 + p4)

crossing

expand(mtcars, vs, cyl)
## # A tibble: 6 x 2
##      vs   cyl
##   <dbl> <dbl>
## 1     0     4
## 2     0     6
## 3     0     8
## 4     1     4
## 5     1     6
## 6     1     8
# Only combinations of vs and cyl that appear in the data
expand(mtcars, nesting(vs, cyl))
## # A tibble: 5 x 2
##      vs   cyl
##   <dbl> <dbl>
## 1     0     4
## 2     0     6
## 3     0     8
## 4     1     4
## 5     1     6
# Implicit missings -----------------------------------------------------
df <- tibble(
  year   = c(2010, 2010, 2010, 2010, 2012, 2012, 2012),
  qtr    = c(   1,    2,    3,    4,    1,    2,    3),
  return = rnorm(7)
)
df
## # A tibble: 7 x 3
##    year   qtr  return
##   <dbl> <dbl>   <dbl>
## 1  2010     1 -0.526 
## 2  2010     2 -0.253 
## 3  2010     3 -1.48  
## 4  2010     4  0.585 
## 5  2012     1 -1.28  
## 6  2012     2  0.258 
## 7  2012     3  0.0765
df %>% expand(year, qtr)
## # A tibble: 8 x 2
##    year   qtr
##   <dbl> <dbl>
## 1  2010     1
## 2  2010     2
## 3  2010     3
## 4  2010     4
## 5  2012     1
## 6  2012     2
## 7  2012     3
## 8  2012     4
df %>% expand(year = 2010:2012, qtr)
## # A tibble: 12 x 2
##     year   qtr
##    <int> <dbl>
##  1  2010     1
##  2  2010     2
##  3  2010     3
##  4  2010     4
##  5  2011     1
##  6  2011     2
##  7  2011     3
##  8  2011     4
##  9  2012     1
## 10  2012     2
## 11  2012     3
## 12  2012     4
df %>% expand(year = full_seq(year, 1), qtr)
## # A tibble: 12 x 2
##     year   qtr
##    <dbl> <dbl>
##  1  2010     1
##  2  2010     2
##  3  2010     3
##  4  2010     4
##  5  2011     1
##  6  2011     2
##  7  2011     3
##  8  2011     4
##  9  2012     1
## 10  2012     2
## 11  2012     3
## 12  2012     4
df %>% complete(year = full_seq(year, 1), qtr)
## # A tibble: 12 x 3
##     year   qtr  return
##    <dbl> <dbl>   <dbl>
##  1  2010     1 -0.526 
##  2  2010     2 -0.253 
##  3  2010     3 -1.48  
##  4  2010     4  0.585 
##  5  2011     1 NA     
##  6  2011     2 NA     
##  7  2011     3 NA     
##  8  2011     4 NA     
##  9  2012     1 -1.28  
## 10  2012     2  0.258 
## 11  2012     3  0.0765
## 12  2012     4 NA
# Nesting -------------------------------------------------------------

experiment <- tibble(
  name = rep(c("Alex", "Robert", "Sam"), c(3, 2, 1)),
  trt  = rep(c("a", "b", "a"), c(3, 2, 1)),
  rep = c(1, 2, 3, 1, 2, 1),
  measurement_1 = runif(6),
  measurement_2 = runif(6)
)
experiment
## # A tibble: 6 x 5
##   name   trt     rep measurement_1 measurement_2
##   <chr>  <chr> <dbl>         <dbl>         <dbl>
## 1 Alex   a         1         0.476        0.0408
## 2 Alex   a         2         0.594        0.781 
## 3 Alex   a         3         0.291        0.314 
## 4 Robert b         1         0.220        0.332 
## 5 Robert b         2         0.595        0.581 
## 6 Sam    a         1         0.348        0.329
# We can figure out the complete set of data with expand()
# Each person only gets one treatment, so we nest name and trt together:
all <- experiment %>% 
  expand(nesting(name, trt), rep)
all
## # A tibble: 9 x 3
##   name   trt     rep
##   <chr>  <chr> <dbl>
## 1 Alex   a         1
## 2 Alex   a         2
## 3 Alex   a         3
## 4 Robert b         1
## 5 Robert b         2
## 6 Robert b         3
## 7 Sam    a         1
## 8 Sam    a         2
## 9 Sam    a         3
# We can use anti_join to figure out which observations are missing
all %>% anti_join(experiment)
## Joining, by = c("name", "trt", "rep")
## # A tibble: 3 x 3
##   name   trt     rep
##   <chr>  <chr> <dbl>
## 1 Robert b         3
## 2 Sam    a         2
## 3 Sam    a         3
# And use right_join to add in the appropriate missing values to the original data
experiment %>% right_join(all)
## Joining, by = c("name", "trt", "rep")
## # A tibble: 9 x 5
##   name   trt     rep measurement_1 measurement_2
##   <chr>  <chr> <dbl>         <dbl>         <dbl>
## 1 Alex   a         1         0.476        0.0408
## 2 Alex   a         2         0.594        0.781 
## 3 Alex   a         3         0.291        0.314 
## 4 Robert b         1         0.220        0.332 
## 5 Robert b         2         0.595        0.581 
## 6 Robert b         3        NA           NA     
## 7 Sam    a         1         0.348        0.329 
## 8 Sam    a         2        NA           NA     
## 9 Sam    a         3        NA           NA
# Or use the complete() short-hand
experiment %>% complete(nesting(name, trt), rep)
## # A tibble: 9 x 5
##   name   trt     rep measurement_1 measurement_2
##   <chr>  <chr> <dbl>         <dbl>         <dbl>
## 1 Alex   a         1         0.476        0.0408
## 2 Alex   a         2         0.594        0.781 
## 3 Alex   a         3         0.291        0.314 
## 4 Robert b         1         0.220        0.332 
## 5 Robert b         2         0.595        0.581 
## 6 Robert b         3        NA           NA     
## 7 Sam    a         1         0.348        0.329 
## 8 Sam    a         2        NA           NA     
## 9 Sam    a         3        NA           NA
# Generate all combinations with expand():
formulas <- list(
  formula1 = Sepal.Length ~ Sepal.Width,
  formula2 = Sepal.Length ~ Sepal.Width + Petal.Width,
  formula3 = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length
)
data <- split(iris, iris$Species)
crossing(formula = formulas, data)
## # A tibble: 9 x 2
##   formula      data             
##   <named list> <named list>     
## 1 <formula>    <df[,5] [50 x 5]>
## 2 <formula>    <df[,5] [50 x 5]>
## 3 <formula>    <df[,5] [50 x 5]>
## 4 <formula>    <df[,5] [50 x 5]>
## 5 <formula>    <df[,5] [50 x 5]>
## 6 <formula>    <df[,5] [50 x 5]>
## 7 <formula>    <df[,5] [50 x 5]>
## 8 <formula>    <df[,5] [50 x 5]>
## 9 <formula>    <df[,5] [50 x 5]>

separate

df <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))
df
##      x
## 1 <NA>
## 2  a.b
## 3  a.d
## 4  b.c
df %>% separate(x, c("A", "B"),sep = "\\.")
##      A    B
## 1 <NA> <NA>
## 2    a    b
## 3    a    d
## 4    b    c
df <- data.frame(x = c("a", "a b", "a b c", NA))
df
##       x
## 1     a
## 2   a b
## 3 a b c
## 4  <NA>
df %>% separate(x, c("a", "b"),sep = " ")
## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [3].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [1].
##      a    b
## 1    a <NA>
## 2    a    b
## 3    a    b
## 4 <NA> <NA>
# The same behaviour drops the c but no warnings
df %>% separate(x, c("a", "b"), extra = "drop", fill = "right")
##      a    b
## 1    a <NA>
## 2    a    b
## 3    a    b
## 4 <NA> <NA>
# Another option:
df %>% separate(x, c("a", "b"), extra = "merge", fill = "left")
##      a    b
## 1 <NA>    a
## 2    a    b
## 3    a  b c
## 4 <NA> <NA>
# Or you can keep all three
df %>% separate(x, c("a", "b", "c"))
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2 rows [1, 2].
##      a    b    c
## 1    a <NA> <NA>
## 2    a    b <NA>
## 3    a    b    c
## 4 <NA> <NA> <NA>
# If only want to split specified number of times use extra = "merge"
df <- data.frame(x = c("x: 123", "y: error: 7"))
df
##             x
## 1      x: 123
## 2 y: error: 7
df %>% separate(x,c("key", "value"), ": ", extra = "merge")
##   key    value
## 1   x      123
## 2   y error: 7
df %>% separate(x,c("key","value"),": ")
## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [2].
##   key value
## 1   x   123
## 2   y error
# Use regular expressions to separate on multiple characters:
df <- data.frame(x = c(NA, "a?b", "a.d", "b:c"))
df
##      x
## 1 <NA>
## 2  a?b
## 3  a.d
## 4  b:c
df %>% separate(x, c("A","B"), sep = "([\\.\\?\\:])")
##      A    B
## 1 <NA> <NA>
## 2    a    b
## 3    a    d
## 4    b    c
# convert = TRUE detects column classes
df <- data.frame(x = c("a:1", "a:2", "c:4", "d", NA))
df
##      x
## 1  a:1
## 2  a:2
## 3  c:4
## 4    d
## 5 <NA>
df %>% separate(x, c("key","value"), ":") 
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [4].
##    key value
## 1    a     1
## 2    a     2
## 3    c     4
## 4    d  <NA>
## 5 <NA>  <NA>
df %>% separate(x, c("key","value"), ":", convert = TRUE)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [4].
##    key value
## 1    a     1
## 2    a     2
## 3    c     4
## 4    d    NA
## 5 <NA>    NA
# Argument col can take quasiquotation to work with strings
var <- "x"
df %>% separate(!!var, c("key","value"), ":")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [4].
##    key value
## 1    a     1
## 2    a     2
## 3    c     4
## 4    d  <NA>
## 5 <NA>  <NA>

extract

df <- data.frame(x = c(NA, "ahjjk-knb", "anjk-njkd", "hjbjb-cjbj", "njd-nnke"))
df
##            x
## 1       <NA>
## 2  ahjjk-knb
## 3  anjk-njkd
## 4 hjbjb-cjbj
## 5   njd-nnke
df %>% extract(x, "A")
##       A
## 1  <NA>
## 2 ahjjk
## 3  anjk
## 4 hjbjb
## 5   njd
df %>% 
  separate(col = x,into = c("key","value"),sep = "-")
##     key value
## 1  <NA>  <NA>
## 2 ahjjk   knb
## 3  anjk  njkd
## 4 hjbjb  cjbj
## 5   njd  nnke
df %>% extract(x, c("A", "B"), "([[:alnum:]]+)-([[:alnum:]]+)")
##       A    B
## 1  <NA> <NA>
## 2 ahjjk  knb
## 3  anjk njkd
## 4 hjbjb cjbj
## 5   njd nnke
# If no match, NA:
df %>% extract(x, c("A", "B"), "([a-z]+)-([a-n]+)")
##       A    B
## 1  <NA> <NA>
## 2 ahjjk  knb
## 3  anjk njkd
## 4 hjbjb cjbj
## 5   njd nnke