library(tidyverse,warn.conflicts = FALSE)
## -- Attaching packages ---------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.0 √ purrr 0.3.3
## √ tibble 2.1.3 √ dplyr 0.8.5
## √ tidyr 1.0.2 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.0
## -- Conflicts ------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(DT)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
ggplot2movies::movies %>%
group_by(year) %>%
count(sort = FALSE, name = "num") %>%
mutate(decade = 10 * (year %/% 10)) %>%
ungroup() %>%
select(-year) %>%
group_by(decade) %>%
summarise(n = sum(num)) %>%
ggplot(aes(decade, n)) +
geom_col(aes(decade, n)) +
scale_x_continuous(breaks = seq(1890, 2000, 10)) +
scale_y_continuous(breaks = seq(0, 15000, 2500)) +
geom_text(aes(label = n),hjust = -0.5) +
coord_flip()->p
p
iris %>%
add_count(Species) %>%
select(n,everything())
## # A tibble: 150 x 6
## n Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <int> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 50 5.1 3.5 1.4 0.2 setosa
## 2 50 4.9 3 1.4 0.2 setosa
## 3 50 4.7 3.2 1.3 0.2 setosa
## 4 50 4.6 3.1 1.5 0.2 setosa
## 5 50 5 3.6 1.4 0.2 setosa
## 6 50 5.4 3.9 1.7 0.4 setosa
## 7 50 4.6 3.4 1.4 0.3 setosa
## 8 50 5 3.4 1.5 0.2 setosa
## 9 50 4.4 2.9 1.4 0.2 setosa
## 10 50 4.9 3.1 1.5 0.1 setosa
## # ... with 140 more rows
ggplot2movies::movies %>%
add_count(year) %>%
select(n,everything()) %>%
head()
## # A tibble: 6 x 25
## n title year length budget rating votes r1 r2 r3 r4 r5
## <int> <chr> <int> <int> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 646 $ 1971 121 NA 6.4 348 4.5 4.5 4.5 4.5 14.5
## 2 484 $100~ 1939 71 NA 6 20 0 14.5 4.5 24.5 14.5
## 3 521 $21 ~ 1941 7 NA 8.2 5 0 0 0 0 0
## 4 1390 $40,~ 1996 70 NA 8.2 6 14.5 0 0 0 0
## 5 619 $50,~ 1975 71 NA 3.4 17 24.5 4.5 0 14.5 14.5
## 6 2048 $pent 2000 91 NA 4.3 45 4.5 4.5 4.5 14.5 14.5
## # ... with 13 more variables: r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>,
## # r10 <dbl>, mpaa <chr>, Action <int>, Animation <int>, Comedy <int>,
## # Drama <int>, Documentary <int>, Romance <int>, Short <int>
or:
ggplot2movies::movies %>%
group_by(year) %>%
mutate(n = n()) %>%
ungroup(year) %>%
select(n,everything()) %>%
head()
## # A tibble: 6 x 25
## n title year length budget rating votes r1 r2 r3 r4 r5
## <int> <chr> <int> <int> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 646 $ 1971 121 NA 6.4 348 4.5 4.5 4.5 4.5 14.5
## 2 484 $100~ 1939 71 NA 6 20 0 14.5 4.5 24.5 14.5
## 3 521 $21 ~ 1941 7 NA 8.2 5 0 0 0 0 0
## 4 1390 $40,~ 1996 70 NA 8.2 6 14.5 0 0 0 0
## 5 619 $50,~ 1975 71 NA 3.4 17 24.5 4.5 0 14.5 14.5
## 6 2048 $pent 2000 91 NA 4.3 45 4.5 4.5 4.5 14.5 14.5
## # ... with 13 more variables: r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>,
## # r10 <dbl>, mpaa <chr>, Action <int>, Animation <int>, Comedy <int>,
## # Drama <int>, Documentary <int>, Romance <int>, Short <int>
mtcars %>%
summarise(mean = mean(disp), n = n())
## mean n
## 1 230.7219 32
mtcars %>%
group_by(cyl) %>%
summarise(mean = mean(disp), n = n())
## # A tibble: 3 x 3
## cyl mean n
## <dbl> <dbl> <int>
## 1 4 105. 11
## 2 6 183. 7
## 3 8 353. 14
mtcars %>%
group_by(cyl, vs) %>%
summarise(cyl_n = n()) %>%
ungroup()
## # A tibble: 5 x 3
## cyl vs cyl_n
## <dbl> <dbl> <int>
## 1 4 0 1
## 2 4 1 10
## 3 6 0 3
## 4 6 1 4
## 5 8 0 14
mtcars %>%
group_by(cyl) %>%
summarise(disp = mean(disp), sd = sd(disp), double_disp = disp * 2)
## # A tibble: 3 x 4
## cyl disp sd double_disp
## <dbl> <dbl> <dbl> <dbl>
## 1 4 105. NA 210.
## 2 6 183. NA 367.
## 3 8 353. NA 706.
iris %>%
ggplot(aes(fct_reorder(Species,Sepal.Width),Sepal.Width)) +
geom_boxplot(aes(fill = Species),outlier.fill = "red") +
labs(x = "Species") +
theme(text = element_text(family = "Times New Roman",size = 15))
x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
x %>% table()
## .
## A B C D E F G H I
## 40 10 5 27 1 1 1 1 1
x %>% fct_lump_n(3) %>% table()
## .
## A B D Other
## 40 10 27 10
x %>% fct_lump_prop(0.10) %>% table()
## .
## A B D Other
## 40 10 27 10
x %>% fct_lump_min(5) %>% table()
## .
## A B C D Other
## 40 10 5 27 5
x %>% fct_lump_lowfreq() %>% table()
## .
## A D Other
## 40 27 20
x <- factor(letters[rpois(100, 5)])
x
## [1] f b g i g e e e e e c c b e i d h e g e f b e c g f e h c f e e g b k c f d
## [39] c g e c i g f c f e g b f l d b f d d c g g e b c d f l n d d b c e c a g d
## [77] e e e e d e c e c e c e g c g f h d h f d f c
## Levels: a b c d e f g h i k l n
table(x)
## x
## a b c d e f g h i k l n
## 1 8 17 12 24 13 13 4 3 1 2 1
table(fct_lump_lowfreq(x))
##
## a b c d e f g h i k l n
## 1 8 17 12 24 13 13 4 3 1 2 1
# Use positive values to collapse the rarest
fct_lump_n(x, n = 3)
## [1] f Other g Other g e e e e e c c
## [13] Other e Other Other Other e g e f Other e c
## [25] g f e Other c f e e g Other Other c
## [37] f Other c g e c Other g f c f e
## [49] g Other f Other Other Other f Other Other c g g
## [61] e Other c Other f Other Other Other Other Other c e
## [73] c Other g Other e e e e Other e c e
## [85] c e c e g c g f Other Other Other f
## [97] Other f c
## Levels: c e f g Other
fct_lump_prop(x, prop = 0.1)
## [1] f Other g Other g e e e e e c c
## [13] Other e Other d Other e g e f Other e c
## [25] g f e Other c f e e g Other Other c
## [37] f d c g e c Other g f c f e
## [49] g Other f Other d Other f d d c g g
## [61] e Other c d f Other Other d d Other c e
## [73] c Other g d e e e e d e c e
## [85] c e c e g c g f Other d Other f
## [97] d f c
## Levels: c d e f g Other
# Use negative values to collapse the most common
fct_lump_n(x, n = -3)
## [1] Other Other Other Other Other Other Other Other Other Other Other Other
## [13] Other Other Other Other Other Other Other Other Other Other Other Other
## [25] Other Other Other Other Other Other Other Other Other Other k Other
## [37] Other Other Other Other Other Other Other Other Other Other Other Other
## [49] Other Other Other Other Other Other Other Other Other Other Other Other
## [61] Other Other Other Other Other Other n Other Other Other Other Other
## [73] Other a Other Other Other Other Other Other Other Other Other Other
## [85] Other Other Other Other Other Other Other Other Other Other Other Other
## [97] Other Other Other
## Levels: a k n Other
fct_lump_prop(x, prop = -0.1)
## [1] Other b Other i Other Other Other Other Other Other Other Other
## [13] b Other i Other h Other Other Other Other b Other Other
## [25] Other Other Other h Other Other Other Other Other b k Other
## [37] Other Other Other Other Other Other i Other Other Other Other Other
## [49] Other b Other l Other b Other Other Other Other Other Other
## [61] Other b Other Other Other l n Other Other b Other Other
## [73] Other a Other Other Other Other Other Other Other Other Other Other
## [85] Other Other Other Other Other Other Other Other h Other h Other
## [97] Other Other Other
## Levels: a b h i k l n Other
library(patchwork)
p1 <- ggplot(mpg, aes(displ, hwy)) +
geom_point() +
theme(text = element_text(family = "Times New Roman"))
p1
# Manipulating the default position scales lets you:* change the axis labels
p1 +
scale_x_continuous("Engine displacement (L)") +
scale_y_continuous("Highway MPG")->p2
p1 + p2
# * modify the axis limits
p1 + scale_x_continuous(limits = c(2, 6))->p3
p1 + scale_x_continuous(limits = c(0, 10))->p4
p3 + p4
## Warning: Removed 27 rows containing missing values (geom_point).
# * choose your own labels
p1 + scale_x_continuous(
breaks = c(2, 4, 6),
label = c("two", "four", "six")
)
# You can also override the default linear mapping by using a transformation. There are three shortcuts:
p1 + scale_y_log10()-> p1
p1 + scale_y_sqrt()-> p2
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
p1 + scale_y_reverse() -> p3
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
# Or you can supply a transformation in the `trans` argument:
p1 + scale_y_continuous(trans = scales::reciprocal_trans())-> p4
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
# You can also create your own. See ?scales::trans_new
(p1 + p2) / (p3 + p4)
expand(mtcars, vs, cyl)
## # A tibble: 6 x 2
## vs cyl
## <dbl> <dbl>
## 1 0 4
## 2 0 6
## 3 0 8
## 4 1 4
## 5 1 6
## 6 1 8
# Only combinations of vs and cyl that appear in the data
expand(mtcars, nesting(vs, cyl))
## # A tibble: 5 x 2
## vs cyl
## <dbl> <dbl>
## 1 0 4
## 2 0 6
## 3 0 8
## 4 1 4
## 5 1 6
# Implicit missings -----------------------------------------------------
df <- tibble(
year = c(2010, 2010, 2010, 2010, 2012, 2012, 2012),
qtr = c( 1, 2, 3, 4, 1, 2, 3),
return = rnorm(7)
)
df
## # A tibble: 7 x 3
## year qtr return
## <dbl> <dbl> <dbl>
## 1 2010 1 -0.526
## 2 2010 2 -0.253
## 3 2010 3 -1.48
## 4 2010 4 0.585
## 5 2012 1 -1.28
## 6 2012 2 0.258
## 7 2012 3 0.0765
df %>% expand(year, qtr)
## # A tibble: 8 x 2
## year qtr
## <dbl> <dbl>
## 1 2010 1
## 2 2010 2
## 3 2010 3
## 4 2010 4
## 5 2012 1
## 6 2012 2
## 7 2012 3
## 8 2012 4
df %>% expand(year = 2010:2012, qtr)
## # A tibble: 12 x 2
## year qtr
## <int> <dbl>
## 1 2010 1
## 2 2010 2
## 3 2010 3
## 4 2010 4
## 5 2011 1
## 6 2011 2
## 7 2011 3
## 8 2011 4
## 9 2012 1
## 10 2012 2
## 11 2012 3
## 12 2012 4
df %>% expand(year = full_seq(year, 1), qtr)
## # A tibble: 12 x 2
## year qtr
## <dbl> <dbl>
## 1 2010 1
## 2 2010 2
## 3 2010 3
## 4 2010 4
## 5 2011 1
## 6 2011 2
## 7 2011 3
## 8 2011 4
## 9 2012 1
## 10 2012 2
## 11 2012 3
## 12 2012 4
df %>% complete(year = full_seq(year, 1), qtr)
## # A tibble: 12 x 3
## year qtr return
## <dbl> <dbl> <dbl>
## 1 2010 1 -0.526
## 2 2010 2 -0.253
## 3 2010 3 -1.48
## 4 2010 4 0.585
## 5 2011 1 NA
## 6 2011 2 NA
## 7 2011 3 NA
## 8 2011 4 NA
## 9 2012 1 -1.28
## 10 2012 2 0.258
## 11 2012 3 0.0765
## 12 2012 4 NA
# Nesting -------------------------------------------------------------
experiment <- tibble(
name = rep(c("Alex", "Robert", "Sam"), c(3, 2, 1)),
trt = rep(c("a", "b", "a"), c(3, 2, 1)),
rep = c(1, 2, 3, 1, 2, 1),
measurement_1 = runif(6),
measurement_2 = runif(6)
)
experiment
## # A tibble: 6 x 5
## name trt rep measurement_1 measurement_2
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alex a 1 0.476 0.0408
## 2 Alex a 2 0.594 0.781
## 3 Alex a 3 0.291 0.314
## 4 Robert b 1 0.220 0.332
## 5 Robert b 2 0.595 0.581
## 6 Sam a 1 0.348 0.329
# We can figure out the complete set of data with expand()
# Each person only gets one treatment, so we nest name and trt together:
all <- experiment %>%
expand(nesting(name, trt), rep)
all
## # A tibble: 9 x 3
## name trt rep
## <chr> <chr> <dbl>
## 1 Alex a 1
## 2 Alex a 2
## 3 Alex a 3
## 4 Robert b 1
## 5 Robert b 2
## 6 Robert b 3
## 7 Sam a 1
## 8 Sam a 2
## 9 Sam a 3
# We can use anti_join to figure out which observations are missing
all %>% anti_join(experiment)
## Joining, by = c("name", "trt", "rep")
## # A tibble: 3 x 3
## name trt rep
## <chr> <chr> <dbl>
## 1 Robert b 3
## 2 Sam a 2
## 3 Sam a 3
# And use right_join to add in the appropriate missing values to the original data
experiment %>% right_join(all)
## Joining, by = c("name", "trt", "rep")
## # A tibble: 9 x 5
## name trt rep measurement_1 measurement_2
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alex a 1 0.476 0.0408
## 2 Alex a 2 0.594 0.781
## 3 Alex a 3 0.291 0.314
## 4 Robert b 1 0.220 0.332
## 5 Robert b 2 0.595 0.581
## 6 Robert b 3 NA NA
## 7 Sam a 1 0.348 0.329
## 8 Sam a 2 NA NA
## 9 Sam a 3 NA NA
# Or use the complete() short-hand
experiment %>% complete(nesting(name, trt), rep)
## # A tibble: 9 x 5
## name trt rep measurement_1 measurement_2
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alex a 1 0.476 0.0408
## 2 Alex a 2 0.594 0.781
## 3 Alex a 3 0.291 0.314
## 4 Robert b 1 0.220 0.332
## 5 Robert b 2 0.595 0.581
## 6 Robert b 3 NA NA
## 7 Sam a 1 0.348 0.329
## 8 Sam a 2 NA NA
## 9 Sam a 3 NA NA
# Generate all combinations with expand():
formulas <- list(
formula1 = Sepal.Length ~ Sepal.Width,
formula2 = Sepal.Length ~ Sepal.Width + Petal.Width,
formula3 = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length
)
data <- split(iris, iris$Species)
crossing(formula = formulas, data)
## # A tibble: 9 x 2
## formula data
## <named list> <named list>
## 1 <formula> <df[,5] [50 x 5]>
## 2 <formula> <df[,5] [50 x 5]>
## 3 <formula> <df[,5] [50 x 5]>
## 4 <formula> <df[,5] [50 x 5]>
## 5 <formula> <df[,5] [50 x 5]>
## 6 <formula> <df[,5] [50 x 5]>
## 7 <formula> <df[,5] [50 x 5]>
## 8 <formula> <df[,5] [50 x 5]>
## 9 <formula> <df[,5] [50 x 5]>
df <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))
df
## x
## 1 <NA>
## 2 a.b
## 3 a.d
## 4 b.c
df %>% separate(x, c("A", "B"),sep = "\\.")
## A B
## 1 <NA> <NA>
## 2 a b
## 3 a d
## 4 b c
df <- data.frame(x = c("a", "a b", "a b c", NA))
df
## x
## 1 a
## 2 a b
## 3 a b c
## 4 <NA>
df %>% separate(x, c("a", "b"),sep = " ")
## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [3].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [1].
## a b
## 1 a <NA>
## 2 a b
## 3 a b
## 4 <NA> <NA>
# The same behaviour drops the c but no warnings
df %>% separate(x, c("a", "b"), extra = "drop", fill = "right")
## a b
## 1 a <NA>
## 2 a b
## 3 a b
## 4 <NA> <NA>
# Another option:
df %>% separate(x, c("a", "b"), extra = "merge", fill = "left")
## a b
## 1 <NA> a
## 2 a b
## 3 a b c
## 4 <NA> <NA>
# Or you can keep all three
df %>% separate(x, c("a", "b", "c"))
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2 rows [1, 2].
## a b c
## 1 a <NA> <NA>
## 2 a b <NA>
## 3 a b c
## 4 <NA> <NA> <NA>
# If only want to split specified number of times use extra = "merge"
df <- data.frame(x = c("x: 123", "y: error: 7"))
df
## x
## 1 x: 123
## 2 y: error: 7
df %>% separate(x,c("key", "value"), ": ", extra = "merge")
## key value
## 1 x 123
## 2 y error: 7
df %>% separate(x,c("key","value"),": ")
## Warning: Expected 2 pieces. Additional pieces discarded in 1 rows [2].
## key value
## 1 x 123
## 2 y error
# Use regular expressions to separate on multiple characters:
df <- data.frame(x = c(NA, "a?b", "a.d", "b:c"))
df
## x
## 1 <NA>
## 2 a?b
## 3 a.d
## 4 b:c
df %>% separate(x, c("A","B"), sep = "([\\.\\?\\:])")
## A B
## 1 <NA> <NA>
## 2 a b
## 3 a d
## 4 b c
# convert = TRUE detects column classes
df <- data.frame(x = c("a:1", "a:2", "c:4", "d", NA))
df
## x
## 1 a:1
## 2 a:2
## 3 c:4
## 4 d
## 5 <NA>
df %>% separate(x, c("key","value"), ":")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [4].
## key value
## 1 a 1
## 2 a 2
## 3 c 4
## 4 d <NA>
## 5 <NA> <NA>
df %>% separate(x, c("key","value"), ":", convert = TRUE)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [4].
## key value
## 1 a 1
## 2 a 2
## 3 c 4
## 4 d NA
## 5 <NA> NA
# Argument col can take quasiquotation to work with strings
var <- "x"
df %>% separate(!!var, c("key","value"), ":")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [4].
## key value
## 1 a 1
## 2 a 2
## 3 c 4
## 4 d <NA>
## 5 <NA> <NA>
df <- data.frame(x = c(NA, "ahjjk-knb", "anjk-njkd", "hjbjb-cjbj", "njd-nnke"))
df
## x
## 1 <NA>
## 2 ahjjk-knb
## 3 anjk-njkd
## 4 hjbjb-cjbj
## 5 njd-nnke
df %>% extract(x, "A")
## A
## 1 <NA>
## 2 ahjjk
## 3 anjk
## 4 hjbjb
## 5 njd
df %>%
separate(col = x,into = c("key","value"),sep = "-")
## key value
## 1 <NA> <NA>
## 2 ahjjk knb
## 3 anjk njkd
## 4 hjbjb cjbj
## 5 njd nnke
df %>% extract(x, c("A", "B"), "([[:alnum:]]+)-([[:alnum:]]+)")
## A B
## 1 <NA> <NA>
## 2 ahjjk knb
## 3 anjk njkd
## 4 hjbjb cjbj
## 5 njd nnke
# If no match, NA:
df %>% extract(x, c("A", "B"), "([a-z]+)-([a-n]+)")
## A B
## 1 <NA> <NA>
## 2 ahjjk knb
## 3 anjk njkd
## 4 hjbjb cjbj
## 5 njd nnke