download.file(
url = "https://cran.r-project.org/web/packages/available_packages_by_date.html",
destfile = "scrapedpage.html"
)
data_cran <- read_html("https://cran.r-project.org/web/packages/available_packages_by_date.html")
# data_cran %>%
# html_node("th , td") %>%
# html_table(fill = TRUE) -> data_cran_df
data_cran %>%
html_node("body > table") %>%
html_table(fill = TRUE) -> data_cran_df
data_cran_df %>%
head(100) %>%
datatable()
# write_csv(data_cran_df,path = "../data1/cran_20200508.csv")
url <- "https://en.wikipedia.org/wiki/Men%27s_100_metres_world_record_progression" # 读取URL
download.file(url, destfile = "scrapedpage.html", quiet=TRUE) # 下载html
data_html <- read_html("scrapedpage.html")
data_html %>%
html_node("div+ .wikitable td , div+ .wikitable th") %>%
html_table(fill = TRUE) # 居然错了!!!
# Error in html_table.xml_node(., fill = TRUE) : html_name(x) == "table" is not TRUE
啊哦!似乎我们马上就遇到了一个错误。这里我就不细讲了,但是有时候我们必须对SelectorGadget保持谨慎。这是一个很好的工具,通常可以完美地工作。然而,有时候看起来正确的选择(例如,用黄色突出显示的东西)并不是我们想要的。我特意选择了这个Wikipedia 100m的例子,因为我想展示这个潜在的陷阱。再次强调:网络抓取既是一门科学,也是一门艺术。
data_html %>%
html_node("#mw-content-text > div > table:nth-child(8)") %>% # 分析网页
html_table() -> pre_iaaf # 得到数据表
pre_iaaf %>% class()
## [1] "data.frame"
## Warning in FUN(X[[i]], ...): strings not representable in native encoding will
## be translated to UTF-8
## [1] "time" "athlete" "nationality"
## [4] "location_of_races" "date"
pre_iaaf %<>%
mutate(athlete = if_else(is.na(athlete),athlete,lag(athlete)))
pre_iaaf %>%
mutate(date = mdy(date))-> pre_iaaf
pre_iaaf
## # A tibble: 21 x 5
## time athlete nationality location_of_races date
## <dbl> <chr> <chr> <chr> <date>
## 1 10.8 <NA> United States Paris, France 1891-07-04
## 2 10.8 Luther Cary United Kingdom Brussels, Belgium 1892-09-25
## 3 10.8 Cecil Lee Belgium Brussels, Belgium 1893-08-04
## 4 10.8 étienne De Ré United Kingdom Frankfurt/Main, Germany 1895-04-13
## 5 10.8 L. Atcherley United Kingdom Rotterdam, Netherlands 1895-08-28
## 6 10.8 Harry Beaton Sweden Helsingborg, Sweden 1896-08-09
## 7 10.8 Harald Anderson-Arbin Sweden G<U+00E4>vle, Sweden 1898-09-11
## 8 10.8 Isaac Westergren Sweden G<U+00E4>vle, Sweden 1899-09-10
## 9 10.8 10.8 United States Paris, France 1900-07-14
## 10 10.8 Frank Jarvis United States Paris, France 1900-07-14
## # ... with 11 more rows
最后,我们有了干净的数据框架。如果我们愿意,我们可以很容易地绘制出前国际田联的数据。但是,我打算等到我们收集完剩下的WR数据后再这么做。说到这里…
data_html %>%
html_node("#mw-content-text > div > table:nth-child(14)") %>%
html_table(fill = TRUE) -> iaaf_76
iaaf_76 %>% class()
## [1] "data.frame"
iaaf_76 %>%
as_tibble() %>%
janitor::clean_names() %>%
mutate(date = mdy(date)) %>%
mutate(athlete = ifelse(athlete=="", lag(athlete), athlete)) -> iaaf_76
## Warning: 3 failed to parse.
## # A tibble: 20 x 8
## time wind auto athlete nationality location_of_race date ref
## <dbl> <chr> <dbl> <chr> <chr> <chr> <date> <chr>
## 1 10 "2.0" 10.2 Jim Hines "United Sta~ Modesto, USA 1967-05-27 "[2]"
## 2 10 "1.8" NA Enrique ~ "Cuba" Budapest, Hungary 1967-06-17 "[2]"
## 3 10 "0.0" NA Paul Nash "South Afri~ Krugersdorp, Sout~ 1968-04-02 "[2]"
## 4 10 "1.1" NA Oliver F~ "United Sta~ Albuquerque, USA 1968-05-31 "[2]"
## 5 10 "2.0" 10.2 Oliver F~ "Charles Gr~ Sacramento, USA 1968-06-20 "[2]"
## 6 10 "2.0" 10.3 Oliver F~ "Charles Gr~ Roger Bambuck NA ""
## 7 9.9 "0.8" 10.0 Jim Hines "United Sta~ Sacramento, USA 1968-06-20 "[2]"
## 8 9.9 "0.9" 10.1 Ronnie R~ "United Sta~ Sacramento, USA 1968-06-20 ""
## 9 9.9 "0.9" 10.1 Charles ~ "United Sta~ Sacramento, USA 1968-06-20 ""
## 10 9.9 "0.3" 9.95 Jim Hines "United Sta~ Mexico City, Mexi~ 1968-10-14 "[2]"
## 11 9.9 "0.0" NA Eddie Ha~ "United Sta~ Eugene, USA 1972-07-01 "[2]"
## 12 9.9 "0.0" NA Eddie Ha~ "United Sta~ United States NA ""
## 13 9.9 "1.3" NA Steve Wi~ "United Sta~ Los Angeles, USA 1974-06-21 "[2]"
## 14 9.9 "1.7" NA Silvio L~ "Cuba" Ostrava, Czechosl~ 1975-06-05 "[2]"
## 15 9.9 "0.0" NA Steve Wi~ "United Sta~ Siena, Italy 1975-07-16 "[2]"
## 16 9.9 "<U+2212>0.2" NA Steve Wi~ "" Berlin, Germany 1975-08-22 "[2]"
## 17 9.9 "0.7" NA Steve Wi~ "" Gainesville, USA 1976-03-27 "[2]"
## 18 9.9 "0.7" NA Steve Wi~ "Harvey Gla~ Columbia, USA 1976-04-03 "[2]"
## 19 9.9 "" NA Steve Wi~ "" Baton Rouge, USA 1976-05-01 "[2]"
## 20 9.9 "1.7" NA Don Quar~ "Jamaica" Modesto, USA 1976-05-22 "[2]"
## # A tibble: 54 x 8
## time wind auto athlete nationality location_of_race date ref
## <dbl> <chr> <dbl> <chr> <chr> <chr> <date> <chr>
## 1 10.6 "" NA Donald Li~ United Stat~ Stockholm, Sweden 1912-07-06 [2]
## 2 10.6 "" NA Jackson S~ United Stat~ Stockholm, Sweden 1920-09-16 [2]
## 3 10.4 "" NA Charley P~ United Stat~ Redlands, USA 1921-04-23 [2]
## 4 10.4 "0.0" NA Eddie Tol~ United Stat~ Stockholm, Sweden 1929-08-08 [2]
## 5 10.4 "" NA Eddie Tol~ United Stat~ Copenhagen, Denma~ 1929-08-25 [2]
## 6 10.3 "" NA Percy Wil~ Canada Toronto, Ontario,~ 1930-08-09 [2]
## 7 10.3 "0.4" 10.4 Eddie Tol~ United Stat~ Los Angeles, USA 1932-08-01 [2]
## 8 10.3 "" NA Eddie Tol~ Ralph Metca~ Budapest, Hungary 1933-08-12 [2]
## 9 10.3 "" NA Eddie Tol~ Eulace Peac~ Oslo, Norway 1934-08-06 [2]
## 10 10.3 "" NA Chris Ber~ Netherlands Amsterdam, Nether~ 1934-08-26 [2]
## # ... with 44 more rows
data_html %>%
html_node("#mw-content-text > div > table:nth-child(19)") %>%
html_table(fill = TRUE) -> iaaf
iaaf %>%
as_tibble() %>%
janitor::clean_names() %>%
mutate(date = mdy(date)) -> iaaf
iaaf
## # A tibble: 24 x 8
## time wind auto athlete nationality location_of_race date
## <dbl> <chr> <dbl> <chr> <chr> <chr> <date>
## 1 10.1 1.3 NA Bob Ha~ United Sta~ Tokyo, Japan 1964-10-15
## 2 10.0 0.8 NA Jim Hi~ United Sta~ Sacramento, USA 1968-06-20
## 3 10.0 2.0 NA Charle~ United Sta~ Mexico City, Me~ 1968-10-13
## 4 9.95 0.3 NA Jim Hi~ United Sta~ Mexico City, Me~ 1968-10-14
## 5 9.93 1.4 NA Calvin~ United Sta~ Colorado Spring~ 1983-07-03
## 6 9.83 1.0 NA Ben Jo~ Canada Rome, Italy 1987-08-30
## 7 9.93 1.1 NA Carl L~ United Sta~ Rome, Italy 1987-08-30
## 8 9.93 1.1 NA Carl L~ United Sta~ Zürich, Switzer~ 1988-08-17
## 9 9.79 1.1 NA Ben Jo~ Canada Seoul, South Ko~ 1988-09-24
## 10 9.92 1.1 NA Carl L~ United Sta~ Seoul, South Ko~ 1988-09-24
## # ... with 14 more rows, and 1 more variable: notes_note_2 <chr>
wr100 <-
bind_rows(
pre_iaaf %>% select(time, athlete, nationality:date) %>% mutate(era = "Pre-IAAF"),
iaaf_76 %>% select(time, athlete, nationality:date) %>% mutate(era = "Pre-automatic"),
iaaf %>% select(time, athlete, nationality:date) %>% mutate(era = "Modern")
)
wr100
## # A tibble: 99 x 7
## time athlete nationality location_of_rac~ date era location_of_race
## <dbl> <chr> <chr> <chr> <date> <chr> <chr>
## 1 10.8 <NA> United Sta~ Paris, France 1891-07-04 Pre-~ <NA>
## 2 10.8 Luther ~ United Kin~ Brussels, Belgi~ 1892-09-25 Pre-~ <NA>
## 3 10.8 Cecil L~ Belgium Brussels, Belgi~ 1893-08-04 Pre-~ <NA>
## 4 10.8 étienne~ United Kin~ Frankfurt/Main,~ 1895-04-13 Pre-~ <NA>
## 5 10.8 L. Atch~ United Kin~ Rotterdam, Neth~ 1895-08-28 Pre-~ <NA>
## 6 10.8 Harry B~ Sweden Helsingborg, Sw~ 1896-08-09 Pre-~ <NA>
## 7 10.8 Harald ~ Sweden G<U+00E4>vle, Sweden 1898-09-11 Pre-~ <NA>
## 8 10.8 Isaac W~ Sweden G<U+00E4>vle, Sweden 1899-09-10 Pre-~ <NA>
## 9 10.8 10.8 United Sta~ Paris, France 1900-07-14 Pre-~ <NA>
## 10 10.8 Frank J~ United Sta~ Paris, France 1900-07-14 Pre-~ <NA>
## # ... with 89 more rows
wr100 %>%
ggplot(aes(x=date, y=time, col=fct_reorder2(era, date, time))) +
geom_point(alpha = 0.7) +
labs(
title = "Men's 100m world record progression",
x = "Date", y = "Time",
caption = "Source: Wikipedia"
) +
theme(legend.title = element_blank()) ## Switch off legend title
总结: