# Signature of read_html(), listed for reference.
read_html(x, encoding = "", ..., options = c("RECOVER", "NOERROR", "NOBLANKS"))
# x: the URL of the page to read
# encoding: character encoding used to decode the page, e.g. "UTF-8"
# Signature of html_nodes(), listed for reference.
html_nodes(x, css, xpath)
# Supply either a css selector or an xpath expression; one of the two is enough.
# Example css selector: div.show-detail p.house-title a
# Example xpath expression: xpath = "//p[@class='house-title']//a"
西域网的特种扳手品类:
基本可以确定,各分页的网址格式为 http://www.ehsy.com/category-16883?p=i(其中 i 为页码)。
## {html_document}
## <html>
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="layout-body category-result">\n<div class="layout-header-con ...
# Extract the product fields from the already-parsed listing page `web`
# and assemble them into one tibble, one row per product.

# Product names.
name <- web %>%
  html_nodes(".p-name .high-light") %>%
  html_text()

# SKU codes, with surrounding whitespace trimmed.
sku <- web %>%
  html_nodes(".product-parameter .high-light") %>%
  html_text(trim = TRUE)

# Prices: keep every second `.yen` match (positions 2, 4, 6, ...).
# A recycled logical index is used instead of a fixed seq(2, 72, 2) so that
# a page with fewer (or zero) products does not produce NA prices and a
# column-length mismatch in tibble().
# NOTE(review): presumably each product renders two `.yen` nodes and the
# second one is the wanted price; confirm against the page markup.
price <- web %>%
  html_nodes(".yen") %>%
  html_text(trim = TRUE) %>%
  .[c(FALSE, TRUE)]

# Stock status text.
stock <- web %>%
  html_nodes(".stock") %>%
  html_text()

df <- tibble(name, price, stock, sku)

# Check which character encoding the scraped stock text most likely uses.
guess_encoding(stock)
## # A tibble: 2 x 2
## encoding confidence
## <chr> <dbl>
## 1 UTF-8 1
## 2 windows-1252 0.42
## Best guess: UTF-8 (100% confident)
## [1] "有库存" "有库存" "有库存" "有库存" "有库存" "有库存" "有库存" "有库存"
## [9] "有库存" "有库存" "有库存" "有库存" "有库存" "有库存" "有库存" "有库存"
## [17] "有库存" "有库存" "有库存" "有库存" "有库存" "有库存" "有库存" "有库存"
## [25] "有库存" "有库存" "有库存" "有库存" "有库存" "有库存" "有库存" "有库存"
## [33] "有库存" "有库存" "有库存" "有库存"
# Scrape listing pages 1-20 of category 16883 one after another, timing the
# whole run with tictoc. The per-page tibbles are row-bound into `data`.
tictoc::tic()
data <- map(1:20, function(i) {
  # Listing pages differ only in the `p` query parameter.
  url <- str_c("http://www.ehsy.com/category-16883?p=", i, sep = "")
  web <- read_html(url)

  name <- web %>%
    html_nodes(".p-name .high-light") %>%
    html_text()

  sku <- web %>%
    html_nodes(".product-parameter .high-light") %>%
    html_text(trim = TRUE)

  # Keep every second `.yen` match (positions 2, 4, 6, ...). A recycled
  # logical index replaces the fixed seq(2, 72, 2), which assumed exactly
  # 36 products per page and broke tibble() on shorter pages.
  # NOTE(review): presumably each product renders two `.yen` nodes; confirm.
  yen_text <- web %>%
    html_nodes(".yen") %>%
    html_text(trim = TRUE)
  price <- yen_text[c(FALSE, TRUE)]

  stock <- web %>%
    html_nodes(".stock") %>%
    html_text()

  # One row per product on this page.
  tibble(name, price, stock, sku)
}) %>%
  bind_rows()
tictoc::toc()
## 12.97 sec elapsed
15.39 sec elapsed
## [1] 8
# Scrape listing pages 1-20 of category 16883 through furrr::future_map,
# timing the whole run with tictoc. The per-page tibbles are row-bound into
# `data`.
# NOTE(review): future_map only runs in parallel when a future plan (e.g.
# multisession) has been set; no plan() call is visible in this section, so
# confirm that it is configured earlier.
tictoc::tic()
data <- furrr::future_map(1:20, function(i) {
  # Listing pages differ only in the `p` query parameter.
  url <- str_c("http://www.ehsy.com/category-16883?p=", i, sep = "")
  web <- read_html(url)

  name <- web %>%
    html_nodes(".p-name .high-light") %>%
    html_text()

  sku <- web %>%
    html_nodes(".product-parameter .high-light") %>%
    html_text(trim = TRUE)

  # Keep every second `.yen` match (positions 2, 4, 6, ...). A recycled
  # logical index replaces the fixed seq(2, 72, 2), which assumed exactly
  # 36 products per page and broke tibble() on shorter pages.
  # NOTE(review): presumably each product renders two `.yen` nodes; confirm.
  yen_text <- web %>%
    html_nodes(".yen") %>%
    html_text(trim = TRUE)
  price <- yen_text[c(FALSE, TRUE)]

  stock <- web %>%
    html_nodes(".stock") %>%
    html_text()

  # One row per product on this page.
  tibble(name, price, stock, sku)
}) %>%
  bind_rows()
tictoc::toc()
## 4.63 sec elapsed
1.92 sec elapsed!优秀优秀!