|
|
@ -5,8 +5,10 @@ import ( |
|
|
|
"github.com/antchfx/htmlquery" |
|
|
|
"github.com/gocolly/colly/v2" |
|
|
|
"strings" |
|
|
|
"time" |
|
|
|
) |
|
|
|
|
|
|
|
// SpiderAiprose crawls https://www.aiprose.com/.
|
|
|
|
func SpiderAiprose() { |
|
|
|
c := colly.NewCollector(colly.AllowedDomains("www.aiprose.com"), colly.Async(true)) |
|
|
|
c.OnHTML(".home-content-title a[href]", func(e *colly.HTMLElement) { |
|
|
@ -37,3 +39,41 @@ func SpiderAiprose() { |
|
|
|
//c.Visit("https://www.aiprose.com/blog/143")
|
|
|
|
c.Wait() |
|
|
|
} |
|
|
|
|
|
|
|
/// csdn爬取 https://blog.csdn.net/bitree1?type=blog
|
|
|
|
func SpiderBitree() { |
|
|
|
c := colly.NewCollector(colly.AllowedDomains("blog.csdn.net"), colly.Async(false)) |
|
|
|
c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36" |
|
|
|
c.Limit(&colly.LimitRule{ |
|
|
|
Delay: 10000 * time.Minute, |
|
|
|
//RandomDelay: 2 * time.Minute,
|
|
|
|
Parallelism: 1, |
|
|
|
}) |
|
|
|
c2 := c.Clone() |
|
|
|
//异步
|
|
|
|
c2.Async = false |
|
|
|
//限速
|
|
|
|
c2.Limit(&colly.LimitRule{ |
|
|
|
DomainGlob: "blog.csdn.net/bitree1/article/details/*", |
|
|
|
Delay: 10000 * time.Minute, |
|
|
|
Parallelism: 1, |
|
|
|
}) |
|
|
|
c.OnHTML(".blog-list-box a[href]", func(e *colly.HTMLElement) { |
|
|
|
c2.Request("GET", e.Attr("href"), nil, nil, nil) |
|
|
|
}) |
|
|
|
c2.OnHTML(".blog-content-box", func(e *colly.HTMLElement) { |
|
|
|
//fmt.Println("获取到文章")
|
|
|
|
title := strings.TrimSpace(e.DOM.Find(".title-article").Eq(0).Text()) |
|
|
|
fmt.Println(title) |
|
|
|
}) |
|
|
|
c2.OnRequest(func(r *colly.Request) { |
|
|
|
fmt.Println("Visiting", r.URL) |
|
|
|
}) |
|
|
|
c2.OnError(func(r *colly.Response, err error) { |
|
|
|
fmt.Println("Request URL:", r.Request.URL.String(), "failed with response:", r, "\nError:", err) |
|
|
|
c2.Request("GET", r.Request.URL.String(), nil, nil, nil) |
|
|
|
}) |
|
|
|
c.Visit("https://blog.csdn.net/bitree1?type=blog") |
|
|
|
c.Wait() |
|
|
|
c2.Wait() |
|
|
|
} |
|
|
|