package spider import ( "fmt" "github.com/antchfx/htmlquery" "github.com/gocolly/colly/v2" "strings" "time" ) /// https://www.aiprose.com/ 爬取 func SpiderAiprose() { c := colly.NewCollector(colly.AllowedDomains("www.aiprose.com"), colly.Async(true)) c.OnHTML(".home-content-title a[href]", func(e *colly.HTMLElement) { e.Request.Visit(e.Attr("href")) }) c.OnHTML(".pagination .next a[href]", func(e *colly.HTMLElement) { e.Request.Visit(e.Attr("href")) }) c.OnHTML(".blog-root", func(e *colly.HTMLElement) { title := strings.TrimSpace(e.DOM.Find(".blog-title").Eq(0).Text()) //author:=strings.TrimSpace(e.DOM.Find(".author-info a").Eq(0).Text()) time := strings.TrimSpace(e.DOM.Find(".author-info span").Eq(0).Text()) catelog := strings.TrimSpace(e.DOM.Find(".author-info .catelog-name").Text()) content := strings.TrimSpace(e.DOM.Find(".blog-detaile").Eq(0).Text()) nodes := e.DOM.Find(".blog-label label").Nodes catelogs := []string{catelog} var tags []string for _, node := range nodes { tags = append(tags, htmlquery.InnerText(node)) } saveBlog(title, content, time, tags, catelogs) }) c.OnRequest(func(r *colly.Request) { fmt.Println("Visiting", r.URL) }) c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36" c.Visit("https://www.aiprose.com/blogs") //c.Visit("https://www.aiprose.com/blog/143") c.Wait() } /// csdn爬取 https://blog.csdn.net/bitree1?type=blog func SpiderBitree() { c := colly.NewCollector(colly.AllowedDomains("blog.csdn.net"), colly.Async(false)) c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36" c.Limit(&colly.LimitRule{ Delay: 10000 * time.Minute, //RandomDelay: 2 * time.Minute, Parallelism: 1, }) c2 := c.Clone() //异步 c2.Async = false //限速 c2.Limit(&colly.LimitRule{ DomainGlob: "blog.csdn.net/bitree1/article/details/*", Delay: 10000 * time.Minute, Parallelism: 1, }) c.OnHTML(".blog-list-box a[href]", func(e *colly.HTMLElement) { c2.Request("GET", e.Attr("href"), nil, nil, nil) }) c2.OnHTML(".blog-content-box", func(e *colly.HTMLElement) { //fmt.Println("获取到文章") title := strings.TrimSpace(e.DOM.Find(".title-article").Eq(0).Text()) fmt.Println(title) }) c2.OnRequest(func(r *colly.Request) { fmt.Println("Visiting", r.URL) }) c2.OnError(func(r *colly.Response, err error) { fmt.Println("Request URL:", r.Request.URL.String(), "failed with response:", r, "\nError:", err) c2.Request("GET", r.Request.URL.String(), nil, nil, nil) }) c.Visit("https://blog.csdn.net/bitree1?type=blog") c.Wait() c2.Wait() }