go colly 爬虫博客到wordpress https://www.aispider.cc
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

78 lines
2.6 KiB

package spider
import (
"fmt"
"github.com/antchfx/htmlquery"
"github.com/gocolly/colly/v2"
"strings"
"time"
)
// SpiderAiprose crawls the blog at https://www.aiprose.com/ and passes every
// article it finds to saveBlog.
//
// The crawl starts at the /blogs listing, follows article links and the
// pagination "next" link, and runs asynchronously. The function blocks until
// all queued requests have completed. Failures are reported on stdout.
func SpiderAiprose() {
	const startURL = "https://www.aiprose.com/blogs"

	c := colly.NewCollector(colly.AllowedDomains("www.aiprose.com"), colly.Async(true))
	c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"

	// follow queues the link target of the matched anchor. It is shared by the
	// article-link and pagination handlers.
	follow := func(e *colly.HTMLElement) {
		// The error is dropped on purpose: the same link can be matched on
		// several pages, and colly refuses URLs it has already visited
		// (AllowURLRevisit is off by default). Request failures are still
		// reported through the OnError handler below.
		_ = e.Request.Visit(e.Attr("href"))
	}
	c.OnHTML(".home-content-title a[href]", follow)
	c.OnHTML(".pagination .next a[href]", follow)

	// Article page: extract the fields and persist them.
	c.OnHTML(".blog-root", func(e *colly.HTMLElement) {
		title := strings.TrimSpace(e.DOM.Find(".blog-title").Eq(0).Text())
		// The author (".author-info a") is currently not collected.
		// Named "published" rather than "time" so the time package is not shadowed.
		published := strings.TrimSpace(e.DOM.Find(".author-info span").Eq(0).Text())
		category := strings.TrimSpace(e.DOM.Find(".author-info .catelog-name").Text())
		content := strings.TrimSpace(e.DOM.Find(".blog-detaile").Eq(0).Text())

		// tags stays nil when the article has no labels, as before.
		var tags []string
		for _, node := range e.DOM.Find(".blog-label label").Nodes {
			tags = append(tags, htmlquery.InnerText(node))
		}

		saveBlog(title, content, published, tags, []string{category})
	})

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	// Surface failed requests instead of losing them silently.
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with status:", r.StatusCode, "\nError:", err)
	})

	// For debugging a single article, start at
	// https://www.aiprose.com/blog/143 instead of the listing.
	if err := c.Visit(startURL); err != nil {
		fmt.Println("Visit", startURL, "failed:", err)
	}
	c.Wait()
}
// SpiderBitree crawls the CSDN blog at https://blog.csdn.net/bitree1?type=blog
// and prints the title of every article linked from the listing page.
//
// Two synchronous collectors are used: c walks the listing page and c2
// fetches the individual articles. Requests are throttled, failed article
// requests are retried a bounded number of times, and errors are reported on
// stdout.
func SpiderBitree() {
	const (
		startURL = "https://blog.csdn.net/bitree1?type=blog"
		// requestDelay is the pause after each request. The previous value
		// was 10000 * time.Minute (about a week); presumably 10000 ms was
		// intended. NOTE(review): confirm the desired delay.
		requestDelay = 10 * time.Second
		// maxRetries bounds how often one failed article request is retried.
		maxRetries = 3
		// retryKey is the request-context key holding the retry counter.
		retryKey = "retries"
	)

	c := colly.NewCollector(colly.AllowedDomains("blog.csdn.net"), colly.Async(false))
	c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"

	// Rate limit. A LimitRule needs a domain pattern and is matched against
	// the request host, so the glob names the host only (no URL path).
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "blog.csdn.net",
		Delay:       requestDelay,
		Parallelism: 1,
	}); err != nil {
		fmt.Println("Setting rate limit failed:", err)
		return
	}

	// Clone copies the configuration (including Async) and shares the HTTP
	// backend with c, so the rate limit above applies to c2 as well.
	c2 := c.Clone()

	// Listing page: hand each article link to the article collector.
	c.OnHTML(".blog-list-box a[href]", func(e *colly.HTMLElement) {
		// Resolve relative links against the page URL; Collector.Request
		// does not do this itself.
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link == "" {
			return
		}
		if err := c2.Request("GET", link, nil, nil, nil); err != nil {
			fmt.Println("Requesting", link, "failed:", err)
		}
	})

	// Article page: print the title.
	c2.OnHTML(".blog-content-box", func(e *colly.HTMLElement) {
		title := strings.TrimSpace(e.DOM.Find(".title-article").Eq(0).Text())
		fmt.Println(title)
	})

	c2.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	c2.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL.String(), "failed with response:", r.StatusCode, "\nError:", err)

		// Retry through Request.Retry rather than a fresh c2.Request: the URL
		// is already recorded as visited, so a new Request would be refused.
		// The counter lives in the request context, which Retry reuses.
		attempts, _ := r.Request.Ctx.GetAny(retryKey).(int)
		if attempts >= maxRetries {
			return
		}
		r.Request.Ctx.Put(retryKey, attempts+1)
		if retryErr := r.Request.Retry(); retryErr != nil {
			fmt.Println("Retrying", r.Request.URL.String(), "failed:", retryErr)
		}
	})

	if err := c.Visit(startURL); err != nil {
		fmt.Println("Visit", startURL, "failed:", err)
	}
	// Both collectors are synchronous, so these return immediately; they are
	// kept so the function stays correct if Async is ever enabled.
	c.Wait()
	c2.Wait()
}