diff --git a/go.mod b/go.mod
index 902573b..3d2df28 100644
--- a/go.mod
+++ b/go.mod
@@ -7,7 +7,7 @@ require (
 	github.com/andybalholm/cascadia v1.3.1 // indirect
 	github.com/antchfx/htmlquery v1.2.4
 	github.com/antchfx/xmlquery v1.3.7 // indirect
-	github.com/go-basic/uuid v1.0.0 // indirect
+	github.com/go-basic/uuid v1.0.0
 	github.com/gocolly/colly/v2 v2.1.0
 	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
 	github.com/golang/protobuf v1.5.2 // indirect
diff --git a/main.go b/main.go
index 4138d6c..6486b8b 100644
--- a/main.go
+++ b/main.go
@@ -3,5 +3,6 @@ package main
 import "go-wordpress-xmlrpc/spider"
 
 func main() {
-	spider.SpiderAiprose()
+	//spider.SpiderAiprose()
+	spider.SpiderBitree()
 }
diff --git a/spider/blog.go b/spider/blog.go
index 5a7365b..1a63916 100644
--- a/spider/blog.go
+++ b/spider/blog.go
@@ -8,8 +8,8 @@ import (
 func saveBlog(title string, content string, postDate string, tags []string, catelogs []string) {
 	c, err := NewClient(`http://49.235.160.131/xmlrpc.php`, UserInfo{
 		//c, err := NewClient(`https://www.aispider.cc/xmlrpc.php`, UserInfo{
-		`nelson`,
-		`Yasaka.00`,
+		`你的账户`,
+		`你的密码`,
 	})
 	if err != nil {
 		log.Fatalln(err)
diff --git a/spider/spider.go b/spider/spider.go
index acffae4..3a6a3a0 100644
--- a/spider/spider.go
+++ b/spider/spider.go
@@ -5,8 +5,10 @@ import (
 	"github.com/antchfx/htmlquery"
 	"github.com/gocolly/colly/v2"
 	"strings"
+	"time"
 )
 
+// SpiderAiprose crawls https://www.aiprose.com/.
 func SpiderAiprose() {
 	c := colly.NewCollector(colly.AllowedDomains("www.aiprose.com"), colly.Async(true))
 	c.OnHTML(".home-content-title a[href]", func(e *colly.HTMLElement) {
@@ -37,3 +39,41 @@ func SpiderAiprose() {
 	//c.Visit("https://www.aiprose.com/blog/143")
 	c.Wait()
 }
+
+// SpiderBitree visits the CSDN blog list page https://blog.csdn.net/bitree1?type=blog and prints the title of each linked article.
+func SpiderBitree() {
+	c := colly.NewCollector(colly.AllowedDomains("blog.csdn.net"), colly.Async(false))
+	c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
+	c.Limit(&colly.LimitRule{ // NOTE(review): no DomainGlob/DomainRegexp is set and the returned error is ignored; confirm this rule is actually accepted by colly
+		Delay: 10000 * time.Minute, // 10000 minutes between requests as written; TODO confirm the intended unit
+		//RandomDelay: 2 * time.Minute,
+		Parallelism: 1,
+	})
+	c2 := c.Clone()
+	// Async flag: keep the cloned collector synchronous.
+	c2.Async = false
+	// Rate limit for the article collector.
+	c2.Limit(&colly.LimitRule{
+		DomainGlob:  "blog.csdn.net/bitree1/article/details/*", // NOTE(review): glob contains a URL path; confirm whether colly matches it against the host only
+		Delay:       10000 * time.Minute, // same 10000-minute value as the list collector; TODO confirm
+		Parallelism: 1,
+	})
+	c.OnHTML(".blog-list-box a[href]", func(e *colly.HTMLElement) {
+		c2.Request("GET", e.Attr("href"), nil, nil, nil) // hands each list link to c2; the href is used as-is and the returned error is dropped
+	})
+	c2.OnHTML(".blog-content-box", func(e *colly.HTMLElement) {
+		//fmt.Println("获取到文章") // disabled debug print; the message means "article fetched"
+		title := strings.TrimSpace(e.DOM.Find(".title-article").Eq(0).Text())
+		fmt.Println(title)
+	})
+	c2.OnRequest(func(r *colly.Request) {
+		fmt.Println("Visiting", r.URL)
+	})
+	c2.OnError(func(r *colly.Response, err error) {
+		fmt.Println("Request URL:", r.Request.URL.String(), "failed with response:", r, "\nError:", err)
+		c2.Request("GET", r.Request.URL.String(), nil, nil, nil) // re-requests the failed URL on every error with no retry cap; NOTE(review): confirm this is not rejected as an already-visited URL
+	})
+	c.Visit("https://blog.csdn.net/bitree1?type=blog") // entry point: the blog list page; the returned error is not checked
+	c.Wait()
+	c2.Wait()
+}