Browse Source

csdn爬取

master
燕鹏 3 years ago
parent
commit
fdcac290d1
  1. 2
      go.mod
  2. 3
      main.go
  3. 4
      spider/blog.go
  4. 40
      spider/spider.go

2
go.mod

@ -7,7 +7,7 @@ require (
github.com/andybalholm/cascadia v1.3.1 // indirect github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/antchfx/htmlquery v1.2.4 github.com/antchfx/htmlquery v1.2.4
github.com/antchfx/xmlquery v1.3.7 // indirect github.com/antchfx/xmlquery v1.3.7 // indirect
github.com/go-basic/uuid v1.0.0 // indirect github.com/go-basic/uuid v1.0.0
github.com/gocolly/colly/v2 v2.1.0 github.com/gocolly/colly/v2 v2.1.0
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.2 // indirect github.com/golang/protobuf v1.5.2 // indirect

3
main.go

@ -3,5 +3,6 @@ package main
import "go-wordpress-xmlrpc/spider" import "go-wordpress-xmlrpc/spider"
func main() { func main() {
spider.SpiderAiprose() //spider.SpiderAiprose()
spider.SpiderBitree()
} }

4
spider/blog.go

@ -8,8 +8,8 @@ import (
func saveBlog(title string, content string, postDate string, tags []string, catelogs []string) { func saveBlog(title string, content string, postDate string, tags []string, catelogs []string) {
c, err := NewClient(`http://49.235.160.131/xmlrpc.php`, UserInfo{ c, err := NewClient(`http://49.235.160.131/xmlrpc.php`, UserInfo{
//c, err := NewClient(`https://www.aispider.cc/xmlrpc.php`, UserInfo{ //c, err := NewClient(`https://www.aispider.cc/xmlrpc.php`, UserInfo{
`nelson`, `你的账户`,
`Yasaka.00`, `你的密码`,
}) })
if err != nil { if err != nil {
log.Fatalln(err) log.Fatalln(err)

40
spider/spider.go

@ -5,8 +5,10 @@ import (
"github.com/antchfx/htmlquery" "github.com/antchfx/htmlquery"
"github.com/gocolly/colly/v2" "github.com/gocolly/colly/v2"
"strings" "strings"
"time"
) )
// SpiderAiprose crawls https://www.aiprose.com/.
func SpiderAiprose() { func SpiderAiprose() {
c := colly.NewCollector(colly.AllowedDomains("www.aiprose.com"), colly.Async(true)) c := colly.NewCollector(colly.AllowedDomains("www.aiprose.com"), colly.Async(true))
c.OnHTML(".home-content-title a[href]", func(e *colly.HTMLElement) { c.OnHTML(".home-content-title a[href]", func(e *colly.HTMLElement) {
@ -37,3 +39,41 @@ func SpiderAiprose() {
//c.Visit("https://www.aiprose.com/blog/143") //c.Visit("https://www.aiprose.com/blog/143")
c.Wait() c.Wait()
} }
// SpiderBitree crawls the CSDN blog listing at
// https://blog.csdn.net/bitree1?type=blog and prints the title of every
// article linked from it.
//
// Two collectors are used: c walks the listing page and c2 fetches the
// individual articles. Both run synchronously, so every callback below
// executes on the calling goroutine.
func SpiderBitree() {
	const (
		// requestDelay is the pause enforced after each request to the blog
		// host. NOTE(review): the previous rules asked for
		// 10000 * time.Minute (about a week) but never matched any request;
		// ten seconds is an assumed intent — confirm.
		requestDelay = 10 * time.Second
		// maxRetries bounds how often a single failed article URL is
		// requested again.
		maxRetries = 3
	)

	c := colly.NewCollector(colly.AllowedDomains("blog.csdn.net"), colly.Async(false))
	c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"

	// Rate limiting. A LimitRule needs a DomainGlob or DomainRegexp and is
	// matched against the request host only, so the pattern must be the bare
	// domain (a pattern containing a URL path never matches). The rule is
	// installed before cloning because Clone shares the HTTP backend, which
	// is where limit rules live; it therefore throttles c2 as well.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob: "blog.csdn.net",
		Delay:      requestDelay,
		//RandomDelay: 2 * time.Minute,
		Parallelism: 1,
	}); err != nil {
		fmt.Println("Limit rule rejected:", err)
		return
	}

	c2 := c.Clone()
	// Keep the article collector synchronous.
	c2.Async = false

	// retries counts re-requests per article URL. A plain map is sufficient
	// because both collectors are synchronous.
	retries := make(map[string]int)

	c.OnHTML(".blog-list-box a[href]", func(e *colly.HTMLElement) {
		// Resolve relative links against the listing page URL.
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link == "" {
			return
		}
		// Request failures are reported (and retried) by c2's OnError
		// callback, so the returned error is deliberately not handled a
		// second time here.
		_ = c2.Request("GET", link, nil, nil, nil)
	})

	c2.OnHTML(".blog-content-box", func(e *colly.HTMLElement) {
		title := strings.TrimSpace(e.DOM.Find(".title-article").Eq(0).Text())
		fmt.Println(title)
	})

	c2.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	c2.OnError(func(r *colly.Response, err error) {
		failed := r.Request.URL.String()
		fmt.Println("Request URL:", failed, "failed with response:", r, "\nError:", err)

		// Retry a bounded number of times; retrying unconditionally would
		// recurse forever on a permanently failing URL.
		if retries[failed] >= maxRetries {
			fmt.Println("Giving up on", failed, "after", maxRetries, "retries")
			return
		}
		retries[failed]++
		// A failed retry re-enters this callback, which logs it.
		_ = c2.Request("GET", failed, nil, nil, nil)
	})

	if err := c.Visit("https://blog.csdn.net/bitree1?type=blog"); err != nil {
		fmt.Println("Visit failed:", err)
	}
	c.Wait()
	c2.Wait()
}

Loading…
Cancel
Save