Browse Source

爬虫到正式库

master
nelson 3 years ago
parent
commit
0398bf368d
  1. 5
      go.mod
  2. 5
      spider/blog.go
  3. 5
      spider/spider.go
  4. 3
      wordpress/wp_post.go

5
go.mod

@ -5,9 +5,10 @@ go 1.16
require ( require (
github.com/PuerkitoBio/goquery v1.7.1 // indirect github.com/PuerkitoBio/goquery v1.7.1 // indirect
github.com/andybalholm/cascadia v1.3.1 // indirect github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/antchfx/htmlquery v1.2.4 // indirect github.com/antchfx/htmlquery v1.2.4
github.com/antchfx/xmlquery v1.3.7 // indirect github.com/antchfx/xmlquery v1.3.7 // indirect
github.com/gocolly/colly/v2 v2.1.0 // indirect github.com/go-basic/uuid v1.0.0 // indirect
github.com/gocolly/colly/v2 v2.1.0
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.2 // indirect github.com/golang/protobuf v1.5.2 // indirect
github.com/kolo/xmlrpc v0.0.0-20201022064351-38db28db192b github.com/kolo/xmlrpc v0.0.0-20201022064351-38db28db192b

5
spider/blog.go

@ -5,15 +5,16 @@ import (
"log" "log"
) )
func saveBlog(title string, content string, postDate string, tags []string) { func saveBlog(title string, content string, postDate string, tags []string, catelogs []string) {
c, err := NewClient(`http://49.235.160.131/xmlrpc.php`, UserInfo{ c, err := NewClient(`http://49.235.160.131/xmlrpc.php`, UserInfo{
//c, err := NewClient(`https://www.aispider.cc/xmlrpc.php`, UserInfo{
`nelson`, `nelson`,
`Yasaka.00`, `Yasaka.00`,
}) })
if err != nil { if err != nil {
log.Fatalln(err) log.Fatalln(err)
} }
p := wordpress.NewPost(title, content, postDate, tags, tags) p := wordpress.NewPost(title, content, postDate, tags, catelogs)
blogID, err := c.Call(p) blogID, err := c.Call(p)
if err != nil { if err != nil {
log.Println(err) log.Println(err)

5
spider/spider.go

@ -19,18 +19,21 @@ func SpiderAiprose() {
title := strings.TrimSpace(e.DOM.Find(".blog-title").Eq(0).Text()) title := strings.TrimSpace(e.DOM.Find(".blog-title").Eq(0).Text())
//author:=strings.TrimSpace(e.DOM.Find(".author-info a").Eq(0).Text()) //author:=strings.TrimSpace(e.DOM.Find(".author-info a").Eq(0).Text())
time := strings.TrimSpace(e.DOM.Find(".author-info span").Eq(0).Text()) time := strings.TrimSpace(e.DOM.Find(".author-info span").Eq(0).Text())
catelog := strings.TrimSpace(e.DOM.Find(".author-info .catelog-name").Text())
content := strings.TrimSpace(e.DOM.Find(".blog-detaile").Eq(0).Text()) content := strings.TrimSpace(e.DOM.Find(".blog-detaile").Eq(0).Text())
nodes := e.DOM.Find(".blog-label label").Nodes nodes := e.DOM.Find(".blog-label label").Nodes
catelogs := []string{catelog}
var tags []string var tags []string
for _, node := range nodes { for _, node := range nodes {
tags = append(tags, htmlquery.InnerText(node)) tags = append(tags, htmlquery.InnerText(node))
} }
saveBlog(title, content, time, tags) saveBlog(title, content, time, tags, catelogs)
}) })
c.OnRequest(func(r *colly.Request) { c.OnRequest(func(r *colly.Request) {
fmt.Println("Visiting", r.URL) fmt.Println("Visiting", r.URL)
}) })
c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36" c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
c.Visit("https://www.aiprose.com/blogs") c.Visit("https://www.aiprose.com/blogs")
//c.Visit("https://www.aiprose.com/blog/143")
c.Wait() c.Wait()
} }

3
wordpress/wp_post.go

@ -1,5 +1,7 @@
package wordpress package wordpress
import "github.com/go-basic/uuid"
type Post struct { type Post struct {
BlogID int BlogID int
PostContent PostContent
@ -64,6 +66,7 @@ func NewPost(title string, content string, postDate string, tags []string, cate
PostType: `post`, PostType: `post`,
PostStatus: `publish`, PostStatus: `publish`,
PostTitle: title, PostTitle: title,
PostName: uuid.New(),
PostContent: content, PostContent: content,
PostDate: postDate, PostDate: postDate,
//PostDate: time.Now().Format(`2006-01-02 15:04:05`), //PostDate: time.Now().Format(`2006-01-02 15:04:05`),

Loading…
Cancel
Save