浏览代码

爬虫到正式库

master
nelson 3 年前
父节点
当前提交
0398bf368d
  1. 5
      go.mod
  2. 5
      spider/blog.go
  3. 5
      spider/spider.go
  4. 3
      wordpress/wp_post.go

5
go.mod

@@ -5,9 +5,10 @@ go 1.16
require (
github.com/PuerkitoBio/goquery v1.7.1 // indirect
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/antchfx/htmlquery v1.2.4 // indirect
github.com/antchfx/htmlquery v1.2.4
github.com/antchfx/xmlquery v1.3.7 // indirect
github.com/gocolly/colly/v2 v2.1.0 // indirect
github.com/go-basic/uuid v1.0.0 // indirect
github.com/gocolly/colly/v2 v2.1.0
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/kolo/xmlrpc v0.0.0-20201022064351-38db28db192b

5
spider/blog.go

@@ -5,15 +5,16 @@ import (
"log"
)
func saveBlog(title string, content string, postDate string, tags []string) {
func saveBlog(title string, content string, postDate string, tags []string, catelogs []string) {
c, err := NewClient(`http://49.235.160.131/xmlrpc.php`, UserInfo{
//c, err := NewClient(`https://www.aispider.cc/xmlrpc.php`, UserInfo{
`nelson`,
`Yasaka.00`,
})
if err != nil {
log.Fatalln(err)
}
p := wordpress.NewPost(title, content, postDate, tags, tags)
p := wordpress.NewPost(title, content, postDate, tags, catelogs)
blogID, err := c.Call(p)
if err != nil {
log.Println(err)

5
spider/spider.go

@@ -19,18 +19,21 @@ func SpiderAiprose() {
title := strings.TrimSpace(e.DOM.Find(".blog-title").Eq(0).Text())
//author:=strings.TrimSpace(e.DOM.Find(".author-info a").Eq(0).Text())
time := strings.TrimSpace(e.DOM.Find(".author-info span").Eq(0).Text())
catelog := strings.TrimSpace(e.DOM.Find(".author-info .catelog-name").Text())
content := strings.TrimSpace(e.DOM.Find(".blog-detaile").Eq(0).Text())
nodes := e.DOM.Find(".blog-label label").Nodes
catelogs := []string{catelog}
var tags []string
for _, node := range nodes {
tags = append(tags, htmlquery.InnerText(node))
}
saveBlog(title, content, time, tags)
saveBlog(title, content, time, tags, catelogs)
})
c.OnRequest(func(r *colly.Request) {
fmt.Println("Visiting", r.URL)
})
c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
c.Visit("https://www.aiprose.com/blogs")
//c.Visit("https://www.aiprose.com/blog/143")
c.Wait()
}

3
wordpress/wp_post.go

@@ -1,5 +1,7 @@
package wordpress
import "github.com/go-basic/uuid"
type Post struct {
BlogID int
PostContent
@@ -64,6 +66,7 @@ func NewPost(title string, content string, postDate string, tags []string, cate
PostType: `post`,
PostStatus: `publish`,
PostTitle: title,
PostName: uuid.New(),
PostContent: content,
PostDate: postDate,
//PostDate: time.Now().Format(`2006-01-02 15:04:05`),

正在加载...
取消
保存