Browse Source

aiprose 爬虫成功

master
燕鹏 3 years ago
parent
commit
1c4a575eaa
  1. 16
      go.mod
  2. 7
      main.go
  3. 22
      spider/blog.go
  4. 2
      spider/client.go
  5. 45
      spider/spider.go
  6. 26
      wordpress/wp_post.go

16
go.mod

@ -2,4 +2,18 @@ module go-wordpress-xmlrpc
go 1.16
require github.com/kolo/xmlrpc v0.0.0-20201022064351-38db28db192b
require (
github.com/PuerkitoBio/goquery v1.7.1 // indirect
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/antchfx/htmlquery v1.2.4 // indirect
github.com/antchfx/xmlquery v1.3.7 // indirect
github.com/gocolly/colly/v2 v2.1.0 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/kolo/xmlrpc v0.0.0-20201022064351-38db28db192b
github.com/temoto/robotstxt v1.1.2 // indirect
golang.org/x/net v0.0.0-20210927181540-4e4d966f7476 // indirect
golang.org/x/text v0.3.7 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.27.1 // indirect
)

7
main.go

@ -0,0 +1,7 @@
package main
import "go-wordpress-xmlrpc/spider"
// main is the program entry point; it runs the aiprose crawler by calling
// spider.SpiderAiprose.
func main() {
spider.SpiderAiprose()
}

22
spider/blog.go

@ -0,0 +1,22 @@
package spider
import (
"go-wordpress-xmlrpc/wordpress"
"log"
)
// saveBlog publishes one scraped article to the WordPress site through its
// XML-RPC endpoint and logs the value returned by the call.
//
// title, content and postDate are handed to wordpress.NewPost unchanged; tags
// is passed both as the tag list and as the category list.
//
// A failure to construct the client terminates the process (log.Fatalln). A
// failed publish call is logged and the function returns without logging a
// result.
//
// NOTE(review): the endpoint and the credentials are hard-coded in source.
// They should be moved to configuration and the exposed password rotated.
func saveBlog(title string, content string, postDate string, tags []string) {
	c, err := NewClient(`http://49.235.160.131/xmlrpc.php`, UserInfo{
		`nelson`,
		`Yasaka.00`,
	})
	if err != nil {
		log.Fatalln(err)
	}

	p := wordpress.NewPost(title, content, postDate, tags, tags)
	blogID, err := c.Call(p)
	if err != nil {
		// The returned value is meaningless when the call failed, so report
		// the error and stop instead of logging a bogus ID.
		log.Println(err)
		return
	}
	log.Println(blogID)
}

2
client.go → spider/client.go

@ -1,4 +1,4 @@
package xmlrpc
package spider
import (
"github.com/kolo/xmlrpc"

45
spider/spider.go

@ -0,0 +1,45 @@
package spider
import (
"fmt"
"github.com/antchfx/htmlquery"
"github.com/gocolly/colly/v2"
"strings"
)
// SpiderAiprose crawls www.aiprose.com starting from the /blogs listing and
// hands every article it finds to saveBlog.
//
// The collector is restricted to the www.aiprose.com domain and runs
// asynchronously; the function blocks until all queued requests have finished.
// Article links (".home-content-title a") and the pagination "next" link are
// followed. On pages containing ".blog-root" the title, date text, body text
// and labels are extracted and saved.
//
// Errors from queueing a link and from failed requests are printed rather than
// silently dropped, so a crawl that saves nothing can be diagnosed.
func SpiderAiprose() {
	c := colly.NewCollector(colly.AllowedDomains("www.aiprose.com"))
	c.Async = true
	c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"

	// follow queues the link target of the matched anchor. Visit reports
	// filtered or already-visited URLs as errors; print them and carry on.
	follow := func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if err := e.Request.Visit(href); err != nil {
			fmt.Println("Skipping", href, err)
		}
	}

	// Article links on the listing page, then the "next page" link.
	c.OnHTML(".home-content-title a[href]", follow)
	c.OnHTML(".pagination .next a[href]", follow)

	// Article detail page: extract the fields and publish them.
	c.OnHTML(".blog-root", func(e *colly.HTMLElement) {
		title := strings.TrimSpace(e.DOM.Find(".blog-title").Eq(0).Text())
		postDate := strings.TrimSpace(e.DOM.Find(".author-info span").Eq(0).Text())
		content := strings.TrimSpace(e.DOM.Find(".blog-detaile").Eq(0).Text())

		var tags []string
		for _, node := range e.DOM.Find(".blog-label label").Nodes {
			tags = append(tags, htmlquery.InnerText(node))
		}
		saveBlog(title, content, postDate, tags)
	})

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	// Without this callback, failed requests would go completely unnoticed
	// in async mode.
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request failed", r.Request.URL, err)
	})

	if err := c.Visit("https://www.aiprose.com/blogs"); err != nil {
		// Nothing was queued, so there is nothing to wait for.
		fmt.Println("Cannot start crawl", err)
		return
	}
	c.Wait()
}

26
wordpress/wp_post.go

@ -1,9 +1,5 @@
package wordpress
import (
"time"
)
type Post struct {
BlogID int
PostContent
@ -63,13 +59,31 @@ func (p Post) GetArgs(user string, pwd string) interface{} {
return args
}
func NewPost(content string, title string, tags []string, cate []string) (p Post) {
// NewPost builds a Post of type "post" with status "publish" from the given
// title, content and post date. tags populates TagsInput and cate populates
// PostCategory; every other field of the returned Post keeps its zero value.
//
// postDate is stored as supplied (an earlier revision filled it with the
// current time formatted as `2006-01-02 15:04:05`).
func NewPost(title string, content string, postDate string, tags []string, cate []string) (p Post) {
	terms := TermsNames{
		PostCategory: cate,
		TagsInput:    tags,
	}
	body := PostContent{
		PostType:    `post`,
		PostStatus:  `publish`,
		PostTitle:   title,
		PostContent: content,
		PostDate:    postDate,
		TermsNames:  terms,
	}
	p.PostContent = body
	return p
}
func NewPostAuthor(title string, content string, postDate string, postAuthor int, tags []string, cate []string) (p Post) {
p.PostContent = PostContent{
PostType: `post`,
PostStatus: `publish`,
PostTitle: title,
PostContent: content,
PostDate: time.Now().Format(`2006-01-02 15:04:05`),
PostAuthor: postAuthor,
PostDate: postDate,
//PostDate: time.Now().Format(`2006-01-02 15:04:05`),
TermsNames: TermsNames{
PostCategory: cate,
TagsInput: tags,

Loading…
Cancel
Save