go colly 爬虫博客到wordpress
https://www.aispider.cc
Du kan inte välja fler än 25 ämnen
Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.
45 rader
1.3 KiB
45 rader
1.3 KiB
package spider
|
|
|
|
import (
|
|
"fmt"
|
|
"github.com/antchfx/htmlquery"
|
|
"github.com/gocolly/colly/v2"
|
|
"strings"
|
|
)
|
|
|
|
func SpiderAiprose() {
|
|
c := colly.NewCollector(colly.AllowedDomains("www.aiprose.com"))
|
|
//c := colly.NewCollector()
|
|
c.Async = true
|
|
// Find and visit all links
|
|
c.OnHTML(".home-content-title a[href]", func(e *colly.HTMLElement) {
|
|
e.Request.Visit(e.Attr("href"))
|
|
})
|
|
|
|
c.OnHTML(".pagination .next a[href]", func(e *colly.HTMLElement) {
|
|
e.Request.Visit(e.Attr("href"))
|
|
})
|
|
|
|
c.OnHTML(".blog-root", func(e *colly.HTMLElement) {
|
|
title := strings.TrimSpace(e.DOM.Find(".blog-title").Eq(0).Text())
|
|
//author:=strings.TrimSpace(e.DOM.Find(".author-info a").Eq(0).Text())
|
|
time := strings.TrimSpace(e.DOM.Find(".author-info span").Eq(0).Text())
|
|
content := strings.TrimSpace(e.DOM.Find(".blog-detaile").Eq(0).Text())
|
|
nodes := e.DOM.Find(".blog-label label").Nodes
|
|
var tags []string
|
|
for _, node := range nodes {
|
|
tags = append(tags, htmlquery.InnerText(node))
|
|
}
|
|
saveBlog(title, content, time, tags)
|
|
//println(title +author+ time)
|
|
})
|
|
|
|
c.OnRequest(func(r *colly.Request) {
|
|
fmt.Println("Visiting", r.URL)
|
|
})
|
|
|
|
c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
|
|
c.Visit("https://www.aiprose.com/blogs")
|
|
//c.Visit("https://www.aiprose.com/blog/139")
|
|
c.Wait()
|
|
}
|
|
|