package spider
import (
"fmt"
"github.com/antchfx/htmlquery"
"github.com/gocolly/colly/v2"
"strings"
"time"
)
// SpiderAiprose crawls the blog section of https://www.aiprose.com/.
//
// Starting from the /blogs listing it follows every article link and the
// "next page" link. For each article page it extracts the title, the first
// author-info span (presumably the publish time — verify against the site
// markup), the catalog name, the body text and the tag labels, and passes
// them to saveBlog. Requests run asynchronously; the function returns once
// all queued requests have finished.
func SpiderAiprose() {
	c := colly.NewCollector(colly.AllowedDomains("www.aiprose.com"), colly.Async(true))
	c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"

	// follow queues the link carried by the matched <a> element.
	follow := func(e *colly.HTMLElement) {
		// The fetch itself is asynchronous, so Visit can only fail up front
		// (for example on an already-visited URL); such links are meant to be
		// skipped. Fetch failures are reported by the OnError callback.
		_ = e.Request.Visit(e.Attr("href"))
	}
	c.OnHTML(".home-content-title a[href]", follow) // article links on a listing page
	c.OnHTML(".pagination .next a[href]", follow)   // link to the next listing page

	c.OnHTML(".blog-root", func(e *colly.HTMLElement) {
		// firstText returns the trimmed text of the first element matching sel.
		firstText := func(sel string) string {
			return strings.TrimSpace(e.DOM.Find(sel).Eq(0).Text())
		}

		title := firstText(".blog-title")
		// The author name is available as firstText(".author-info a") but is
		// currently not stored.
		// Named "published" rather than "time" so the time package is not shadowed.
		published := firstText(".author-info span")
		content := firstText(".blog-detaile")
		catelog := strings.TrimSpace(e.DOM.Find(".author-info .catelog-name").Text())
		catelogs := []string{catelog}

		var tags []string
		for _, node := range e.DOM.Find(".blog-label label").Nodes {
			tags = append(tags, htmlquery.InnerText(node))
		}

		saveBlog(title, content, published, tags, catelogs)
	})

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	// Without this callback failed asynchronous requests would go unnoticed.
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed:", err)
	})

	// To debug a single article, visit e.g. "https://www.aiprose.com/blog/143" instead.
	if err := c.Visit("https://www.aiprose.com/blogs"); err != nil {
		fmt.Println("Visiting https://www.aiprose.com/blogs failed:", err)
	}
	c.Wait()
}
// SpiderBitree crawls the CSDN blog at https://blog.csdn.net/bitree1?type=blog
// and prints the title of every article linked from the listing page.
//
// Two synchronous collectors are used: c fetches the listing page and hands
// each article link to c2, which fetches the article and prints its title.
// Failed article requests are retried a bounded number of times.
func SpiderBitree() {
	const (
		domain = "blog.csdn.net"
		// NOTE(review): the previous value was 10000 * time.Minute (about a
		// week between requests); 10 seconds is assumed to be the intent.
		requestDelay = 10 * time.Second
		maxRetries   = 3
		retryKey     = "retries"
	)

	c := colly.NewCollector(colly.AllowedDomains(domain), colly.Async(false))
	c.UserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"

	// Rate limit. A LimitRule must carry a domain pattern (a rule without one
	// is rejected) and the pattern is matched against the host only, so it
	// must not contain a URL path. Clone shares the HTTP backend, so this one
	// rule throttles both collectors.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  domain,
		Delay:       requestDelay,
		Parallelism: 1,
	}); err != nil {
		fmt.Println("Setting rate limit failed:", err)
		return
	}

	c2 := c.Clone()
	// Keep the article collector synchronous as well.
	c2.Async = false

	c.OnHTML(".blog-list-box a[href]", func(e *colly.HTMLElement) {
		// Resolve relative links against the page URL before requesting them.
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link == "" {
			return
		}
		// Fetch failures are reported and retried by c2's OnError callback;
		// the remaining errors are up-front rejections (for example an
		// already-visited URL or a foreign domain) that are meant to be skipped.
		_ = c2.Request("GET", link, nil, nil, nil)
	})

	c2.OnHTML(".blog-content-box", func(e *colly.HTMLElement) {
		title := strings.TrimSpace(e.DOM.Find(".title-article").Eq(0).Text())
		fmt.Println(title)
	})

	c2.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	c2.OnError(func(r *colly.Response, err error) {
		failedURL := r.Request.URL.String()
		fmt.Println("Request URL:", failedURL, "failed with response:", r, "\nError:", err)

		// The retry counter travels with the request context, which Retry reuses.
		attempts, _ := r.Request.Ctx.GetAny(retryKey).(int)
		if attempts >= maxRetries {
			fmt.Println("Giving up on", failedURL, "after", attempts, "retries")
			return
		}
		r.Request.Ctx.Put(retryKey, attempts+1)
		// Retry bypasses the already-visited check that made re-requesting the
		// same URL a no-op. A failed retry fetch re-enters this callback, so
		// the returned error has already been reported there.
		_ = r.Request.Retry()
	})

	if err := c.Visit("https://blog.csdn.net/bitree1?type=blog"); err != nil {
		fmt.Println("Visiting https://blog.csdn.net/bitree1?type=blog failed:", err)
	}
	c.Wait()
	c2.Wait()
}