Python爬蟲可能大家都玩膩了,那就玩一下Golang的爬蟲吧!
?
想獲取原圖或是 .xmind 格式,可在公眾號回復 Go爬蟲
Golang中提供了 net/http 這個包,原生支持 request 和 response。
// Zero-value http.Client that will send the request built below.
var client http.Client
// Build a GET request for URL with no body; err must be checked before reqList is used.
reqList, err := http.NewRequest("GET", URL, nil)
Go中提供了一個 cookiejar.New 的函數方法,用于保留生成的 Cookie 信息,這個是為了一些網站要登陸才能爬取的情況,所以我們登陸完之后,會有一個cookie,這個cookie是存儲用戶信息的,也就是這個信息是讓服務器知道是誰進行這一次的訪問!比如說登陸學校的教務處進行爬取課表,因為課表每個人都可能是不同的,所以就需要登陸,讓服務器知道這是誰的課表信息,所以就需要在請求頭上加上 cookie 進行偽裝爬取。
// Create a cookie jar with default options (nil Options); abort if it cannot be built.
jar, err := cookiejar.New(nil)
if err != nil {
panic(err)
}
構造 POST 請求的時候,可以把要傳輸的數據進行封裝好,與 URL 一起構造
// Send a form-encoded POST request and read the whole response body.
// NOTE(review): client is a zero-value http.Client, so client.Jar is nil here;
// assign a cookie jar to client.Jar if cookies must be kept between requests.
var client http.Client

// Request body in application/x-www-form-urlencoded layout.
// NOTE(review): muserid and password are concatenated without URL-escaping;
// values containing '&', '=' or '%' would corrupt the form — confirm the inputs.
Info := "muser=" + muserid + "&" + "passwd=" + password
var data = strings.NewReader(Info)

req, err := http.NewRequest("POST", URL, data)
if err != nil {
	panic(err)
}

// Browser-like headers so the server treats the request as a normal form submit.
req.Header.Set("Connection", "keep-alive")
req.Header.Set("Pragma", "no-cache")
req.Header.Set("Cache-Control", "no-cache")
req.Header.Set("Upgrade-Insecure-Requests", "1")
req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36")
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9")

// Send the request. On failure resp is nil, so the error must be handled
// before the body is touched.
resp, err := client.Do(req)
if err != nil {
	panic(err)
}
// Always close the body so the underlying connection can be reused.
defer resp.Body.Close()

// Read the full page into memory.
bodyText, err := ioutil.ReadAll(resp.Body)
if err != nil {
	panic(err)
}
上文也提到了一個包,當發送完請求之后,cookie就會保存在 client.Jar 這個包中
// Render client.Jar with the %s verb so its contents can be printed and inspected as text.
myStr:=fmt.Sprintf("%s",client.Jar) // formats the jar value into a string
我們處理打印出 client.Jar 這個包的信息之后,選出響應的 cookie,然后放在請求頭上面即可!就能處理登陸情況下的cookie問題了。
// Attach the session cookie by hand: the Cookie header carries ASP.NET_SessionId=<cook>.
req.Header.Set("Cookie", "ASP.NET_SessionId="+cook)
至此,發送請求部分就完全完成了!
github.com/PuerkitoBio/goquery 提供了 .NewDocumentFromReader 方法進行網頁的解析。
// Parse the response body into a goquery document; check err before using doc.
doc, err := goquery.NewDocumentFromReader(resp.Body)
github.com/antchfx/htmlquery 提供了 .Parse 方法進行網頁的解析
// Parse the response body into an HTML node tree for XPath queries.
// The parse error is checked so a failed read cannot leave root unusable.
root, err := htmlquery.Parse(resp.Body)
if err != nil {
	panic(err)
}
// Regular expression that matches "id=" followed by one or more digits.
// MustCompile is used because the pattern is a constant: a bad pattern is a
// programming error and should fail loudly instead of yielding a nil *Regexp.
reId := regexp.MustCompile(`id=(\d+)`)
// Ask for at most one match in the page body.
allId := reId.FindAll(bodyText, 1)
for _, item := range allId {
	// NOTE(review): item is the whole match (including the "id=" prefix), not
	// just the captured digits — confirm this is what callers of id expect.
	id = string(item)
}
通過2.1,我們拿到上一步解析出來的 doc 之后,可以進行css選擇器語法,進行結點的選擇。
doc.Find("#main > div.right > div.detail_main_content").
Each(func(i int, s *goquery.Selection) {
Data.title = s.Find("p").Text()
Data.time = s.Find("#fbsj").Text()
Data.author = s.Find("#author").Text()
Data.count = Read_Count(Read_Id)
fmt.Println(Data.title, Data.time, Data.author,Data.count)
})
doc.Find("#news_content_display").Each(func(i int, s *goquery.Selection) {
Data.content = s.Find("p").Text()
fmt.Println(Data.content)
})
通過3.2,我們拿到上一步解析出來的 root 之后,可以進行Xpath語法的編寫,進行結點的選擇。
tr := htmlquery.Find(root, "http://*[@id='LB_kb']/table/tbody/tr/td") //使用Xpath進(jìn)行結(jié)點(diǎn)信息的獲取
for _, row := range tr { //len(tr)=13
classNames := htmlquery.Find(row, "./font")
classPosistions := htmlquery.Find(row,"./text()[4]")
classTeachers := htmlquery.Find(row,"./text()[5]")
if len(classNames)!=0 {
className = htmlquery.InnerText(classNames[0])
classPosistion = htmlquery.InnerText(classPosistions[0])
classTeacher = htmlquery.InnerText(classTeachers[0])
fmt.Println(className)
fmt.Println(classPosistion)
fmt.Println(classTeacher)
}
}
// Connection settings for the MySQL database used to store class data.
// NOTE(review): credentials are hard-coded; consider loading them from
// configuration or the environment before using this outside a demo.
const (
	usernameClass = "root"
	passwordClass = "root"
	ipClass       = "127.0.0.1"
	portClass     = "3306"
	dbnameClass   = "class"
)

// DB is the shared database handle set by InitDB. It stays nil if the
// handle could not be opened.
var DB *sql.DB

// InitDB opens the MySQL handle described by the constants above, configures
// the connection pool, and pings the server. Failures are reported on stdout;
// when sql.Open itself fails, DB is left untouched so later code does not
// dereference a nil handle inside this function.
func InitDB() {
	dsn := fmt.Sprintf("%s:%s@tcp(%s:%s)/%s?charset=utf8",
		usernameClass, passwordClass, ipClass, portClass, dbnameClass)

	handle, err := sql.Open("mysql", dsn)
	if err != nil {
		// e.g. the "mysql" driver was never registered (missing blank import).
		fmt.Println("opon database fail", err)
		return
	}
	DB = handle

	// NOTE(review): the argument is a time.Duration, so 10 means 10
	// nanoseconds; presumably seconds or minutes were intended — confirm
	// and use an explicit unit.
	DB.SetConnMaxLifetime(10)
	DB.SetMaxIdleConns(5)

	// sql.Open does not connect by itself; Ping verifies the server is reachable.
	if err := DB.Ping(); err != nil {
		fmt.Println("opon database fail", err)
		return
	}
	fmt.Println("connect success")
}
// Class is one row of scraped class information; InsertData writes its
// fields into the class_data table.
type Class struct {
// classData is stored in the `class` column.
classData string
// teacherName is stored in the `teacher` column.
teacherName string
// position is stored in the `position` column.
position string
}
// InsertData writes one Class row into the class_data table inside a
// transaction. It returns true only when the row was inserted AND the
// transaction committed; on any failure it prints the reason, rolls the
// transaction back, and returns false.
func InsertData(Data Class) bool {
	tx, err := DB.Begin()
	if err != nil {
		fmt.Println("tx fail")
		return false
	}
	// Guarantee the transaction is released on every early return. After a
	// successful Commit, Rollback only returns sql.ErrTxDone, which is
	// intentionally ignored.
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.Prepare("INSERT INTO class_data (`class`,`teacher`,`position`) VALUES (?, ?, ?)")
	if err != nil {
		fmt.Println("Prepare fail", err)
		return false
	}
	// Release the prepared statement once the insert has run.
	defer stmt.Close()

	// Run the insert with bound parameters.
	if _, err = stmt.Exec(Data.classData, Data.teacherName, Data.position); err != nil {
		fmt.Println("Exec fail", err)
		return false
	}

	// Commit the transaction; a failed commit means the row was not stored.
	if err = tx.Commit(); err != nil {
		fmt.Println("Commit fail", err)
		return false
	}
	return true
}
// NewD is the gorm model for one scraped news article; every column is
// declared NOT NULL through its gorm tag.
type NewD struct {
// Embedded gorm.Model base fields.
gorm.Model
// Title is stored as varchar(255).
Title string `gorm:"type:varchar(255);not null;"`
// Time is stored as varchar(256).
Time string `gorm:"type:varchar(256);not null;"`
// Author is stored as varchar(256).
Author string `gorm:"type:varchar(256);not null;"`
// Count is stored as varchar(256).
Count string `gorm:"type:varchar(256);not null;"`
// Content is stored as longtext.
Content string `gorm:"type:longtext;not null;"`
}
// db is the package-wide gorm handle populated by Init.
var db *gorm.DB

// Init connects gorm to MySQL using the *_New settings, panics if the
// connection cannot be opened, migrates the NewD table, and sizes the
// underlying connection pool (10 idle, 100 open).
func Init() {
	dsn := userName_New + ":" + password_New +
		"@tcp(" + ip_New + ":" + port_New + ")/" +
		dbName_New + "?charset=utf8"

	var err error
	if db, err = gorm.Open("mysql", dsn); err != nil {
		panic(err)
	}
	fmt.Println("SUCCESS")

	// Create or update the table for NewD; the result is deliberately discarded.
	_ = db.AutoMigrate(&NewD{})

	pool := db.DB()
	pool.SetMaxIdleConns(10)
	pool.SetMaxOpenConns(100)
}
// Copy the scraped fields from Data into a NewD record.
NewA := NewD{
Title: Data.title,
Time: Data.time,
Author: Data.author,
Count: Data.count,
Content: Data.content,
}
// Insert the record through gorm and keep the resulting error for the caller to check.
err = db.Create(&NewA).Error // create one row in the database
期待你的關注~
聯系客服