I had bookmarked a lot of content on a few websites, but the collection grew too large and the sites offer no search. To be able to search my favorites, I wrote this small crawler to fetch them and save them into MySQL.

package main

import (
	"bytes"
	"database/sql"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"strconv"
	"strings"

	_ "github.com/go-sql-driver/mysql"
	"github.com/opesun/goquery"
)

func main() {
	db, err := sql.Open("mysql", "user:pwd@tcp(localhost:3306)/dbname?charset=utf8")
	checkErr(err)
	defer db.Close() // note: os.Exit in httpGet bypasses deferred calls

	cookie := ""
	for i := 1; ; i++ {
		content := httpGet(i, cookie)
		parseHtml(content, db)
	}
}

// httpGet fetches one page of the favorites list and returns its body.
// It exits the program once the page reports there are no more favorites.
func httpGet(i int, cookie string) io.ReadCloser {
	url := "http://xxxxx.io/favorites?page=" + strconv.Itoa(i)
	fmt.Println(url)

	client := &http.Client{}
	req, err := http.NewRequest("GET", url, nil)
	checkErr(err)

	// Mimic a normal browser request; the cookie carries the login session.
	req.Header.Set("Pragma", "no-cache")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.8")
	req.Header.Set("Upgrade-Insecure-Requests", "1")
	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
	req.Header.Set("Connection", "keep-alive")
	req.Header.Set("Cookie", cookie)

	resp, err := client.Do(req)
	checkErr(err)
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	checkErr(err)

	// "您还没有任何收藏" ("you have no favorites yet") marks the end of the list.
	if strings.Contains(string(body), "您还没有任何收藏") {
		os.Exit(0)
	}

	// ioutil.ReadAll drains resp.Body, so hand the parser a fresh reader
	// over the bytes that were just read.
	return ioutil.NopCloser(bytes.NewBuffer(body))
}

// parseHtml extracts the title and link of every favorite on the page
// and saves them to the database.
func parseHtml(content io.ReadCloser, db *sql.DB) {
	p, err := goquery.Parse(content)
	checkErr(err)

	favorites := p.Find(".post")
	for i := 0; i < favorites.Length(); i++ {
		d := favorites.Eq(i)
		title := d.Find(".title")
		titleText := strings.TrimSpace(title.Text())
		linkText := title.Find("a").Attr("href")
		fmt.Println(titleText)
		fmt.Println(linkText)
		save(titleText, linkText, db)
	}
}

// save inserts a favorite unless an identical record already exists.
func save(title, link string, db *sql.DB) {
	if query(db, title, link) {
		return // record already exists, nothing to do
	}
	stmt, err := db.Prepare("INSERT INTO toutiao SET title=?, link=?")
	checkErr(err)
	defer stmt.Close()

	res, err := stmt.Exec(title, link)
	checkErr(err)
	id, err := res.LastInsertId()
	checkErr(err)
	fmt.Println(id)
}

// query reports whether a record with the same title and link is already stored.
func query(db *sql.DB, title, link string) bool {
	var id int
	err := db.QueryRow("SELECT id FROM toutiao WHERE title = ? AND link = ?", title, link).Scan(&id)
	return err == nil && id != 0
}

func checkErr(err error) {
	if err != nil {
		panic(err)
	}
}
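
One detail worth flagging: the listing stops by calling os.Exit(0) inside httpGet when the end-of-list marker appears, which skips deferred cleanup such as db.Close(). Below is a minimal sketch of an alternative, not part of the original program: fetchPage is a hypothetical replacement for httpGet that reports when the last page has been reached, so main can break out of the loop and let deferred calls run. It reuses parseHtml and checkErr from the listing above and trims the browser headers for brevity.

// fetchPage downloads one favorites page and reports whether the
// end-of-list marker was found, instead of exiting the process.
func fetchPage(i int, cookie string) (io.ReadCloser, bool, error) {
	url := "http://xxxxx.io/favorites?page=" + strconv.Itoa(i)
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, false, err
	}
	// Set the same browser headers as httpGet above; only the cookie is shown here.
	req.Header.Set("Cookie", cookie)
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, false, err
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, false, err
	}
	done := strings.Contains(string(body), "您还没有任何收藏")
	return ioutil.NopCloser(bytes.NewBuffer(body)), done, nil
}

func main() {
	db, err := sql.Open("mysql", "user:pwd@tcp(localhost:3306)/dbname?charset=utf8")
	checkErr(err)
	defer db.Close() // now runs when main returns

	cookie := ""
	for i := 1; ; i++ {
		content, done, err := fetchPage(i, cookie)
		checkErr(err)
		if done {
			break
		}
		parseHtml(content, db)
	}
}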

The database table:

CREATE TABLE `toutiao` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(150) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  `link` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
  PRIMARY KEY (`id`),
  INDEX `idx_title` (`title`) USING BTREE
)
ENGINE=InnoDB
DEFAULT CHARACTER SET=utf8 COLLATE=utf8_general_ci
AUTO_INCREMENT=1
ROW_FORMAT=COMPACT;
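
With the table in place, the search that motivated the whole exercise can be a simple fuzzy match on the title column. The following is a minimal sketch, not part of the crawler above: it assumes the same placeholder DSN and takes the keyword as a command-line argument.

package main

import (
	"database/sql"
	"fmt"
	"os"

	_ "github.com/go-sql-driver/mysql"
)

// search prints every saved favorite whose title contains the keyword.
func search(db *sql.DB, keyword string) error {
	rows, err := db.Query("SELECT title, link FROM toutiao WHERE title LIKE ?", "%"+keyword+"%")
	if err != nil {
		return err
	}
	defer rows.Close()
	for rows.Next() {
		var title, link string
		if err := rows.Scan(&title, &link); err != nil {
			return err
		}
		fmt.Println(title, link)
	}
	return rows.Err()
}

func main() {
	if len(os.Args) < 2 {
		fmt.Println("usage: search <keyword>")
		return
	}
	db, err := sql.Open("mysql", "user:pwd@tcp(localhost:3306)/dbname?charset=utf8")
	if err != nil {
		panic(err)
	}
	defer db.Close()
	if err := search(db, os.Args[1]); err != nil {
		panic(err)
	}
}

Note that a LIKE pattern with a leading wildcard cannot use the idx_title BTREE index, so for a very large collection a FULLTEXT index on title would be a better fit; for a few thousand favorites a plain LIKE scan is fast enough.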