XPath

  • XPath is a better way than regular expressions to locate information in HTML.

To use XPath, import the lxml package.
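As a quick, self-contained sketch (parsing an inline HTML string rather than a live page), the usual flow is to build an element tree with etree.HTML() and then query it with .xpath():

from lxml import etree

# A minimal example on an inline HTML snippet
html = '<html><head><title>Demo</title></head><body><div class="wrap">hi</div></body></html>'
root = etree.HTML(html)

print(root.xpath('/html/head/title/text()'))      # ['Demo']
print(root.xpath('//div[@class="wrap"]/text()'))  # ['hi']

The longer example below applies the same calls to a live page: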

import requests
# lxml provides the XPath support
from lxml import etree

url = 'http://lol.178.com/'
response = requests.get(url)

with open('178.html', 'wb') as f:
    f.write(response.content)

# What we want to locate is /html/head/title

html_ele = etree.HTML(response.text)
print(html_ele)
head_ele = html_ele.xpath('/html/head')
print(head_ele)
print(head_ele[0])
title_ele = head_ele[0].xpath('./title')
print(title_ele[0])

# We have the tag; now get the text inside it
print(title_ele[0].text)
# Another way is to fetch the inner string directly with XPath: `/html/head/title/text()`
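# For example, the same title text in a single expression:
print(html_ele.xpath('/html/head/title/text()')[0])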

# In XPath a specific tag can be picked by index, starting from 1
meta_ele = html_ele.xpath('/html/head/meta[1]')
print(meta_ele[0])

# Use @ to read an attribute of a tag
charset_str = html_ele.xpath('/html/head/meta[1]/@charset')
print(charset_str[0])

# /html/head/link[1]/@href
href_str = html_ele.xpath('/html/head/link[1]/@href')
print(href_str[0])

# Predicates: /html/body/div[@class="wrap"]
div_ele = html_ele.xpath('/html/body/div[@class="wrap"]/@class')
print(div_ele)

# /html/body/div[@class="wrap"]/div[1]/div[@class="head"]
div_ele = html_ele.xpath('/html/body/div[@class="wrap"]/div[1]/div[@class="head"]/@class')
print(div_ele)

# /html/body/div[@class="wrap"]/div/div[1]/div[1]/div/a
div_ele = html_ele.xpath('/html/body/div[@class="wrap"]/div/div[1]/div[1]/div/a/text()')
print(div_ele)

# Search the whole document with //, e.g. //div[@class="head"]
div_ele = html_ele.xpath('//div[@class="Oldversion"]/a/@href')
print(div_ele)

# Get every meta tag that has an itemprop attribute: /html/head/meta[@itemprop]
div_ele = html_ele.xpath('/html/head/meta[@itemprop]/@itemprop')
print(div_ele)

# Get every meta tag that does NOT have an itemprop attribute: use the not() function
div_ele = html_ele.xpath('/html/head/meta[not(@itemprop)]')
print(div_ele)

# Find the ul from before, XPath: //ul[@class="ui-nav-main"]//text()
div_ele = html_ele.xpath('//ul[@class="ui-nav-main"]//text()')
print(div_ele)

strs = [item for item in div_ele if item.strip()]
print(strs)

# Two more XPath functions useful for fuzzy matching:
# contains()
# starts-with()
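# A quick sketch of both (the attribute values below are only illustrative and
# may not match anything on this particular page):
# contains(): any div whose class attribute contains the substring "nav"
print(html_ele.xpath('//div[contains(@class, "nav")]'))
# starts-with(): any meta tag whose name attribute starts with "key"
print(html_ele.xpath('//meta[starts-with(@name, "key")]'))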

Scraping NetEase football data

import requests
from lxml import etree
from Day2.mysqlhelper import MysqlHelper

helper = MysqlHelper()

# Our company builds football-related apps, so the data of interest is:
# 1. Football news
# 2. Football fixtures
# 3. Football tips (paid)

# This function downloads and parses one detail page
def get_detailed_page_info(url, date):
    print('Processing: ', url)
    # Crawler step 3: download the page
    response = requests.get(url)
    # with open('163.html', 'wb') as f:
    #     f.write(response.content)
    # Crawler step 4: parse it
    html_ele = etree.HTML(response.text)
    event_name_list = html_ele.xpath('//div[@class="leftList2"]/div[@class="c2"]/a/text()')
    loops_count = 0
    for event_name in event_name_list:
        print(event_name)
        loops_count += 1
        # XPath for the table rows: //div[@class="leftList4"]/table/tbody/tr
        tr_ele_list = html_ele.xpath('//div[@class="leftList4"][{}]/table/tr'.format(loops_count))
        for i in range(1, len(tr_ele_list), 3):
            round = tr_ele_list[i].xpath('./td[1]/text()')[0].strip()
            time = tr_ele_list[i].xpath('./td[2]/text()')[0].strip()
            status = tr_ele_list[i].xpath('./td[3]/text()')[0].strip()
            host_team = tr_ele_list[i].xpath('./td[4]//a/text()')[0].strip()
            if tr_ele_list[i].xpath('./td[5]/text()')[0].strip():
                score = tr_ele_list[i].xpath('./td[5]/text()')[0].strip()
            else:
                score = tr_ele_list[i].xpath('./td[5]//a/text()')[0].strip().replace(' ', '').replace('\n', '')
            guest_team = tr_ele_list[i].xpath('./td[6]//a/text()')[0].strip()
            print(round, time, status, host_team, score, guest_team)
            print('-' * 50)
            # CREATE TABLE football_match (id int primary key auto_increment,
            # `round` int,
            # `time` varchar(10),
            # `status` varchar(10),
            # host_team varchar(50),
            # score varchar(30),
            # guest_team varchar(50),
            # `date` date,
            # event_name varchar(20)) DEFAULT CHARSET=utf8mb4;
            insert_sql = 'INSERT INTO football_match(`round`, `time`, status, host_team, score, guest_team, `date`, event_name) ' \
                         'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
            data = (round, time, status, host_team, score, guest_team, date, event_name)
            helper.execute_insert_sql(insert_sql, data)

# The initial URL is http://goal.sports.163.com/schedule/20170604.html
# The initial page links to seven date URLs; visit each of them and store the data in MySQL
# The next page of the initial URL again yields seven URLs
# And so on

def get_all_football_match_info(start_url):
    # Crawler step 3: download the page
    response = requests.get(start_url)
    # with open('163.html', 'wb') as f:
    #     f.write(response.content)
    # Crawler step 4: parse it
    html_ele = etree.HTML(response.text)
    href_list = html_ele.xpath('//div[@class="t2L"]/div[@class="c2"]/ul/li/a/@href')
    # The list above is expected to hold seven entries
    # print(href_list)

    for href in href_list:
        date = href.split('/')[-1].split('.')[0]
        url = 'http://goal.sports.163.com' + href
        get_detailed_page_info(url, date)

    # Build the absolute next-page URL first so the comparison against start_url is meaningful
    next_page_href = html_ele.xpath('//div[@class="t2L"]/div[@class="c2"]/span[2]/a/@href')[0]
    next_page_href = 'http://goal.sports.163.com' + next_page_href

    if next_page_href != start_url:
        get_all_football_match_info(next_page_href)

if __name__ == '__main__':
    # url = 'http://goal.sports.163.com/schedule/20170910.html'
    # get_detailed_page_info(url, '20160911')
    start_url = 'http://goal.sports.163.com/schedule/20170604.html'
    get_all_football_match_info(start_url)
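The MysqlHelper imported from Day2.mysqlhelper is not shown in this section. Below is a minimal sketch of what such a helper might look like, assuming pymysql; the connection parameters and the execute_insert_sql signature are inferred from how the helper is called above and should be adjusted to your own environment.

import pymysql

class MysqlHelper(object):
    def __init__(self, host='localhost', port=3306, user='root',
                 password='', db='spider', charset='utf8mb4'):
        # Placeholder connection settings; change them to match your MySQL setup
        self.conn = pymysql.connect(host=host, port=port, user=user,
                                    password=password, db=db, charset=charset)

    def execute_insert_sql(self, sql, data):
        # Run one parameterized INSERT and commit it
        with self.conn.cursor() as cursor:
            cursor.execute(sql, data)
        self.conn.commit()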

Scraping images from mzitu

# The first kind of URL: the list pages
# https://www.mzitu.com/
# https://www.mzitu.com/page/2/
# ....
# https://www.mzitu.com/page/209/


# The second kind: a gallery page
# https://www.mzitu.com/166998
# Find out how many pages the gallery has

# The third kind: one picture page inside a gallery
# https://www.mzitu.com/166998/1
# Find the image URL and save it locally
#

# The fourth kind: the image itself
# https://i.meizitu.net/2019/01/03a02.jpg
# Download the image
import requests
import os
from lxml import etree

def get_all_list_url(url):
    print('Processing top-level list URL: ', url)
    print('--' * 30)
    response = requests.get(url)

    html_ele = etree.HTML(response.text)

    # //ul[@id="pins"]/li/a/@href
    href_list = html_ele.xpath('//ul[@id="pins"]/li/a/@href')
    for href in href_list:
        get_detailed_page_url(href)
        break

    next_page_ele = html_ele.xpath('//a[@class="next page-numbers"]/@href')
    if next_page_ele:
        get_all_list_url(next_page_ele[0])


def get_detailed_page_url(url):
    response = requests.get(url)
    # //div[@class="pagenavi"]/a[last()-1]/span/text()
    html_ele = etree.HTML(response.text)
    page_num = html_ele.xpath('//div[@class="pagenavi"]/a[last()-1]/span/text()')[0]
    for i in range(int(page_num)):
        detailed_url = url + '/' + str(i + 1)
        get_image_url(detailed_url)
        break

def get_image_url(url):
    response = requests.get(url)

    html_ele = etree.HTML(response.text)
    # //div[@class="main-image"]/p/a/img/@src
    src = html_ele.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
    download_image(src, url)

def download_image(url, referer):
    print('Downloading... ' + url)

    downloaddir = '高老师的小秘密'
    if not os.path.exists(downloaddir):
        os.mkdir(downloaddir)

    # The image host tends to reject hotlinked requests, so send the page that
    # contains the image as the Referer along with a browser User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
        'Referer': referer,
    }

    response = requests.get(url, headers=headers)
    filename = downloaddir + '/' + url.split('/')[-1]
    with open(filename, 'wb') as f:
        f.write(response.content)

if __name__ == '__main__':
    # Test fetching the URLs of every list page
    url = 'https://www.mzitu.com/'
    get_all_list_url(url)

    # Test fetching the URLs of every detail page
    # url = 'https://www.mzitu.com/166998'
    # get_detailed_page_url(url)

    # Test fetching the image URL
    # url = 'https://www.mzitu.com/166998/2'
    # get_image_url(url)

    # Test downloading an image
    # referer = 'https://www.mzitu.com/166998/2'
    # url = 'https://i.meizitu.net/2019/01/03a02.jpg'
    # download_image(url, referer)