selenium

  • Selenium's role in web scraping

    It solves the problem of JavaScript dynamic loading: with requests alone you can only fetch a page's static resources and its AJAX responses.

    Selenium's drawback: it is slow.

    Selenium's main use case: logins, especially ones that involve verification.

  • Installing Selenium

Install it into your local Python environment with pip install selenium.

Check your browser's version number and download the matching version of the driver executable.

Put the downloaded driver's location on your PATH so that you don't have to pass its path explicitly at startup.

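A quick way to confirm the environment is wired up is to launch a browser and print its version; a minimal sketch, assuming Chrome and a matching chromedriver are already on PATH:

from selenium import webdriver

# If chromedriver is on PATH, this opens a Chrome window
driver = webdriver.Chrome()
caps = driver.capabilities
# W3C-mode drivers report 'browserVersion'; some older setups use 'version'
print(caps.get('browserVersion') or caps.get('version'))
driver.quit()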

from selenium import webdriver
import time

# If the driver is not on PATH, pass its location explicitly:
# driver = webdriver.Chrome(executable_path=r'D:\SoftWare\chrom\chromedriver.exe')

# 1. Launch the browser; with the driver's location on PATH, no path is needed
driver = webdriver.Chrome()
# 2. Visit a web page
driver.get('http://www.baidu.com')
# 3. Locate elements and type into them
driver.find_element_by_id('kw').send_keys('大长腿是什么体验')
# 4. Submit the form
driver.find_element_by_id('su').click()

# Wait for the page to finish loading, otherwise the HTML written below is incomplete
time.sleep(3)
# 5. Grab the page source
with open('baidudachangtui.html', 'wb') as f:
    f.write(driver.page_source.encode('utf-8'))
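A fixed time.sleep either wastes time or is too short. An explicit wait polls until a condition holds and fails loudly on timeout. A sketch with WebDriverWait, assuming 'content_left' is the id of Baidu's results container:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('http://www.baidu.com')
driver.find_element_by_id('kw').send_keys('selenium')
driver.find_element_by_id('su').click()

# Poll for up to 10 seconds until the results container exists,
# instead of sleeping a fixed 3 seconds
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'content_left'))
)
with open('results.html', 'wb') as f:
    f.write(driver.page_source.encode('utf-8'))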

Scraping Youdao Translate (Selenium)

from selenium import webdriver
import time

driver = webdriver.Chrome()

driver.get('http://fanyi.youdao.com/')

# Type the source text into the input box
driver.find_element_by_id('inputOriginal').send_keys('你真帅')

# Give the page a moment to render the translation
time.sleep(1)
translated = driver.find_element_by_id('transTarget').text

print(translated)
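Each webdriver.Chrome() call leaves a browser process behind, so it is worth shutting it down even when the script fails midway. A minimal cleanup pattern:

driver = webdriver.Chrome()
try:
    driver.get('http://fanyi.youdao.com/')
    # ... interact with the page ...
finally:
    driver.quit()  # always close the browser, even on errors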

Scraping Sina Weibo (Selenium)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import requests

# Selenium headless mode
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("window-size=1980,1080")

driver = webdriver.Chrome(chrome_options=chrome_options)

driver.get('https://weibo.com/')
time.sleep(10)

# Fill in your own account; these values are placeholders
driver.find_element_by_id('loginname').send_keys('your_username')
driver.find_element_by_name('password').send_keys('your_password')

driver.find_element_by_class_name('W_btn_a').click()
time.sleep(20)

# The prompt text stays in Chinese because it must match the page source
if '请输入验证码' in driver.page_source:
    img_ele = driver.find_element_by_xpath('//img[@node-type="verifycode_image"]')
    image_url = img_ele.get_attribute('src')

    response = requests.get(image_url)
    with open('verify_image.jpg', 'wb') as f:
        f.write(response.content)
    # import base64
    # encodestr = base64.b64encode(response.content).decode('utf-8')
    # print(encodestr)
    #
    # # Recognize the captcha with a paid OCR API
    # url = 'http://wordimg.market.alicloudapi.com/word'
    # form = {
    #     'image': 'data:image/jpeg;base64,' + encodestr,
    #     'language': 'ENG',
    # }
    # headers = {
    #     'Authorization': 'APPCODE <your appcode>'
    # }
    # response = requests.post(url, data=form, headers=headers)
    # print(response.text)
    # verify_image = response.json()['result'][0]['words']
    # print(verify_image)

    verify_image = input('Enter the captcha:\n')
    driver.find_element_by_name('verifycode').send_keys(verify_image)
    driver.find_element_by_class_name('W_btn_a').click()
    time.sleep(20)

# All we need are the cookies; avoid doing heavy browsing through Selenium, it is slow

cookie_list = driver.get_cookies()
cookie_item_list = []
for item in cookie_list:
    name = item['name']
    value = item['value']
    cookie_item = name + '=' + value
    cookie_item_list.append(cookie_item)
cookie_str = '; '.join(cookie_item_list)

print(cookie_str)

# Use the cookies to request a page that requires login
url = 'https://account.weibo.com/set/index?topnav=1&wvr=6'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    #'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    #'Connection': 'keep-alive',
    'Cookie': cookie_str,
    'Host': 'account.weibo.com',
    'Referer': 'https://weibo.com/u/5995110384/home',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
response = requests.get(url, headers=headers)
with open('basic_modify.html', 'wb') as f:
    f.write(response.content)
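Joining the cookies into a header string works, but requests can also carry them in a Session, which keeps them out of the headers dict entirely. A sketch of the same hand-off, assuming driver is still logged in:

import requests

session = requests.Session()
session.headers['User-Agent'] = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/71.0.3578.98 Safari/537.36')

# Copy Selenium's cookies into the requests session
for item in driver.get_cookies():
    session.cookies.set(item['name'], item['value'],
                        domain=item.get('domain'), path=item.get('path', '/'))

# Every request through the session now carries the logged-in cookies
response = session.get('https://account.weibo.com/set/index?topnav=1&wvr=6')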

Scraping Xueqiu (requests, BeautifulSoup)

BeautifulSoup works with CSS selectors to pick out specific tags.
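For example, given a tiny HTML snippet, select() takes a CSS selector and returns all matching tags (this standalone snippet just illustrates the API):

from bs4 import BeautifulSoup

html = '<div id="head"><a class="nav" href="/a">A</a><a class="nav" href="/b">B</a></div>'
soup = BeautifulSoup(html, 'lxml')

# '#' matches an id, '.' matches a class, '>' means direct child
for a in soup.select('div#head > a.nav'):
    print(a['href'], a.text)  # prints: /a A, then /b B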

import requests
from bs4 import BeautifulSoup

url = 'https://xueqiu.com/ask/square'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
}
# Step 3: fetch the page
response = requests.get(url, headers=headers)
# Step 4: parse it; the options are
# regular expressions - re
# XPATH - lxml
#   etree.HTML(response.text)
# CSS SELECTOR - BeautifulSoup

soup = BeautifulSoup(response.text, 'lxml')

# Locating tags
# XPATH: //title
# title_tag = soup.select('title')
# print(title_tag)
# print(type(title_tag[0]))
# # Get the text inside the tag
# print(title_tag[0].text)

# XPATH : //head/meta
# CSS SELECTOR: head > meta
# meta_tag_list = soup.select('head > meta')
# print(meta_tag_list)

# XPATH : //head/meta[@content]/@content
# CSS SELECTOR: head > meta[content]
# meta_tag_list = soup.select('head > meta[content]')
# print(meta_tag_list)
# for meta_tag in meta_tag_list:
#     # Read a tag's attribute value dict-style
#     print(meta_tag['content'])

# XPATH: //div[@class="square-main"]
# CSS SELECTOR: div.square-main
# div_tag_list = soup.select('div.square-main')
# print(div_tag_list)

# XPATH: //div[@id="head"]
# CSS SELECTOR: div#head
div_tag_list = soup.select('div#head')
print(div_tag_list)
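The XPath expressions in the comments above can be run directly through lxml, which the comments already name as the XPath option. A sketch of two of those queries done that way, reusing the same response:

from lxml import etree

html = etree.HTML(response.text)

# XPath equivalent of soup.select('div#head')
div_list = html.xpath('//div[@id="head"]')
print(div_list)

# Attribute extraction, matching the head > meta[content] example
content_list = html.xpath('//head/meta[@content]/@content')
print(content_list)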