Using the requests module

  • Scraping Baidu Translate (without Fiddler)
import requests
import json

url = 'https://fanyi.baidu.com/sug'
# If the text to translate is too long, the request errors out and nothing is translated
form = {
    'kw': '黄色',
}

response = requests.post(url, data=form)

res_dict = json.loads(response.text)
print(res_dict)
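
    The sug endpoint answers with a small JSON object of suggestions. Continuing from res_dict above, a minimal sketch for pulling out the first suggestion (the 'data', 'k' and 'v' keys are assumptions based on a typical response and may change):

# Sketch: print the first suggestion pair, guarding against an empty result.
suggestions = res_dict.get('data', [])
if suggestions:
    # 'k' is the queried word, 'v' the suggested translation (assumed field names)
    print(suggestions[0]['k'], '->', suggestions[0]['v'])
else:
    print('no suggestion returned:', res_dict)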
  • Scraping Youdao Translate

    In the Network panel, find the request that carries the translation data, click its Initiator to locate the JS file, format that JS with an online beautifier and save it next to the Python file, then open it and find the signing (encryption) method, as follows

import requests
import json
import time
import random
import hashlib

def translate(key):
    # 1. ts is the current timestamp in milliseconds, e.g. 1547090110687
    ts = int(time.time() * 1000)
    salt = str(ts + random.randint(0, 10))

    # In the JS it is: "fanyideskweb" + e + i + "n%A-rKaT5fb[Gy?;N5@Tj"
    sign = "fanyideskweb" + key + salt + "n%A-rKaT5fb[Gy?;N5@Tj"
    # sign then has to be hashed with md5
    lmd5 = hashlib.md5()
    lmd5.update(sign.encode('utf-8'))
    sign_md5 = lmd5.hexdigest()

    form = {
        'i': key,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': salt,
        'sign': sign_md5,
        'ts': ts,
        'bv': '4b1009b506fa4405f21e207abc4459fd',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_REALTIME',
        'typoResult': 'false',
    }
    headers = {
        'Referer': 'http://fanyi.youdao.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'Cookie': 'OUTFOX_SEARCH_USER_ID=717705358@10.169.0.83; JSESSIONID=aaaf_u-JXSbTm4g1X5m6w; OUTFOX_SEARCH_USER_ID_NCOO=1759410959.7206407; ___rl__test__cookies=1574324686939',
    }
    url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
    response = requests.post(url, data=form, headers=headers)
    res_dict = json.loads(response.text)
    print(res_dict)
    print(res_dict['translateResult'][0][0]['tgt'])

# main entry point
if __name__ == '__main__':
    while True:
        # key is the user's input; the string passed to input() is the console prompt
        key = input('饶氏翻译, 请说出你想翻译的内容:\n')
        if key == 'q':
            break
        translate(key)
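
    When the sign, salt, or cookies are rejected, the server usually returns an error object without translateResult, and the last print in translate() raises a KeyError. A minimal guard (a sketch; the exact error format returned by Youdao is not guaranteed) that can replace the two print calls at the end of translate():

    # Sketch: only index into translateResult when it is actually present.
    if 'translateResult' in res_dict:
        print(res_dict['translateResult'][0][0]['tgt'])
    else:
        # the server typically answers with an error-code object when the sign/cookies are rejected
        print('translation failed, raw response:', res_dict)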
  • Scraping a Weibo blogger's posts and saving them to a database

    Go to the mobile version of Weibo and search for the blogger; in the XHR requests you will see that the Ajax Request URL (under General) follows a simple pattern: the page number is just appended to the address as &page=
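
    A quick way to confirm the pattern is to generate a few paged URLs and open them directly; a minimal sketch (base_url is a placeholder for the Request URL copied from DevTools):

# Sketch: build the first few paged URLs from the copied Request URL.
base_url = 'https://m.weibo.cn/api/container/getIndex?...&page={}'  # placeholder, copy the real one from DevTools
for n in range(1, 4):
    print(base_url.format(n))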

    Add the project dependencies, namely w3lib and pymysql (e.g. pip install w3lib pymysql)

    Create the weibo_test table, then create mysqlhelper.py and enter the code below. If you get the error "'ClassName' object has no attribute 'cursor'", the cursor was never created, which means the connection itself already failed (see the defensive-connection sketch after the helper code).

CREATE TABLE weibo_test(id INT PRIMARY KEY AUTO_INCREMENT, comments_count INT,
    attitudes_count INT, created_at VARCHAR(40), `text` TEXT) DEFAULT CHARSET=utf8mb4;
import pymysql

class MysqlHelper(object):
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', port=3306, passwd='123456', user='root', db='pymysqls', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def execute_insert_sql(self, sql, data):
        self.cursor.execute(sql, data)
        self.conn.commit()

    def __del__(self):
        self.cursor.close()
        self.conn.close()

if __name__ == '__main__':
    helper = MysqlHelper()
    insert_sql = 'INSERT INTO weibo_test(comments_count,attitudes_count,created_at,`text`) VALUES(%s, %s, %s, %s)'
    data = (2, 3, '1990-05-14', '今天有长长了哦')
    helper.execute_insert_sql(insert_sql, data)
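
    The "object has no attribute 'cursor'" error mentioned above comes from pymysql.connect raising inside __init__, so self.cursor is never assigned and __del__ fails afterwards as well. A more defensive constructor (a sketch reusing the same connection parameters as above) surfaces the real cause:

# Sketch: surface the real connection error instead of the later AttributeError.
import pymysql

class SafeMysqlHelper(object):
    def __init__(self):
        try:
            self.conn = pymysql.connect(host='localhost', port=3306, user='root',
                                        passwd='123456', db='pymysqls', charset='utf8mb4')
        except pymysql.err.OperationalError as e:
            # a wrong password/host/database shows up here, not as a missing cursor
            raise RuntimeError('MySQL connection failed: {}'.format(e))
        self.cursor = self.conn.cursor()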
import requests
import json
# remove_tags strips HTML tags from a string
from w3lib.html import remove_tags
from mysqlhelper import MysqlHelper

helper = MysqlHelper()
# Fields to extract and store in the database:
# source
# comments_count
# attitudes_count
# created_at
# text

headers = {
    'Accept': 'application/json, text/plain, */*',
    'MWeibo-Pwa': '1',
    'Referer': 'https://m.weibo.cn/u/1312412824?uid=1312412824&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E6%9E%97%E5%BF%97%E7%8E%B2',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Mobile Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'X-XSRF-TOKEN': '466897',
}

def get_one_page_info(url):
    response = requests.get(url, headers=headers)

    # json.loads turns the JSON string into a dict
    res_dict = json.loads(response.text)
    cards_list = res_dict['data']['cards']

    # the actual data-extraction step (step 4 of the crawl)
    for card in cards_list:
        if 'mblog' in card:
            #source_a = card['mblog']['source']
            comments_count = card['mblog']['comments_count']
            attitudes_count = card['mblog']['attitudes_count']
            created_at = card['mblog']['created_at']
            text = remove_tags(card['mblog']['text'])

            #print('------' * 20)
            #print(created_at, source_a, comments_count, attitudes_count, text)
            # the backtick around `text` (the key above Tab) escapes the reserved column name
            insert_sql = 'INSERT INTO weibo_test(comments_count,attitudes_count,created_at,`text`) VALUES(%s, %s, %s, %s)'
            data = (comments_count, attitudes_count, created_at, text)
            helper.execute_insert_sql(insert_sql, data)

if __name__ == '__main__':
    for i in range(100):
        print(i)
        url = 'https://m.weibo.cn/api/container/getIndex?uid=1211441627&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E5%85%AD%E5%B0%8F%E9%BE%84%E7%AB%A5&type=uid&value=1211441627&containerid=1076031211441627&page={}'.format(i+1)
        get_one_page_info(url)

If you crawl too much data you are likely to get blocked and the crawl is not allowed to continue; the program then errors out as follows
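
One common mitigation (a sketch; the delay range and the idea of stopping at the first failure are assumptions, not anything Weibo documents) is to slow the page loop down and bail out cleanly when a request starts failing:

# Sketch: throttle the page loop from the script above and stop on the first failure.
import time
import random

for i in range(100):
    url = base_url.format(i + 1)  # base_url: the same paged Request URL used above
    try:
        get_one_page_info(url)
    except Exception as e:
        print('stopped at page', i + 1, ':', e)
        break
    # pause 1-3 seconds between pages; the exact delay is a guess, not a documented limit
    time.sleep(random.uniform(1, 3))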