Scraping Xueqiu

import requests
import json

headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-encoding': 'gzip, deflate, br',
    'Accept-language': 'zh-CN,zh;q=0.9',
    'Cookie': 'aliyungf_tc=AQAAAOeXuT4bbQkAK43ItiYQYpmcKxei; acw_tc=2760824b15759005426311706e8c9a8474b7161bba769e895108bc4a5d7384; xq_a_token=c9d3b00a3bd89b210c0024ce7a2e049f437d4df3; xq_r_token=8712d4cae3deaa2f3a3d130127db7a20adc86fb2; u=981575900545496; Hm_lvt_1db88642e346389874251b5a1eded6e3=1575900546; device_id=24700f9f1986800ab4fcc880530dd0ed; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1575900583',
    'Referer': 'https://xueqiu.com/',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36',
}

url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=20358817&count=15&category=-1'

def get_info():
    global url
    # 1. Fetch the JSON that the URL above returns
    response = requests.get(url, headers=headers)

    # 2./3. Take the response body and parse the JSON into a dict
    res_dict = json.loads(response.text)

    # 4. Read the max_id to use for the next page
    next_max_id = res_dict['next_max_id']

    # 5. Rebuild the URL around the new max_id
    url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id={}&count=15&category=-1'.format(next_max_id)

    # 6. Walk the card list inside the dict
    cards_list = res_dict['list']
    for card in cards_list:
        data = card['data']
        # data is itself a JSON string, so it has to be parsed again
        data_dict = json.loads(data)
        title = data_dict["title"]
        print(title)
        description = data_dict["description"]
        print(description)
        user = data_dict['user']
        print(user)
        print()

if __name__ == '__main__':
    for i in range(5):
        print(url)
        get_info()
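The hard-coded Cookie header above was copied out of a browser session, and tokens such as xq_a_token eventually expire. A minimal sketch of one workaround, assuming the server still issues the required cookies on a plain visit to the home page: let a requests.Session pick them up first and reuse them for the API call.

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36'})
# The first request lets the server set fresh session cookies;
# the Session object then sends them along with the API call.
session.get('https://xueqiu.com/')
api = ('https://xueqiu.com/v4/statuses/public_timeline_by_category.json'
       '?since_id=-1&max_id=-1&count=15&category=-1')
print(session.get(api).json().get('next_max_id'))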

Scraping Aitaobao

Fetch the title, sales volume, price, promotional price, review count, and other fields for each item in the product listing.

import requests
import re
import json

# Note: the cookie and referer here are carried over from the Xueqiu example above
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-encoding': 'gzip, deflate, br',
    'Accept-language': 'zh-CN,zh;q=0.9',
    'Cookie': 'aliyungf_tc=AQAAAOeXuT4bbQkAK43ItiYQYpmcKxei; acw_tc=2760824b15759005426311706e8c9a8474b7161bba769e895108bc4a5d7384; xq_a_token=c9d3b00a3bd89b210c0024ce7a2e049f437d4df3; xq_r_token=8712d4cae3deaa2f3a3d130127db7a20adc86fb2; u=981575900545496; Hm_lvt_1db88642e346389874251b5a1eded6e3=1575900546; device_id=24700f9f1986800ab4fcc880530dd0ed; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1575900583',
    'Referer': 'https://xueqiu.com/',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36',
}

def get_info(url):
    # 1. Fetch the JSONP response the URL returns
    response = requests.get(url, headers=headers)

    # 2. Use a regex to pick the {...} payload out of the response
    pattern = '({.*})'
    res = re.search(pattern, response.text)

    # 3. Parse the JSON string into a dict
    res_dict = json.loads(res.group(1))

    # 4. Read the item list under result in the dict
    items_list = res_dict['result']['item']
    print(items_list)

    for item in items_list:
        # Title
        title = item['TITLE']
        print(title)
        # Sales volume
        sell = item['SELL']
        print('Sales: ' + sell)
        # Price
        goodsprice = item['GOODSPRICE']
        print('Price: ' + goodsprice)
        # Promotional price
        if item['PROMOTEPRICE'] != "":
            promoteprice = item['PROMOTEPRICE']
            print('Promo price: ' + promoteprice)
        # Review count
        if 'feedGoodReviewCount' in item:
            feedGoodReviewCount = item['feedGoodReviewCount']
            print('Reviews: ' + feedGoodReviewCount)
        print()

if __name__ == '__main__':
    for i in range(5):
        url = 'https://odin.re.taobao.com/m/NwirelessSem?pid=430680_1006&refpid=mm_26632258_3504122_32554087&keyword=%E5%86%85%E8%A1%A3&count=60&offset={}&pvid=_TL-99123&features=isMall%2CisPostFree%2CfeedGoodReviewCount%2C10000034&sbid=re-searching&clk1=d48dd25cf2c0befc9139235d33641b70&_=1547436191605&callback=jsonp{}'.format(60*i, 12+i)
        print(url)
        get_info(url)
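The callback=jsonp{} parameter makes the server wrap the JSON in a function call such as jsonp12({...}), which is why the regex above has to peel the wrapper off before json.loads can run. A slightly more defensive sketch of the same idea; the helper name strip_jsonp is mine, not from the original:

import json
import re

def strip_jsonp(text):
    # Pull the JSON payload out of a jsonpNN({...}) wrapper
    match = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
    if match is None:
        raise ValueError('response is not JSONP')
    return json.loads(match.group(1))

# e.g. strip_jsonp('jsonp12({"result": {"item": []}})')
# returns {'result': {'item': []}}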

Scraping Toutiao street-snap photos

If you crawl too many times, the site will block access.
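Because of that blocking, it helps to pace the requests. A minimal throttling sketch, assuming a short randomized delay is enough to stay under the site's rate limit (the 1-3 second range is a guess, not something Toutiao documents); each requests.get call below could be routed through it:

import random
import time

import requests

def polite_get(url, **kwargs):
    # Sleep a random 1-3 s so requests don't arrive in a burst
    time.sleep(random.uniform(1.0, 3.0))
    return requests.get(url, **kwargs)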

import requests
import re
import json
import os
import time

# Build every listing-JSON URL and hand each one to get_detailed_page
def get_all_image():

    if not os.path.exists(mydir):
        os.mkdir(mydir)

    for i in range(2):
        timestamp = round(time.time() * 1000)
        url = 'https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis&timestamp={}'
        my_url = url.format(i * 20, timestamp)
        get_detailed_page(my_url)

# Fetch one listing-JSON URL, then hand each article URL to parse_detailed_page
def get_detailed_page(url):
    # Crawler step 3: send the request
    headers = {
        'cookie': 'tt_webid=6768656694743909895; _ga=GA1.2.237111991.1575950710; _gid=GA1.2.1896547608.1575950710; WIN_WH=587_722; tt_webid=6768656694743909895; s_v_web_id=e2310544b094625945f5535a168a9ba0; WEATHER_CITY=%E5%8C%97%E4%BA%AC; csrftoken=f00fea89a91dade14cb27f4d9dd9e12b; __tasessionId=76sg2e3zu1575961424549',
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
    }
    response = requests.get(url, headers=headers, verify=False)

    # Step 4: response.json() is shorthand for json.loads(response.text)
    res = response.json()

    data = res['data']
    for item in data:
        if 'article_url' in item:
            article_url = item['article_url']
            print(article_url)
            # Step 5: follow the article URL
            parse_detailed_page(article_url)

# Fetch one detail page taken from the listing JSON, then hand each image URL to download_image
def parse_detailed_page(url):
    # Run crawler step 3 again against the new URL
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    response = requests.get(url, headers=headers, verify=False)
    # Step 4: pull the gallery string out of the page's HTML
    res_match = pat.search(response.text)
    if res_match:
        json_str = res_match.group(1)
        # json_str is doubly encoded: the first loads yields the string
        # handed to JSON.parse, the second parses that string into a dict
        res = json.loads(json_str)
        res = json.loads(res)
        print(res)
        sub_images = res['sub_images']
        for item in sub_images:
            # Save the image content to disk with requests + open
            image_url = item['url']
            # Step 5
            download_image(image_url)

# Download one image URL found on a detail page
def download_image(url):
    filename = mydir + '/' + url.split('/')[-1] + '.jpg'
    print('Downloading ...' + filename)
    # Crawler step 3
    response = requests.get(url)
    print(response)
    # Step 5
    with open(filename, 'wb') as f:
        f.write(response.content)

if __name__ == '__main__':
    mydir = 'download'
    pattern = r'gallery: JSON\.parse\((.*?)\),'
    pat = re.compile(pattern)
    get_all_image()
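A second version of the same crawler follows. Instead of each handler calling the next one directly, it pushes (url, handler) pairs onto a url_list task queue that the main loop drains, turning the nested calls into flat iteration: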
import requests
import re
import json
import os

# Build every listing-JSON URL and queue it for get_detailed_page
def get_all_image():
    if not os.path.exists(mydir):
        os.mkdir(mydir)
    # Put every task on a queue as a (url, handler) pair
    url_list = []
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab&pd=synthesis'
    for i in range(2):
        my_url = url.format(i * 20)
        url_list.append((my_url, get_detailed_page))

    # Drain the queue; handlers push follow-up tasks back onto it
    while url_list:
        (my_url, func) = url_list.pop(0)
        func(my_url, url_list)

# Handle one listing-JSON URL; queue each article URL for parse_detailed_page
def get_detailed_page(url, url_list):
    # Crawler step 3: send the request
    response = requests.get(url)

    # Step 4: response.json() is shorthand for json.loads(response.text)
    res = response.json()

    data = res['data']

    for item in data:
        if 'article_url' in item:
            article_url = item['article_url']
            print(article_url)
            # Step 5: queue the article URL instead of recursing
            url_list.append((article_url, parse_detailed_page))  # was: parse_detailed_page(article_url)

# Handle one detail-page URL from the listing JSON; queue each image URL for download_image
def parse_detailed_page(url, url_list):
    # Run crawler step 3 again against the new URL
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    response = requests.get(url, headers=headers)
    # Step 4: pull the gallery string out of the page's HTML
    res_match = pat.search(response.text)
    if res_match:
        json_str = res_match.group(1)
        # Doubly encoded: first loads yields the string handed to
        # JSON.parse, the second parses that string into a dict
        res = json.loads(json_str)
        res = json.loads(res)
        sub_images = res['sub_images']
        for item in sub_images:
            # Save the image content to disk with requests + open
            image_url = item['url']
            # Step 5: queue the image URL
            url_list.append((image_url, download_image))  # was: download_image(image_url)

# Handle one image URL found on a detail page
def download_image(url, url_list):
    filename = mydir + '/' + url.split('/')[-1] + '.jpg'
    print('Downloading ...' + filename)
    # Crawler step 3
    response = requests.get(url)
    # Step 5
    with open(filename, 'wb') as f:
        f.write(response.content)

if __name__ == '__main__':
    mydir = 'download'
    pattern = r'gallery: JSON\.parse\((.*?)\),'
    pat = re.compile(pattern)
    get_all_image()
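One payoff of the queue refactor is that fetching becomes easy to parallelize, since the tasks are independent (url, handler) pairs. A minimal sketch using the standard library's thread pool; this is my extension under that assumption, not part of the original script:

from concurrent.futures import ThreadPoolExecutor

def drain_parallel(url_list, workers=4):
    # Drain the (url, handler) queue with a small thread pool.
    # Handlers append new pairs while a batch runs; list.append
    # from multiple threads is safe under CPython's GIL.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        while url_list:
            batch = url_list[:]   # snapshot the tasks queued so far
            del url_list[:]       # empty the shared queue in place
            futures = [pool.submit(func, u, url_list) for (u, func) in batch]
            for future in futures:
                future.result()   # re-raise any exception from a worker

Calling drain_parallel(url_list) in place of the while loop inside get_all_image would fetch each batch concurrently while keeping the handlers unchanged.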