Scraping xicidaili and checking proxy availability

  • Validating each proxy one at a time
# Fetch free proxies and check whether each one actually works.
# Crawl step 2: the source is xicidaili (free proxy list).
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}

def get_free_proxy():
    url = 'https://www.xicidaili.com/nn/'
    # Crawl step 3: request the list page
    response = requests.get(url, headers=headers)
    # with open('xicidaili.html', 'wb') as f:
    #     f.write(response.content)
    html_ele = etree.HTML(response.content)
    tr_eles = html_ele.xpath('//table[@id="ip_list"]//tr')
    tr_eles.pop(0)  # drop the table header row
    for tr_ele in tr_eles:
        ip_str = tr_ele.xpath('./td[2]/text()')[0]
        port = tr_ele.xpath('./td[3]/text()')[0]
        yield 'http://' + ip_str + ':' + port

def validate_proxy(proxy_str):
    url = 'http://httpbin.org/get'
    proxy = {
        'http': proxy_str,
        'https': proxy_str
    }
    try:
        response = requests.get(url, proxies=proxy, timeout=5)
        if response.status_code == 200:
            print('This proxy works:', proxy_str)
            return True
        else:
            print('This proxy does not work:', proxy_str)
            return False
    except Exception:
        print('This proxy does not work:', proxy_str)
        return False

if __name__ == '__main__':
    good_proxy = []
    for item in get_free_proxy():
        if validate_proxy(item):
            good_proxy.append(item)

    print('good proxy is:')
    print(good_proxy)
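
A 200 status alone only proves that some server answered; httpbin.org/get also echoes the caller's IP in the "origin" field of its JSON body. A slightly stricter check (a minimal sketch, not part of the original script; validate_proxy_strict is a hypothetical name) could confirm the request really went out through the proxy:

import requests

def validate_proxy_strict(proxy_str, timeout=5):
    # hypothetical helper: passes only if httpbin reports the proxy's IP as the origin
    proxy_ip = proxy_str.split('//')[1].split(':')[0]
    proxies = {'http': proxy_str, 'https': proxy_str}
    try:
        response = requests.get('http://httpbin.org/get', proxies=proxies, timeout=timeout)
        return proxy_ip in response.json().get('origin', '')
    except Exception:
        return False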
  • Validating proxies in parallel with a process pool
# Fetch free proxies and validate them concurrently with a process pool.
# Crawl step 2: the source is xicidaili (free proxy list).
import requests
from lxml import etree


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}

def get_free_proxy():
    url = 'https://www.xicidaili.com/nn/'
    # Crawl step 3: request the list page
    response = requests.get(url, headers=headers)
    # with open('xicidaili.html', 'wb') as f:
    #     f.write(response.content)
    html_ele = etree.HTML(response.content)
    tr_eles = html_ele.xpath('//table[@id="ip_list"]//tr')
    tr_eles.pop(0)  # drop the table header row
    for tr_ele in tr_eles:
        ip_str = tr_ele.xpath('./td[2]/text()')[0]
        port = tr_ele.xpath('./td[3]/text()')[0]
        yield 'http://' + ip_str + ':' + port

def validate_proxy(proxy_str):
    url = 'http://httpbin.org/get'
    proxy = {
        'http': proxy_str,
        'https': proxy_str
    }
    try:
        response = requests.get(url, proxies=proxy, timeout=5)
        if response.status_code == 200:
            print('This proxy works:', proxy_str)
            return proxy_str
        else:
            print('This proxy does not work:', proxy_str)
            return None
    except Exception:
        print('This proxy does not work:', proxy_str)
        return None

def start_process_pool():
    from multiprocessing import Pool
    pool = Pool(10)
    good_proxy = []
    # keep every AsyncResult so no return value is lost
    res_list = []
    for item in get_free_proxy():
        res = pool.apply_async(func=validate_proxy, args=(item,))
        res_list.append(res)
        # if validate_proxy(item):
        #     good_proxy.append(item)

    for res in res_list:
        proxy = res.get()
        if proxy:
            good_proxy.append(proxy)
    # close the pool and wait for all workers to finish
    pool.close()
    pool.join()
    return good_proxy

if __name__ == '__main__':
    good_proxy = start_process_pool()
    print('good proxy is:')
    print(good_proxy)
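
Validating proxies is network-bound rather than CPU-bound, so a thread pool works just as well as a process pool and avoids multiprocessing's pickling constraints. A minimal alternative sketch with concurrent.futures, reusing the validate_proxy and get_free_proxy defined above (validate_all_threaded is a hypothetical name):

from concurrent.futures import ThreadPoolExecutor

def validate_all_threaded(proxy_iter, workers=10):
    # map() runs validate_proxy concurrently and yields results in input order
    with ThreadPoolExecutor(max_workers=workers) as executor:
        results = list(executor.map(validate_proxy, proxy_iter))
    return [p for p in results if p]

# usage: good_proxy = validate_all_threaded(get_free_proxy())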
  • An object-oriented version of the proxy validation
# Fetch free proxies and validate them, organized as classes so that
# a new proxy site only has to implement get_free_proxy().
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}

# base class shared by every proxy-source site
class GetProxy(object):
    def validate_proxy(self, proxy_str):
        url = 'http://httpbin.org/get'
        proxy = {
            'http': proxy_str,
            'https': proxy_str
        }
        try:
            response = requests.get(url, proxies=proxy, timeout=5)
            if response.status_code == 200:
                print('This proxy works:', proxy_str)
                return proxy_str
            else:
                print('This proxy does not work:', proxy_str)
                return None
        except Exception:
            print('This proxy does not work:', proxy_str)
            return None

    def start_process_pool(self):
        from multiprocessing import Pool
        pool = Pool(10)
        good_proxy = []
        # keep every AsyncResult so no return value is lost
        res_list = []
        for item in self.get_free_proxy():
            res = pool.apply_async(func=self.validate_proxy, args=(item,))
            res_list.append(res)
            # if validate_proxy(item):
            #     good_proxy.append(item)

        for res in res_list:
            proxy = res.get()
            if proxy:
                good_proxy.append(proxy)
        # close the pool and wait for all workers to finish
        pool.close()
        pool.join()
        return good_proxy

    def get_free_proxy(self):
        # subclasses must override this
        raise NotImplementedError

class GetXicidailiProxy(GetProxy):
    def get_free_proxy(self):
        url = 'https://www.xicidaili.com/nn/'
        # Crawl step 3: request the list page
        response = requests.get(url, headers=headers)
        # with open('xicidaili.html', 'wb') as f:
        #     f.write(response.content)
        html_ele = etree.HTML(response.content)
        tr_eles = html_ele.xpath('//table[@id="ip_list"]//tr')
        tr_eles.pop(0)  # drop the table header row
        for tr_ele in tr_eles:
            ip_str = tr_ele.xpath('./td[2]/text()')[0]
            port = tr_ele.xpath('./td[3]/text()')[0]
            yield 'http://' + ip_str + ':' + port

class Get66IPProxy(GetProxy):
    def get_free_proxy(self):
        pass

if __name__ == '__main__':
    getxicidailiobj = GetXicidailiProxy()
    good_proxy = getxicidailiobj.start_process_pool()
    print('good proxy is:')
    print(good_proxy)
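
Raising NotImplementedError in the base method only fails when the method is actually called. The abc module can state the same contract up front, refusing to instantiate a subclass that forgets to override get_free_proxy. A minimal sketch of how the base class could be declared this way (the original code does not use abc; GetProxyBase is a hypothetical name):

from abc import ABC, abstractmethod

class GetProxyBase(ABC):
    @abstractmethod
    def get_free_proxy(self):
        """Yield proxy URLs such as 'http://1.2.3.4:8080'."""

# Instantiating a subclass that does not implement get_free_proxy raises:
# TypeError: Can't instantiate abstract class ... with abstract method get_free_proxy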

Scraping BOSS直聘 (zhipin.com)

  • Using xicidaili proxies
import requests
from lxml import etree
from Day2.mysqlhelper import MysqlHelper
import re
from urllib import parse
from Day10.ObjectOrientedProgram import GetXicidailiProxy

def parse_detailed_page(url):
    print('Processing: ' + url)
    if good_proxy:
        proxy = {
            'http': good_proxy[0],
            'https': good_proxy[0]
        }
    else:
        print('No usable proxy IPs left')
        return
    # Crawl step 3: request the detail page
    try:
        response = requests.get(url, headers=headers, proxies=proxy, timeout=5)
        if '看不清?点击换一张' in response.text:
            # we got the captcha page: drop this proxy and retry with the next one
            good_proxy.pop(0)
            parse_detailed_page(url)
            return
    except Exception:
        good_proxy.pop(0)
        parse_detailed_page(url)
        return
    # Crawl step 4: extract the fields
    html_ele = etree.HTML(response.text)

    title = html_ele.xpath('//div[@class="name"]/h1/text()')[0]
    salary = html_ele.xpath('//span[@class="salary"]/text()')[0]
    all_info = html_ele.xpath('//div[@class="info-primary"]/p//text()')[:3]
    city = all_info[0]
    exp = all_info[1]
    degree = all_info[2]
    job_desc = html_ele.xpath('//div[@class="text"]//text()')
    job_desc = [item.strip() for item in job_desc]
    job_desc = '\n'.join(job_desc)

    # print(title)
    # print(salary)
    # print(all_info)
    # print(city)
    # print(exp)
    # print(degree)
    # print(job_desc)

    insert_sql = 'INSERT INTO boss_test (title,salary,city,exp,degree,job_desc) VALUES' \
                 '(%s, %s, %s, %s, %s, %s)'
    data = (title, salary, city, exp, degree, job_desc)
    helper.execute_insert_sql(insert_sql, data)

# CREATE TABLE boss_test(id int primary key auto_increment,
# title varchar(30),salary char(8),city char(20),exp char(6),degree char(10),job_desc text)
# DEFAULT CHARSET=utf8mb4;

def parse_list_page(url):
    if good_proxy:
        proxy = {
            'http': good_proxy[0],
            'https': good_proxy[0]
        }
    else:
        print('No usable proxy IPs left')
        return
    # Crawl step 3: request the list page
    try:
        response = requests.get(url, headers=headers, proxies=proxy, timeout=5)
        if '看不清?点击换一张' in response.text:
            good_proxy.pop(0)
            parse_list_page(url)
            return
    except Exception:
        good_proxy.pop(0)
        parse_list_page(url)
        return
    # Crawl step 4: extract the detail-page links
    # A first (failed) attempt: matching whole <li> blocks is too coarse
    # pattern = '<li>(.*?)</li>'
    # li_strings = re.findall(pattern, response.text, re.S)
    # for li in li_strings:
    #     print('--' * 50)
    #     print(li)
    pattern = '<a href="(.*?)" data-jid'
    href_list = re.findall(pattern, response.text)
    # print(len(href_list))

    for href in href_list:
        # urljoin is equivalent to the manual concatenation below
        url = parse.urljoin(url, href)
        # url = 'https://www.zhipin.com' + href
        # Crawl step 5: follow each link into the detail page
        parse_detailed_page(url)

def get_all_page():
    base_url = 'https://www.zhipin.com/c101010100/?query=python&page={}&ka=page-next'
    for i in range(10):
        url = base_url.format(i + 1)
        parse_list_page(url)

if __name__ == '__main__':
    getxicidailiobj = GetXicidailiProxy()
    good_proxy = getxicidailiobj.start_process_pool()
    helper = MysqlHelper()

    headers = {
        'cookie': '_uab_collina=157077312178356694702941; lastCity=101190100; __c=1576457162; __g=-; __l=l=%2Fwww.zhipin.com%2Fweb%2Fcommon%2Fsecurity-check.html%3Fseed%3D%252FOhlAMNhqq0zP23T4wqv%252F%252FmwPvM0u7Ui6Uxy6OktMJ4%253D%26name%3D6c486237%26ts%3D1576457159842%26callbackUrl%3D%252Fjob_detail%252F7dbf05564030964a1HR42Ni8E1U%257E.html%26srcReferer%3D&r=&friend_source=0&friend_source=0; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1576457165,1576458525,1576458570; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1576460308; __a=80803241.1570773121.1573473201.1576457162.101.6.31.84; __zp_stoken__=6f7dpKksRc2BevK1XtDsgv5%2BK%2Bp8mG4ppisqnWD%2FMycu3PSmN5d1Xo%2FiAu8LFhutfH9LnYeivvlr8qElaKOIf7prwfk3gQtZfmyLUxsQjcQ8s4L0F7UcRmajqHSVANF7IdzN',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    }
    # url = 'https://www.zhipin.com/job_detail/0aed27b145de57a50nF92d20F1I~.html'
    # parse_detailed_page(url)

    # url = 'https://www.zhipin.com/c101010100/?query=python&page=1&ka=page-next'
    # parse_list_page(url)
    get_all_page()
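
MysqlHelper is imported from an earlier day's module (Day2.mysqlhelper) and is not shown in this post. For context, a minimal sketch of what such a helper might look like with pymysql, assuming the execute_insert_sql(sql, data) call used above; the connection parameters are placeholders and the real helper may differ:

import pymysql

class MysqlHelper(object):
    # hypothetical reconstruction of Day2.mysqlhelper
    def __init__(self, host='localhost', user='root', password='', db='test'):
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=db, charset='utf8mb4')

    def execute_insert_sql(self, sql, data):
        # parameterized execute() lets pymysql escape the values safely
        with self.conn.cursor() as cursor:
            cursor.execute(sql, data)
        self.conn.commit()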
  • Using Mogu proxy (mogumiao.com)
import requests

class ProxyHelperObject(object):
    def __init__(self):
        self.proxy = self.__requests_get_proxy()

    def get_proxy(self):
        return self.proxy

    def update_proxy(self, old_proxy):
        # only fetch a new proxy if the caller's proxy is still the current one,
        # so a proxy that was already replaced is not replaced a second time
        if old_proxy == self.proxy:
            self.proxy = self.__requests_get_proxy()
            print('Fetched a new proxy:', self.proxy)

    def __requests_get_proxy(self):
        url = 'http://piping.mogumiao.com/proxy/api/get_ip_al?appKey=4015b4545c6545d5b21a3b49fec0671c&count=1&expiryDate=0&format=2&newLine=2'
        response = requests.get(url)
        return 'http://' + response.text.strip()
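
The old_proxy check is the point of this class: several call sites may notice a bad proxy at about the same time, and only the first one should trigger a refresh. A short usage sketch, assuming the class above is importable and using an illustrative retry limit:

import requests

helper = ProxyHelperObject()
for attempt in range(3):  # illustrative retry limit
    current = helper.get_proxy()
    try:
        resp = requests.get('http://httpbin.org/get',
                            proxies={'http': current, 'https': current},
                            timeout=10)
        break
    except Exception:
        # pass the proxy we actually used, so a proxy that was already
        # replaced by another caller is not thrown away a second time
        helper.update_proxy(current)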
import requests
from lxml import etree
from Day2.mysqlhelper import MysqlHelper
import re
from urllib import parse
from Day11.ProxyHelper import ProxyHelperObject

def parse_detailed_page(url):
    print('Processing: ' + url)
    proxy = {
        'http': proxyhelper.get_proxy(),
        'https': proxyhelper.get_proxy()
    }
    # Crawl step 3: request the detail page
    try:
        response = requests.get(url, headers=headers, proxies=proxy, timeout=20)
        if '看不清?点击换一张' in response.text:
            # we got the captcha page: swap the proxy and retry
            proxyhelper.update_proxy(proxy['http'])
            parse_detailed_page(url)
            return
    except Exception:
        proxyhelper.update_proxy(proxy['http'])
        parse_detailed_page(url)
        return
    # Crawl step 4: extract the fields
    html_ele = etree.HTML(response.text)

    publish_time = html_ele.xpath('//div[@class="job-author"]/span/text()')[0].replace('发布于', '')
    title = html_ele.xpath('//h1/text()')[0]
    salary = html_ele.xpath('//span[@class="badge"]/text()')[0].strip()
    all_info = html_ele.xpath('//div[@class="info-primary"]/p//text()')[:3]
    city = all_info[0].replace('城市:', '')
    exp = all_info[1].replace('经验:', '')
    degree = all_info[2].replace('学历:', '')
    job_desc = html_ele.xpath('//div[@class="text"]//text()')
    job_desc = [item.strip() for item in job_desc]
    job_desc = '\n'.join(job_desc)

    insert_sql = 'INSERT INTO boss_test (publish_time,title,salary,city,exp,degree,job_desc) VALUES' \
                 '(%s, %s, %s, %s, %s, %s, %s)'
    data = (publish_time, title, salary, city, exp, degree, job_desc)
    helper.execute_insert_sql(insert_sql, data)

# CREATE TABLE boss_test(id int primary key auto_increment, publish_time datetime,
# title varchar(30),salary char(8),city char(20),exp char(6),degree char(10),job_desc text)
# DEFAULT CHARSET=utf8mb4;

def parse_list_page(url):
    proxy = {
        'http': proxyhelper.get_proxy(),
        'https': proxyhelper.get_proxy()
    }
    # Crawl step 3: request the list page
    try:
        response = requests.get(url, headers=headers, proxies=proxy, timeout=20)
        if '看不清?点击换一张' in response.text:
            proxyhelper.update_proxy(proxy['http'])
            parse_list_page(url)
            return
    except Exception:
        proxyhelper.update_proxy(proxy['http'])
        parse_list_page(url)
        return
    # Crawl step 4: extract the detail-page links
    # A first (failed) attempt: matching whole <li> blocks is too coarse
    # pattern = '<li>(.*?)</li>'
    # li_strings = re.findall(pattern, response.text, re.S)
    # for li in li_strings:
    #     print('--' * 50)
    #     print(li)
    pattern = '<a href="(.*?)" data-jid'
    href_list = re.findall(pattern, response.text)
    # print(len(href_list))

    for href in href_list:
        # urljoin is equivalent to the manual concatenation below
        url = parse.urljoin(url, href)
        # url = 'https://www.zhipin.com' + href
        # Crawl step 5: follow each link into the detail page
        parse_detailed_page(url)

def get_all_page():
    base_url = 'https://www.zhipin.com/c101010100/?query=python&page={}&ka=page-next'
    for i in range(10):
        url = base_url.format(i + 1)
        parse_list_page(url)

if __name__ == '__main__':
    helper = MysqlHelper()
    proxyhelper = ProxyHelperObject()

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }
    # url = 'https://www.zhipin.com/job_detail/7dbf05564030964a1HR42Ni8E1U~.html'
    # parse_detailed_page(url)

    # url = 'https://www.zhipin.com/c101010100/?query=python&page=1&ka=page-next'
    # parse_list_page(url)
    get_all_page()
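
One caveat with the retry-by-recursion pattern above: each failed request adds a stack frame, so a long streak of bad proxies or captcha pages can eventually hit Python's recursion limit. A loop-based variant of the fetch step (a sketch that keeps the proxyhelper and headers names from the script above; fetch_with_retry and max_retries are hypothetical):

def fetch_with_retry(url, max_retries=20):
    # returns the page text, or None once the retries are used up
    for _ in range(max_retries):
        current = proxyhelper.get_proxy()
        proxy = {'http': current, 'https': current}
        try:
            response = requests.get(url, headers=headers, proxies=proxy, timeout=20)
            if '看不清?点击换一张' not in response.text:
                return response.text
        except Exception:
            pass
        # captcha page or network error: swap the proxy and try again
        proxyhelper.update_proxy(current)
    return None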