import re
from urllib import parse

import requests
from lxml import etree

from Day2.mysqlhelper import MysqlHelper
from Day11.ProxyHelper import ProxyHelperObject
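
# NOTE: Day2.mysqlhelper and Day11.ProxyHelper are project-local helper modules
# from earlier chapters, not PyPI packages. The script only relies on the
# interface visible in the calls below (described here as an assumption):
#   MysqlHelper.execute_insert_sql(sql, data)  - run a parameterized INSERT
#   ProxyHelperObject.get_proxy()              - return a proxy URL string
#   ProxyHelperObject.update_proxy()           - switch to a fresh proxy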
def parse_detailed_page(url):
    print('Processing: ' + url)
    proxy = {
        'http': proxyhelper.get_proxy(),
        'https': proxyhelper.get_proxy(),
    }
    try:
        response = requests.get(url, headers=headers, proxies=proxy, timeout=20)
        # '看不清?点击换一张' ("Can't see it? Click for a new one") is the captcha
        # prompt served when a request gets blocked: switch proxy and retry.
        if '看不清?点击换一张' in response.text:
            proxyhelper.update_proxy()
            parse_detailed_page(url)
            return
    except Exception:
        # Request failed (timeout, dead proxy, ...): switch proxy and retry.
        proxyhelper.update_proxy()
        parse_detailed_page(url)
        return
    html_ele = etree.HTML(response.text)
    # Extract the fields of the job posting. The replace() calls strip the
    # Chinese label prefixes: 发布于 = "published on", 城市 = city,
    # 经验 = experience, 学历 = education.
    publish_time = html_ele.xpath('//div[@class="job-author"]/span/text()')[0].replace('发布于', '')
    title = html_ele.xpath('//h1/text()')[0]
    salary = html_ele.xpath('//span[@class="badge"]/text()')[0].strip()
    all_info = html_ele.xpath('//div[@class="info-primary"]/p//text()')[:3]
    city = all_info[0].replace('城市:', '')
    exp = all_info[1].replace('经验:', '')
    degree = all_info[2].replace('学历:', '')
    job_desc = html_ele.xpath('//div[@class="text"]//text()')
    job_desc = [item.strip() for item in job_desc]
    job_desc = '\n'.join(job_desc)
    insert_sql = ('INSERT INTO boss_test '
                  '(publish_time, title, salary, city, exp, degree, job_desc) '
                  'VALUES (%s, %s, %s, %s, %s, %s, %s)')
    data = (publish_time, title, salary, city, exp, degree, job_desc)
    helper.execute_insert_sql(insert_sql, data)
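
# The boss_test table is assumed to already exist. A minimal sketch of a schema
# matching the INSERT above (column names come from the query, the types are
# assumptions):
#
#   CREATE TABLE boss_test (
#       id           INT AUTO_INCREMENT PRIMARY KEY,
#       publish_time VARCHAR(64),
#       title        VARCHAR(255),
#       salary       VARCHAR(64),
#       city         VARCHAR(64),
#       exp          VARCHAR(64),
#       degree       VARCHAR(64),
#       job_desc     TEXT
#   );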
def parse_list_page(url):
    proxy = {
        'http': proxyhelper.get_proxy(),
        'https': proxyhelper.get_proxy(),
    }
    try:
        response = requests.get(url, headers=headers, proxies=proxy, timeout=20)
        if '看不清?点击换一张' in response.text:
            # Captcha page: switch proxy and retry.
            proxyhelper.update_proxy()
            parse_list_page(url)
            return
    except Exception:
        proxyhelper.update_proxy()
        parse_list_page(url)
        return
    # Collect the detail-page links of every job on this list page.
    pattern = '<a href="(.*?)" data-jid'
    href_list = re.findall(pattern, response.text)
    for href in href_list:
        # Keep the list-page url as the base so relative hrefs resolve correctly.
        detail_url = parse.urljoin(url, href)
        parse_detailed_page(detail_url)
def get_all_page():
    # Crawl the first 10 result pages for the "python" query (city code c101010100).
    base_url = 'https://www.zhipin.com/c101010100/?query=python&page={}&ka=page-next'
    for i in range(10):
        url = base_url.format(i + 1)
        parse_list_page(url)
if __name__ == '__main__':
    helper = MysqlHelper()
    proxyhelper = ProxyHelperObject()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }
    get_all_page()