1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
from urllib.parse import urljoin

import requests
from lxml import etree

from Day2.mysqlhelper import MysqlHelper
# Module-level database helper shared by the scraper;
# get_detailed_page_info() calls helper.execute_insert_sql() for each match row.
helper = MysqlHelper()
def _first_text(node, path):
    """Return the stripped first result of ``node.xpath(path)``, or '' if none."""
    results = node.xpath(path)
    return results[0].strip() if results else ''


def get_detailed_page_info(url, date):
    """Scrape one schedule page and insert every match row into the database.

    The page is expected to list event (competition) names under
    ``div.leftList2`` and, for the N-th event name, a results table inside
    the N-th ``div.leftList4``.  Each match found is printed and stored in
    the ``football_match`` table through the module-level ``helper``.

    Args:
        url: Absolute URL of the schedule page to download.
        date: Date label stored alongside every match found on the page.
    """
    print('正在处理: ', url)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    html_ele = etree.HTML(response.text)
    event_names = html_ele.xpath('//div[@class="leftList2"]/div[@class="c2"]/a/text()')

    insert_sql = (
        'INSERT INTO football_match(`round`, `time`, status, host_team, score, guest_team, `date`, event_name)'
        'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
    )

    # XPath positions are 1-based, so number the events starting from 1.
    for position, event_name in enumerate(event_names, start=1):
        print(event_name)
        rows = html_ele.xpath('//div[@class="leftList4"][{}]/table/tr'.format(position))

        # Row 0 is skipped and only every third row after it is read;
        # presumably each match occupies three <tr> elements -- TODO confirm
        # against the live page markup.
        for row in rows[1::3]:
            match_round = _first_text(row, './td[1]/text()')
            match_time = _first_text(row, './td[2]/text()')
            status = _first_text(row, './td[3]/text()')
            host_team = _first_text(row, './td[4]//a/text()')
            guest_team = _first_text(row, './td[6]//a/text()')

            # A row without both team names is not a match row; skip it
            # instead of crashing or inserting a blank record.
            if not host_team or not guest_team:
                continue

            # The score is either plain text in the cell or wrapped in a
            # link; the linked form may contain embedded spaces/newlines.
            score = _first_text(row, './td[5]/text()')
            if not score:
                score = (
                    _first_text(row, './td[5]//a/text()')
                    .replace(' ', '')
                    .replace('\n', '')
                )

            print(match_round, match_time, status, host_team, score, guest_team)
            print('-' * 50)

            data = (match_round, match_time, status, host_team, score,
                    guest_team, date, event_name)
            helper.execute_insert_sql(insert_sql, data)
def get_all_football_match_info(start_url):
    """Crawl schedule index pages from ``start_url`` and scrape each daily page.

    For every index page, each linked daily page is handed to
    ``get_detailed_page_info`` together with a date label taken from the
    link's file name (``/schedule/20170604.html`` -> ``20170604``).  The
    crawl then follows the page's "next" link and stops when there is no
    such link or when it leads to a page that was already processed.

    Pages are walked in a loop rather than recursively so a long crawl
    cannot exhaust the interpreter's recursion limit.

    Args:
        start_url: Absolute URL of the first index page to process.
    """
    visited = set()
    page_url = start_url

    while page_url and page_url not in visited:
        visited.add(page_url)

        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(page_url, timeout=30)
        html_ele = etree.HTML(response.text)

        href_list = html_ele.xpath('//div[@class="t2L"]/div[@class="c2"]/ul/li/a/@href')
        for href in href_list:
            date = href.split('/')[-1].split('.')[0]
            get_detailed_page_info(urljoin(page_url, href), date)

        next_hrefs = html_ele.xpath('//div[@class="t2L"]/div[@class="c2"]/span[2]/a/@href')
        # Resolve the link to an absolute URL *before* comparing it with the
        # pages already seen; comparing the raw relative href against an
        # absolute URL can never match and would loop forever on a page
        # whose "next" link points back at itself.
        page_url = urljoin(page_url, next_hrefs[0]) if next_hrefs else None
if __name__ == '__main__':
    # Script entry point: start crawling from the 20170604 schedule page.
    start_url = 'http://goal.sports.163.com/schedule/20170604.html'
    get_all_football_match_info(start_url)
|