selenium
在本机的 Python 环境中,使用 pip install selenium 命令安装 selenium 库
查看浏览器版本号,下载对应版本的driver启动器
把下载的启动器(driver)所在的目录加入系统环境变量 PATH,这样启动浏览器时就不需要再单独配置 driver 的路径
具体安装和用法,点击详情
# Search Baidu through a Selenium-driven Chrome and save the rendered
# result page to 'baidudachangtui.html'.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Expects a chromedriver matching the installed Chrome to be on PATH.
driver = webdriver.Chrome()
try:
    driver.get('http://www.baidu.com')
    # find_element(By.ID, ...) replaces find_element_by_id, which was
    # removed in Selenium 4.3; this form also works on Selenium 3.
    # 'kw' receives the query text, 'su' is clicked to submit it.
    driver.find_element(By.ID, 'kw').send_keys('大长腿是什么体验')
    driver.find_element(By.ID, 'su').click()
    # Fixed pause so the result page can load before it is captured.
    time.sleep(3)
    with open('baidudachangtui.html', 'wb') as f:
        f.write(driver.page_source.encode('utf-8'))
finally:
    # Always shut the browser down, even if a step above failed.
    driver.quit()
有道翻译的爬取(selenium)
# Type a phrase into Youdao Translate through Selenium and print the
# text of the translation element.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Expects a chromedriver matching the installed Chrome to be on PATH.
driver = webdriver.Chrome()
try:
    driver.get('http://fanyi.youdao.com/')
    # find_element(By.ID, ...) replaces find_element_by_id, which was
    # removed in Selenium 4.3; this form also works on Selenium 3.
    driver.find_element(By.ID, 'inputOriginal').send_keys('你真帅')
    # Fixed pause so the page can fill in the translation before reading it.
    time.sleep(1)
    translated = driver.find_element(By.ID, 'transTarget').text
    print(translated)
finally:
    # Always shut the browser down, even if a step above failed.
    driver.quit()
新浪微博的爬取(selenium)
# Log in to Weibo with a headless Selenium-driven Chrome, collect the
# session cookies, then reuse them with requests to download the account
# settings page into 'basic_modify.html'.
import getpass
import os
import time

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("window-size=1980,1080")

# Credentials must not be committed in source: read them from the
# environment, falling back to an interactive prompt.
username = os.environ.get('WEIBO_USERNAME') or input('请输入微博账号:\n')
password = (os.environ.get('WEIBO_PASSWORD')
            or getpass.getpass('请输入微博密码:\n'))

# `options=` replaces the deprecated `chrome_options=` keyword
# (accepted since Selenium 3.8; the old keyword is gone in Selenium 4.10+).
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get('https://weibo.com/')
    # Fixed pause so the login form can load.
    time.sleep(10)

    # find_element(By.X, ...) replaces the find_element_by_* helpers,
    # which were removed in Selenium 4.3.
    driver.find_element(By.ID, 'loginname').send_keys(username)
    driver.find_element(By.NAME, 'password').send_keys(password)
    driver.find_element(By.CLASS_NAME, 'W_btn_a').click()
    time.sleep(20)

    # If the page asks for a captcha, save the image locally so the
    # operator can read it and type the code at the prompt.
    if '"text=请输入验证码"' in driver.page_source:
        img_ele = driver.find_element(
            By.XPATH, '//img[@node-type="verifycode_image"]')
        image_url = img_ele.get_attribute('src')
        # Timeout so a stalled download cannot hang the script forever.
        response = requests.get(image_url, timeout=30)
        with open('verify_image.jpg', 'wb') as f:
            f.write(response.content)
        verify_image = input('请输入验证码:\n')
        driver.find_element(By.NAME, 'verifycode').send_keys(verify_image)
        driver.find_element(By.CLASS_NAME, 'W_btn_a').click()
        time.sleep(20)

    cookie_list = driver.get_cookies()
finally:
    # Always shut the browser down, even if a step above failed.
    driver.quit()

# Flatten the browser cookies into a single "name=value; ..." header value.
cookie_str = '; '.join(
    item['name'] + '=' + item['value'] for item in cookie_list)
print(cookie_str)

url = 'https://account.weibo.com/set/index?topnav=1&wvr=6'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': cookie_str,
    'Host': 'account.weibo.com',
    'Referer': 'https://weibo.com/u/5995110384/home',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
response = requests.get(url, headers=headers, timeout=30)
with open('basic_modify.html', 'wb') as f:
    f.write(response.content)
雪球网的爬取(requests,BeautifulSoup)
BeautifulSoup 支持 CSS 选择器:通过 select() 方法传入选择器表达式,即可选取特定的标签
# Fetch the Xueqiu "ask square" page with requests and print the tags
# matched by the CSS selector 'div#head'.
import requests
from bs4 import BeautifulSoup

url = 'https://xueqiu.com/ask/square'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
}
# Timeout so a stalled connection cannot hang the script forever.
response = requests.get(url, headers=headers, timeout=30)

# The 'lxml' parser requires the lxml package to be installed.
soup = BeautifulSoup(response.text, 'lxml')
# select() takes a CSS selector and returns a list of matching tags.
div_tag_list = soup.select('div#head')
print(div_tag_list)