Crawling Douban Top 250 with the requests library
- Sending requests
- GET requests with a User-Agent header and Cookies
import requests
import re


def get_page(url):
    """Fetch *url* with a plain GET and return the requests.Response."""
    response = requests.get(url)
    return response


def parse_index(html):
    """Extract movie records from one Douban Top-250 index page.

    Returns a list of 9-tuples:
    (rank, detail_url, title, director, actors, year/genre text,
     rating, vote_count, quote).
    """
    # re.S lets '.' match newlines, since each movie card spans many lines.
    movie_list = re.findall(
        '<div class="item">.*?<em class="">(.*?)</em>'
        '.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>'
        '.*?导演: (.*?)主演: (.*?)<br>(.*?)</p>'
        '.*?<span class="rating_num".*?>(.*?)</span>'
        '.*?<span>(.*?)人评价</span>'
        '.*?<span class="inq">(.*?)</span>',
        html,
        re.S)
    return movie_list


def save_data(movie):
    """Format one parsed movie tuple and append it to douban_top250.txt."""
    top, m_url, name, daoyan, actor, year_type, point, commit, desc = movie
    # The year/genre capture is padded with whitespace and newlines.
    year_type = year_type.strip()
    data = f'''===============================
Ranking: {top}
URL: {m_url}
Name: {name}
Director: {daoyan}
Starring: {actor}
Year/Genre: {year_type}
Rating: {point}
Votes: {commit}
Quote: {desc}
===============================
'''
    print(data)
    # Append mode so each of the 10 pages accumulates into one file.
    with open('douban_top250.txt', 'a', encoding='utf-8') as f:
        f.write(data)
    print(f'电影:{name}写入成功...')


if __name__ == '__main__':
    # Douban paginates 25 movies per page; start=0,25,...,225 covers all 250.
    for page in range(10):
        url = f'https://movie.douban.com/top250?start={page * 25}&filter='
        print(url)
        index_res = get_page(url)
        movie_list = parse_index(index_res.text)
        for movie in movie_list:
            save_data(movie)
2. Why use selenium?
Advantages:
- executes JavaScript code
- no need to analyze complex request/response flows
- can drive the browser: pop-ups, scrolling, and other interactions
- can obtain dynamically rendered data
- can get past login authentication
Disadvantages:
- low efficiency
3. Installation and use
1. Install the selenium library:
   pip3 install selenium
2. A browser must be installed
   (Google Chrome or Firefox)
3. Install the matching browser driver:
   http://npm.taobao.org/mirrors/chromedriver/2.38/
   Windows:
   download the win32 driver
Crawling JD.com with the selenium library
"""Search JD.com with selenium: open the homepage, wait for the search
box, type a query, and submit it with ENTER."""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time  # original imported this twice; once is enough

# Path to the local chromedriver binary.
driver = webdriver.Chrome(r'C:\Users\HP\Desktop\chromedriver.exe')
try:
    driver.get('https://www.jd.com/')
    # Explicit wait: poll up to 10 s for the search input (id="key")
    # to be present in the DOM before interacting with it.
    wait = WebDriverWait(driver, 10)
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'key')))
    time.sleep(5)
    input_tag.send_keys('公仔')      # type the search term
    input_tag.send_keys(Keys.ENTER)  # submit the search
    time.sleep(20)                   # keep the results page open briefly
finally:
    # Always release the browser, even if a step above raised.
    driver.close()
Logging in to Baidu with the selenium library
"""Log in to Baidu with selenium, demonstrating the element locator
strategies (link text, id, class name, name, partial link text, CSS
selector, tag name).

NOTE: the original used the find_element_by_* helpers, which were
removed in Selenium 4; this version uses find_element(By.…) instead.
The duplicate `import time` was also removed.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Chrome(r'C:\Users\HP\Desktop\chromedriver.exe')
try:
    # Implicit wait: every find_element call retries for up to 10 s.
    driver.implicitly_wait(10)
    driver.get('https://www.baidu.com/')
    time.sleep(5)
    # 1. locate by link text
    login_link = driver.find_element(By.LINK_TEXT, '登录')
    login_link.click()
    time.sleep(1)
    # 2. locate by id
    user_login = driver.find_element(By.ID, 'TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    time.sleep(1)
    # 3. locate by class name
    user = driver.find_element(By.CLASS_NAME, 'pass-text-input-userName')
    user.send_keys('[email protected]')
    # 4. locate by name attribute
    pwd = driver.find_element(By.NAME, 'password')
    pwd.send_keys('*****')
    submit = driver.find_element(By.ID, 'TANGRAM__PSP_10__submit')
    submit.click()
    # 5. locate by partial link text
    login_link = driver.find_element(By.PARTIAL_LINK_TEXT, '登')
    login_link.click()
    # 6. locate by CSS selector
    login2_link = driver.find_element(By.CSS_SELECTOR, '.tang-pass-footerBarULogin')
    login2_link.click()
    # 7. locate all elements with a given tag name
    divs = driver.find_elements(By.TAG_NAME, 'div')
    print(divs)
    time.sleep(20)
finally:
    # Always release the browser, even if a step above raised.
    driver.close()