Day02: the requests library and the selenium library

Crawling Douban Top 250 movie information with the requests library

  - Request URL: https://movie.douban.com/top250

- Request method: GET
  - Request headers: User-Agent, Cookies




import requests
import re
def get_page(url, headers=None, timeout=10):
    """Fetch *url* with an HTTP GET and return the ``requests`` response.

    Args:
        url: Address of the page to download.
        headers: Optional mapping of extra request headers. Entries given
            here override the default browser-like ``User-Agent``.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error instead of blocking indefinitely.

    Returns:
        The ``requests.Response`` object; callers read its ``.text``.
    """
    # Send a browser-like User-Agent by default: some sites are known to
    # refuse the default "python-requests" agent.
    request_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/75.0.3770.100 Safari/537.36'
        ),
    }
    if headers:
        request_headers.update(headers)
    # Without an explicit timeout a stalled connection would hang forever.
    return requests.get(url, headers=request_headers, timeout=timeout)

def parse_index(html):
    """Extract the movie entries from one Top 250 listing page.

    The page is cut at every ``<div class="item">`` marker and each entry is
    parsed on its own, so a field missing from one entry can never make the
    match run on into the following entry.

    Args:
        html: Text of the listing page.

    Returns:
        A list of 9-tuples of strings: (rank, detail URL, title, director,
        starring, year/genre text, rating, number of ratings, one-line
        quote). The quote is ``''`` when the entry has no ``inq`` span.
        Entries lacking any of the other fields are skipped.
    """
    item_pattern = re.compile(
        '<em class="">(.*?)</em>.*?<a href="(.*?)">'
        '.*?<span class="title">(.*?)</span>'
        '.*?导演: (.*?)主演: (.*?)<br>(.*?)</p>'
        '.*?<span class="rating_num".*?>(.*?)</span>'
        '.*?<span>(.*?)人评价</span>'
        # Not every entry carries a one-line quote, so it is optional.
        '(?:.*?<span class="inq">(.*?)</span>)?',
        re.S,
    )
    movie_list = []
    # Element 0 of the split is whatever precedes the first entry.
    for chunk in html.split('<div class="item">')[1:]:
        match = item_pattern.search(chunk)
        if match:
            # groups('') turns the unmatched optional quote into ''.
            movie_list.append(match.groups(''))
    return movie_list

def save_data(movie):
    """Print one movie record and append it to ``douban_top250.txt``.

    Args:
        movie: A 9-item sequence: rank, detail URL, title, director,
            starring, year/genre text, rating, number of ratings and
            one-line description, in that order.
    """
    top, m_url, name, daoyan, actor, year_type, point, commit, desc = movie
    # Drop the newlines/indentation surrounding the captured year/genre text.
    year_type = year_type.strip()
    data = f'''
    ===============================
    film Ranking : {top}
    movies url: {m_url}
    movie name: {name}
    film directors: {daoyan}
    movie starring: {actor}
    Genre: {year_type}
    movie ratings: {point}
    movie review: {commit}
    The movie: {desc}
    ===============================
    \n
    '''
    print(data)
    # Append mode: each call adds one record to the same output file.
    with open('douban_top250.txt', 'a', encoding='utf-8') as f:
        f.write(data)
        print(f'电影:{name}写入成功...')

if __name__ == '__main__':
    # Ten listing pages; the ``start`` offset advances by 25 per page
    # (0, 25, ..., 225).
    for num in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={num}&filter='
        print(url)
        # Download the page, pull out its movie tuples, store each one.
        page = get_page(url)
        for movie in parse_index(page.text):
            save_data(movie)

 

Part two: the selenium library. 1. What is selenium? It was originally an automated testing tool; it works by driving a real browser to perform predefined actions. Since a crawler essentially simulates a browser, selenium can also be used for crawling.




2. Why use selenium?
Advantages:
- executes JavaScript code
- no need to analyze complex communication processes
- can handle pop-ups, scrolling and other browser operations
- ***** can obtain dynamically loaded data
- *** can get past login authentication

Disadvantages:
- low efficiency

3. Installation and use
1. Install the selenium library:
pip3 install selenium

2. Install a browser:
Google Chrome or Firefox

3. Install the browser driver
http://npm.taobao.org/mirrors/chromedriver/2.38/
Windows:
Download the win32 driver

 

Crawling JD.com (Jingdong) with the selenium library

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time

import time
# Start Chrome through the chromedriver executable at the given path.
# NOTE(review): passing the driver path positionally is the Selenium 3
# calling convention — confirm it matches the installed Selenium version.
driver = webdriver.Chrome(r'C:\Users\HP\Desktop\chromedriver.exe')
try:
    driver.get('https://www.jd.com/')
    # Explicit wait: poll for up to 10 seconds until the element with
    # id "key" is present in the DOM.
    wait = WebDriverWait(driver, 10)
    input_tag = wait.until(EC.presence_of_element_located(
        (By.ID, 'key')
    ))
    time.sleep(5)
    # Type the search keyword and submit it with the Enter key.
    input_tag.send_keys('公仔')
    input_tag.send_keys(Keys.ENTER)
    # Presumably keeps the browser open long enough to view the results.
    time.sleep(20)
finally:
    # quit() ends the whole session (browser and driver process);
    # close() would only close the current window and can leave the
    # chromedriver process running.
    driver.quit()

 

Logging in to Baidu with the selenium library

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

import time
# Start Chrome through the chromedriver executable at the given path.
# NOTE(review): the positional driver path and the find_element_by_* helpers
# below are Selenium 3 APIs — confirm they match the installed version.
driver = webdriver.Chrome(r'C:\Users\HP\Desktop\chromedriver.exe')
try:
    # Implicit wait: every element lookup retries for up to 10 seconds.
    driver.implicitly_wait(10)
    driver.get('https://www.baidu.com/')
    time.sleep(5)
    # 1. find_element_by_link_text: match a link by its full visible text.
    login_link = driver.find_element_by_link_text('登录')
    login_link.click()
    time.sleep(1)
    # 2. find_element_by_id
    user_login = driver.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    time.sleep(1)
    # 3. find_element_by_class_name
    user = driver.find_element_by_class_name('pass-text-input-userName')
    user.send_keys('[email protected]')
    # 4. find_element_by_name
    pwd = driver.find_element_by_name('password')
    pwd.send_keys('*****')
    submit = driver.find_element_by_id('TANGRAM__PSP_10__submit')
    submit.click()
    # 5. find_element_by_partial_link_text
    # NOTE(review): the search text is empty here, so this presumably matches
    # the first link on the page — the intended text appears to be missing;
    # confirm before relying on it.
    login_link = driver.find_element_by_partial_link_text('')
    login_link.click()
    # 6. find_element_by_css_selector
    login2_link = driver.find_element_by_css_selector('.tang-pass-footerBarULogin')
    login2_link.click()
    # 7. find_elements_by_tag_name: returns a list of all matching elements.
    div = driver.find_elements_by_tag_name('div')
    print(div)
    time.sleep(20)
finally:
    # quit() ends the whole session (browser and driver process);
    # close() would only close the current window and can leave the
    # chromedriver process running.
    driver.quit()

 















































Guess you like

Origin www.cnblogs.com/zhoujie333/p/11122936.html