Day 01: Crawling videos + automatic GitHub login via POST request

First, the requests library. Quick installation: pip3 install requests

Import packages:

import re
import time

import requests

The three steps of a crawler:

1. Send the request

def get_page(url):
    """Issue a GET request for ``url`` and hand back the ``requests`` response object."""
    return requests.get(url)

2. Parse the data

import re


def parse_index(html):
    """Extract the detail-page URLs from the HTML of an index page.

    Args:
        html: Text of the index page.

    Returns:
        A list with the ``href`` value of every ``imglink`` anchor that
        directly follows a ``<div class="items">`` tag; empty when nothing
        matches.
    """
    # re.findall(pattern, text, flags) returns every match of the capture group.
    # re.S lets "." match newlines too, so a match may span several lines.
    # "\s*" tolerates optional whitespace between the <div> and the <a> tag.
    detail_urls = re.findall(
        r'<div class="items">\s*<a class="imglink" href="(.*?)"',
        html,
        re.S,
    )
    return detail_urls
# Parse a detail page
def parse_detail(html):
    """Extract the video URL from the HTML of a detail page.

    Args:
        html: Text of the detail page.

    Returns:
        The ``src`` value of the first ``<source>`` tag, or ``None`` when the
        page contains no such tag.
    """
    # re.S lets "." match newlines, so the whole page text is searched.
    movie_url = re.findall(r'<source src="(.*?)"', html, re.S)
    if movie_url:
        return movie_url[0]
    return None

3. Save the data

import uuid  # uuid.uuid4() returns a random UUID, used here as a unique file name


def save_video(content):
    """Write binary video data to a new ``<uuid4>.mp4`` file in the current directory.

    Args:
        content: Bytes of the video to store.
    """
    # A fresh UUID per call keeps successive downloads from overwriting each other.
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(content)

Real example: crawling videos from the xiaohuar.com ("campus beauty") site

# Script entry point (original note: type "main" + Enter to get this guard;
# presumably an editor template shortcut).
if __name__ == '__main__':
    # Walk index pages list-3-0.html through list-3-5.html.
    for line in range(6):
        url = f'http://www.xiaohuar.com/list-3-{line}.html'
        # Send the request for the index page
        response = get_page(url)
        # print(response)
        # # response status code
        # print(response.status_code)
        # # response text
        # print(response.text)

        # Parse the index page
        detail_urls = parse_index(response.text)
        # Loop over the detail-page URLs
        for detail_url in detail_urls:
            # print(detail_url)
            # Send a request to each detail page
            detail_res = get_page(detail_url)
            # print(response.text)

            # Parse the detail page to get the video URL
            movie_url = parse_detail(detail_res.text)

            # Only continue when the page actually contains a video URL
            if movie_url:
                print(movie_url)

                # Request the video URL to get the video as a binary stream
                movie_res = get_page(movie_url)
                # Pass the binary stream to save_video to store it locally
                save_video(movie_res.content)

Part two: automatic GitHub login via POST request

1. Get token strings

'''
1. Visit the login page to obtain the token string
    Request URL:
        https://github.com/login
    Request method:
        GET
    Request headers:
        Cookies
        User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36
2. Parse the page and extract the token string
# Regular expression
<input type="hidden" name="authenticity_token" value="(.*?)" />
'''

import requests
import re
# URL of the GitHub login page
login_url = 'https://github.com/login'
# Request headers for the login page
login_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
login_res = requests.get(url=login_url, headers=login_headers)
# print(login_res.text)

# Parse the page and extract the token string from the hidden form input
token_matches = re.findall(
    '<input type="hidden" name="authenticity_token" value="(.*?)" />',
    login_res.text,
    re.S
)
# Fail with a clear message instead of a bare IndexError when the page
# does not contain the expected hidden input.
if not token_matches:
    raise RuntimeError('authenticity_token not found in the GitHub login page')
authenticity_token = token_matches[0]
print(authenticity_token)

# Collect the cookies of the login page as a plain dict
# print(type(login_res.cookies))
# print(type(login_res.cookies.get_dict()))
login_cookies = login_res.cookies.get_dict()

2. Start logging in to GitHub
'''
Automatic GitHub login via POST request:
Request URL:
    https://github.com/session/
Request method:
    POST
Request headers:
    Cookie
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36


Request body:
    commit:Sign in
    utf8:✓
    authenticity_token:28M+Bm0xt10QgEidyrICyo/53xxYWV0deet0sGQKPdoQG9FXPOqFHZjMQPHHc+RBlOfJTMplbpyJI7yoBZH0zw==
    login:*****
    password:********
    webauthn-support:unsupported
'''

import os

# URL that the login form is posted to
session_url = 'https://github.com/session/'

# Request headers (same browser User-Agent as used for the login page)
session_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

# Request body (form fields).
# The account and password are read from the environment instead of being
# hard-coded, so real credentials never end up in the source file. Set
# GITHUB_LOGIN and GITHUB_PASSWORD to an account that can log in successfully;
# a missing variable raises KeyError before any request is sent.
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": os.environ["GITHUB_LOGIN"],
    "password": os.environ["GITHUB_PASSWORD"],
    "webauthn-support": "supported"
}
# Post the form together with the cookies obtained from the login page
session_res = requests.post(url=session_url, headers=session_headers, cookies=login_cookies, data=form_data)
# Save the response page so the result can be inspected in a browser
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(session_res.text)

Open the generated github.html file in a browser: it shows that GitHub is logged in without the account and password having been entered by hand.

At first I tried using the QQ browser, and the result said that the account or password was incorrect. After switching to Google Chrome and updating the User-Agent and the other request data accordingly, the login worked normally.

Day01 crawling video + POST request automatic login GitHub

Guess you like