6.17 - Python learning

Today's content:

1. POST requests with the requests module:

'''
POST request: logging in to GitHub
'''
import requests
import re

# step 1: visit the login page to obtain the token and cookies
'''
request url:
    https://github.com/login
request method:
    GET
response headers:
    Set-Cookie
request headers:
    cookies
    User-Agent
'''
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

response = requests.get(url='https://github.com/login', headers=headers)
# print(response.text)
# convert the cookies returned by the login page into a dictionary
login_cookies = response.cookies.get_dict()

# extract the authenticity_token from the login page with a regex
authenticity_token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />', response.text, re.S)[0]

print(authenticity_token)



# step 2: send a POST request to the session url
'''

request URL: 
    https://github.com/session 
    
request method: 
    POST 
    
request header: 
    # where the request came from
    Referer: https://github.com/login
    cookies: ...
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
    
request body: 
    only POST requests carry a request body.
    commit: Sign in
    utf8: ✓ 
    authenticity_token:
    login: tankjam1
    password: *****
    webauthn-support: unsupported
'''
# build the request headers
headers2 = {
    'Referer': 'https://github.com/login',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
}

# build the request body
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": "tankjam",
    "password": "kermit46709394",
    "webauthn-support": "unsupported",
# carrying the request header, the request message body cookies, login page
# Post request address sent to the session
}

response2 = requests.post(url='https://github.com/session', data=form_data, headers=headers2, cookies=login_cookies)
print(response2.status_code)
# print(response2.text)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)
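
The same login flow can also be written with requests.Session, which stores the Set-Cookie values from each response and sends them back automatically, so login_cookies never has to be passed around by hand. A minimal sketch of that variant, reusing the headers, headers2 and form_data defined above:

import requests
import re

session = requests.Session()  # the session object keeps cookies between requests

# step 1: GET the login page; the session records the cookies by itself
r1 = session.get('https://github.com/login', headers=headers)
authenticity_token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />', r1.text, re.S)[0]

# step 2: POST the form; no cookies argument is needed
form_data['authenticity_token'] = authenticity_token
r2 = session.post('https://github.com/session', data=form_data, headers=headers2)
print(r2.status_code)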

2. Response basics:

import requests

response = requests.get('https://baidu.com')
# response attributes
print(response.status_code)  # get the response status code
print(response.url)  # get the url
print(response.encoding)  # get the character encoding
response.encoding = 'utf-8'
print(response.text)  # get the response body as text
print(response.content)  # get the response body as a binary stream
print(response.headers)  # get the response headers
print(response.history)  # redirect history, i.e. the addresses jumped through
# cookies: 1. returned as a cookies object  2. can be converted to a dictionary
print(response.cookies)  # get the cookies object
print(response.cookies.get_dict())  # convert the cookies into a dictionary
print(response.cookies.items())  # get the cookies as (name, value) pairs
print(response.encoding)
print(response.elapsed)  # time taken by the request

import requests

# send a request for a video address
url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'
response = requests.get(url, stream=True)  # stream=True exposes the content as an iterator
# print(response.content)  # do not read .content here, or the stream is consumed before iter_content()

with open('love_for_GD.mp4', 'wb') as f:
    for content in response.iter_content(chunk_size=1024):  # read 1024 bytes at a time
        f.write(content)

3. requests advanced usage:

'''
Certificate verification (most sites use https)
'''
import requests
# for an https request, the certificate is checked first; if it is invalid, an error is raised and the program stops
response = requests.get('https://www.xiaohuar.com')
print(response.status_code)

# improvement 1: suppress the error, but a warning message still appears
import requests
response = requests.get('https://www.xiaohuar.com', verify=False)
# the certificate is not verified; a warning is printed and 200 is returned
print(response.status_code)

# improvement 2: suppress both the error and the warning message
import requests
import urllib3
urllib3.disable_warnings()  # silence the warning
response = requests.get('https://www.xiaohuar.com', verify=False)
print(response.status_code)

# improvement 3: attach a certificate
# many sites are https but can still be accessed without a certificate; in most cases carrying one is optional
# e.g. Zhihu and Baidu work with or without one
# some sites strictly require it, e.g. for corporate users the certificate must be obtained before the site can be accessed
import requests
import urllib3
# urllib3.disable_warnings()  # silence the warning
# pseudocode
response = requests.get(
    'https://www.xiaohuar.com',
    # verify=False,
    # paths to the certificate and key: /path/server.crt, /path/key
    cert=('/path/server.crt', '/path/key'))
print(response.status_code)


'''
Timeout settings
'''

# timeout settings
# timeout takes two forms: a float or a tuple
# timeout=0.1  # timeout for receiving data
# timeout=(0.1, 0.2)  # 0.1 is the connect timeout, 0.2 is the read timeout

import requests
response = requests.get('https://www.baidu.com',
                        timeout=0.0001)
# print(response.elapsed)
print(response.status_code)
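
When the timeout fires, requests raises an exception instead of returning a response, so the call is usually wrapped in try/except. A small sketch (the absurdly small timeout is only there to force the failure):

import requests

try:
    response = requests.get('https://www.baidu.com', timeout=0.0001)
    print(response.status_code)
except requests.exceptions.Timeout as e:
    # ConnectTimeout and ReadTimeout both inherit from Timeout
    print('request timed out:', e)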

'''
Proxy settings: the request goes to the proxy first, and the proxy then sends it on your behalf (getting an ip blocked is a common thing)
'''
import requests
proxies = {
    # proxy with a username and password; the part before the @ is user:password
    # 'http': 'http://tank:123@localhost:9527',
    'http': 'http://localhost:9527',
    'https': 'https://localhost:9527',
}
response = requests.get('https://www.12306.cn',
                        proxies=proxies)

print(response.status_code)
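
A quick way to confirm that traffic really goes through the proxy is to ask an IP-echo service which address it sees. A sketch, assuming a proxy is actually listening on localhost:9527:

import requests

proxies = {
    'http': 'http://localhost:9527',
    'https': 'https://localhost:9527',
}
# httpbin echoes the origin address of the request; through a working
# proxy this prints the proxy's exit ip instead of your own
response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
print(response.json()['origin'])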
'''
Crawling Xici free proxies:
    1. visit the Xici free proxy page
    2. parse and extract all the proxies with the re module
    3. test the crawled proxies against an ip test site
    4. if test_ip throws an exception the proxy is invalid, otherwise it is valid
    5. use a valid proxy for the real proxied crawl
<TR class = "ODD">  
      <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
      <TD> 112.85.131.99 </ td>
      <td>9999</td>
      <td>
        <a href="/2019-05-09/jiangsu">江苏南通</a>
      </td>
      <td class="country">高匿</td>
      <td>HTTPS</td>
      <td class="country">
        <div title="0.144秒" class="bar">
          <div class="bar_inner fast" style="width:88%">

          </div>
        </div>
      </td>
      <td class="country">
        <div title="0.028秒" class="bar">
          <div class="bar_inner fast" style="width:97%">

          </div>
        </div>
      </td>

      <td>6天</td>
      <td>19-05-16 11:20</td>
    </tr>
re:
    <tr class="odd">(.*?)</td>.*?<td>(.*?)</td>

'''
import requests
import re
import time

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}


def get_index(url):
    time.sleep(1)
    response = requests.get(url, headers=HEADERS)
    return response


def parse_index(text):
    ip_list = re.findall('<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>', text, re.S)
    for ip_port in ip_list:
        ip = ':'.join(ip_port)
        yield ip

def test_ip(ip):
    try:
        print('testing ip: %s' % ip)
        proxies = {
            'https': ip
        }

        # ip test site
        ip_url = 'https://www.ipip.net/'

        # visit the ip test site through the proxy; if it returns 200 the current proxy works
        response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1)

        if response.status_code == 200:
            print(f'useful ip: {ip}')
            return ip

    # an invalid proxy ip raises an exception
    except Exception as e:
        print(e)

# crawl the NBA site using the proxy
def spider_nba(good_ip):
    url = 'https://china.nba.com/'

    proxies = {
        'https': good_ip
    }

    response = requests.get(url, headers=HEADERS, proxies=proxies)
    print(response.status_code)
    print(response.text)


if __name__ == '__main__':
    base_url = 'https://www.xicidaili.com/nn/{}'

    for line in range(1, 3677):
        ip_url = base_url.format(line)

        response = get_index(ip_url)

        # parse each Xici proxy page and get the ip list
        ip_list = parse_index(response.text)

        # loop over each ip
        for ip in ip_list:
            # print(ip)

            # test the crawled ip
            good_ip = test_ip(ip)

            if good_ip:
                # start the real proxied crawl
                spider_nba(good_ip)



'''
Authentication settings
'''
# tested against the github api
import requests
from requests.auth import HTTPBasicAuth

url = 'https://api.github.com/user'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# test 1: without auth, 401 is returned
# response = requests.get(url, headers=HEADERS)
# print(response.status_code)  # 401
# print(response.text)
'''
printed result:
    {
      "message": "Requires authentication",
      "documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user"
    }
'''
# 
# test 2: authenticate with HTTPBasicAuth from requests.auth; on success the user info is returned
response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('tankjam', 'kermit46709394'))
print(response.text)
#

# test 3: the auth parameter of requests.get uses HTTPBasicAuth by default, so a plain tuple also returns the user info on success
response = requests.get(url, headers=HEADERS, auth=('tankjam', 'kermit46709394'))
print(response.text)
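
Note that GitHub has since removed username/password basic auth for its API; a personal access token now goes in the password slot, but the call shape is unchanged. A sketch with a placeholder token:

# a personal access token replaces the account password
response = requests.get(url, headers=HEADERS, auth=('tankjam', '<personal-access-token>'))
print(response.status_code)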


'''
File upload
'''
import requests

# upload a text file
files1 = {'file': open('user.txt', 'rb')}
# files is the fixed parameter name for uploads in a POST request
response = requests.post('http://httpbin.org/post', files=files1)
print(response.status_code)  # 200
print(response.text)

# upload an image file
files2 = {'jpg': open('punch.jpg', 'rb')}
response = requests.post('http://httpbin.org/post', files=files2)
print(response.status_code)  # 200
print(response.text)

# upload a video file
files3 = {'movie': open('love_for_GD.mp4', 'rb')}
response = requests.post('http://httpbin.org/post', files=files3)
print(response.status_code)  # 200
print(response.text)
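
Besides a bare file object, the files parameter also accepts a tuple of (filename, file object, content type), which sets the uploaded filename and MIME type explicitly. A small sketch with the same user.txt:

import requests

# the tuple form of the files parameter
files = {'file': ('user.txt', open('user.txt', 'rb'), 'text/plain')}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)  # 200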

4. Selenium basic usage:

'''
The selenium module explained

1. What is selenium?
    Originally an automated testing tool. It can drive the browser for us and
    automatically perform operations we define, such as executing JS code in
    the page or skipping past login checks. We can use selenium to build crawlers.

2. Why use selenium?
    1. Advantages:
        logging in with the requests module requires analysing a lot of complex
        request traffic; with selenium the login check can be bypassed easily.

    2. Disadvantages:
        the browser loads css, js, images, videos... so crawling is less
        efficient than with the requests module.

3. How to use selenium?
    Install the selenium module:
        pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium
    Download the browser driver:
        http://npm.taobao.org/mirrors/chromedriver/2.38/
'''

# first selenium example
from selenium import webdriver  # used to drive the browser

# By: ways of locating an element, e.g. By.ID, By.CSS_SELECTOR, By.CLASS_NAME
from selenium.webdriver.common.by import By

from selenium.webdriver.common.keys import Keys  # keyboard operations

# used together with WebDriverWait below; EC is an alias for expected_conditions
from selenium.webdriver.support import expected_conditions as EC

# wait for certain elements on the page to load
from selenium.webdriver.support.wait import WebDriverWait
import time

# open Chrome through the Chrome driver
# webdriver.Chrome(r'absolute path to chromedriver.exe')
# chrome = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')  # pass the absolute path of chromedriver.exe

# if chromedriver.exe is placed in the python interpreter's Scripts folder, no path is needed

# chrome is the driver object
chrome = webdriver.Chrome()
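
If no visible browser window is wanted (e.g. on a server), Chrome can also be started headless through its Options object. A minimal sketch; the variable name headless_chrome is just for illustration:

from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')  # run Chrome without opening a window
headless_chrome = webdriver.Chrome(options=options)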

'''
Example 1
'''
# wrap in try in case anything goes wrong
try:
    # send a get request to tank's blog page
    # chrome.get('https://www.cnblogs.com/kermitjam/')

    # parameter 1: the driver object; parameter 2: the wait time
    wait = WebDriverWait(chrome, 10)

    # 1. visit Baidu
    chrome.get('https://www.baidu.com/')

    # 2. find the search input box
    input_tag = wait.until(
        # call EC's presence_of_element_located()
        EC.presence_of_element_located(
            # this takes a tuple
            # element 1: how to locate the attribute
            # element 2: the name of the attribute
            (By.ID, "kw")
        )
    )
    # the same call written on one line
    input_tag = wait.until(EC.presence_of_element_located((By.ID, "kw")))

    # 3. search for One Punch Man
    input_tag.send_keys('One Punch Man')

    # 4. press the Enter key
    input_tag.send_keys(Keys.ENTER)

    time.sleep(3)

# close the browser no matter what happens
finally:
    # close the browser
    chrome.close()


'''
Example 2
'''
try:
    # send a get request to tank's blog page
    # chrome.get('https://www.cnblogs.com/kermitjam/')

    # parameter 1: the driver object; parameter 2: the wait time
    wait = WebDriverWait(chrome, 10)

    # 1. visit the JD home page
    chrome.get('https://www.jd.com/')

    # 2. find the search input box
    input_tag = wait.until(EC.presence_of_element_located((By.ID, "key")))

    # 3. search for Three Hundred Tang Poems
    input_tag.send_keys('Three Hundred Tang Poems')

    # 4. find the search button by its class name (see the clickable-wait variant after this example)
    search_button = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'button')))
    # 5. click the search button
    search_button.click()

    time.sleep(3)

# close the browser no matter what happens
finally:
    # close the browser
    chrome.close()
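
EC.presence_of_element_located only waits until the element exists in the DOM; for buttons it can be safer to wait until the element is actually clickable. The same steps 4-5 with EC.element_to_be_clickable (standard expected_conditions API):

# wait until the button is present, visible and enabled, then click it
search_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'button')))
search_button.click()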

5. Selenium basic selectors:

from selenium import webdriver  # used to drive the browser
import time

'''
Implicit wait
'''
# get the driver object
driver = webdriver.Chrome()

try:
    # explicit wait: wait for a specific element to load
    # parameter 1: the driver object; parameter 2: the wait time
    # wait = WebDriverWait(driver, 10)

    driver.get('https://china.nba.com/')

    # implicit wait: wait for all elements of the page to load
    driver.implicitly_wait(10)
    news_tag = driver.find_element_by_class_name('nav-news')
    # print the element object
    print(news_tag)
    # print the tag name
    print(news_tag.tag_name)

    time.sleep(10)

finally:
    driver.close()


from selenium import webdriver  # used to drive the browser
import time

'''
===================== all the find methods =====================
    element finds one element
    elements finds all matching elements

    1. find_element_by_link_text          find by link text
    2. find_element_by_id                 find by id
    3. find_element_by_class_name         find by class name
    4. find_element_by_partial_link_text  find by partial link text
    5. find_element_by_name               find by name attribute
    6. find_element_by_css_selector       find by css selector
    7. find_element_by_tag_name           find by tag name
'''
# get the driver object
driver = webdriver.Chrome()

try:

    # send a request to Baidu
    driver.get('https://www.baidu.com/')
    driver.implicitly_wait(10)

    # 1. find_element_by_link_text: find by the link text
    # find the login link by its full text
    # send_tag = driver.find_element_by_link_text('登录')
    # send_tag.click()

    # 2. find_element_by_partial_link_text: find an a tag by part of its text
    login_button = driver.find_element_by_partial_link_text('登')
    login_button.click()
    time.sleep(1)

    # 3. find_element_by_class_name: find by the class name
    login_tag = driver.find_element_by_class_name('tang-pass-footerBarULogin')
    login_tag.click()
    time.sleep(1)

    # 4. find_element_by_name: find by the name attribute
    username = driver.find_element_by_name('userName')
    username.send_keys('15622792660')
    time.sleep(1)

    # 5. find_element_by_id: find by the id attribute
    password = driver.find_element_by_id('TANGRAM__PSP_10__password')
    password.send_keys('*******')
    time.sleep(1)

    # 6. find_element_by_css_selector: find by a css selector
    # find the login button by its id
    login_submit = driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit')
    # driver.find_element_by_css_selector('.pass-submit-button')
    login_submit.click()

    # 7. find_element_by_tag_name: find by the tag name
    div = driver.find_element_by_tag_name('div')
    print(div.tag_name)

    time.sleep(10)

finally:
    driver.close()
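
One caveat: the find_element_by_* helpers above belong to Selenium 3 and were removed in Selenium 4 in favour of a single find_element(by, value) method. The same lookups in the newer style would look roughly like this:

from selenium.webdriver.common.by import By

# Selenium 4 equivalents of the helpers used above
login_button = driver.find_element(By.PARTIAL_LINK_TEXT, '登')
username = driver.find_element(By.NAME, 'userName')
password = driver.find_element(By.ID, 'TANGRAM__PSP_10__password')
login_submit = driver.find_element(By.CSS_SELECTOR, '#TANGRAM__PSP_10__submit')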

  

Origin www.cnblogs.com/hjeqng/p/11041331.html