Today's content:
1. Sending a POST request with requests:
'''
POST request: logging in to GitHub
'''
import requests
import re

# Step 1: obtain a token from the login page
'''
Request URL: https://github.com/login
Request method: GET
Response headers: Set-Cookie
Request headers: cookies, User-Agent
'''
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
response = requests.get(url='https://github.com/login', headers=headers)
# print(response.text)  # the login page

# convert the cookie information returned by the login page into a dictionary
login_cookies = response.cookies.get_dict()

# extract the authenticity_token hidden in the login form
authenticity_token = re.findall(
    '<input type="hidden" name="authenticity_token" value="(.*?)" />',
    response.text, re.S)[0]
print(authenticity_token)

# Step 2: send a POST request to the session url
'''
Request URL: https://github.com/session
Request method: POST
Request headers:
    Referer: https://github.com/login   # where the request comes from
    Cookies: ...
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
Request body (only a POST request has a request body):
    commit: Sign in
    utf8: ✓
    authenticity_token:
    login: tankjam1
    password: *****
    webauthn-support: unsupported
'''
# build the request headers
headers2 = {
    'Referer': 'https://github.com/login',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
}
# build the request body
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": "tankjam",
    "password": "kermit46709394",
    "webauthn-support": "unsupported",
}
# send the POST request to the session address, carrying the request headers,
# the request body and the cookies from the login page
response2 = requests.post(url='https://github.com/session',
                          data=form_data,
                          headers=headers2,
                          cookies=login_cookies)
print(response2.status_code)
# print(response2.text)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)
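The cookie bookkeeping above can also be delegated to requests.Session, which stores cookies between requests automatically, so login_cookies never has to be passed by hand. A minimal sketch of the same flow (the credentials are placeholders):

import re
import requests

session = requests.Session()  # a Session persists cookies across requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}

# step 1: GET the login page; the session stores the Set-Cookie values itself
response = session.get('https://github.com/login', headers=headers)
authenticity_token = re.findall(
    'name="authenticity_token" value="(.*?)"', response.text, re.S)[0]

# step 2: POST the form; no cookies argument is needed
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": "your_username",      # placeholder credentials
    "password": "your_password",
    "webauthn-support": "unsupported",
}
response2 = session.post('https://github.com/session', data=form_data, headers=headers)
print(response2.status_code)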
2. Response basics:
import requests

response = requests.get('https://baidu.com')
print(response.status_code)         # get the response status code
print(response.url)                 # get the url
print(response.encoding)            # character encoding
response.encoding = 'utf-8'
print(response.text)                # get the text
print(response.content)             # get the binary stream
print(response.headers)             # get the response headers
print(response.history)             # redirect history
# response.cookies returns a cookies object;
# get_dict() and items() convert the cookie information into a dictionary
print(response.cookies)
print(response.cookies.get_dict())
print(response.cookies.items())
print(response.encoding)
print(response.elapsed)             # elapsed time of the request

import requests

# send a request to a video address
url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'
response = requests.get(url, stream=True)  # stream=True turns the content into an iterator object
# print(response.content)  # careful: this would consume the whole stream at once
with open('love_for_GD.mp4', 'wb') as f:
    for content in response.iter_content():
        f.write(content)
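Note that iter_content() with no argument yields the body one byte at a time, which is slow for large files; passing chunk_size reads a bigger block per iteration. A small sketch, assuming the URL points at any large file (the URL below is a placeholder):

import requests

url = 'https://www.example.com/big_file.mp4'  # hypothetical URL, any large file works
response = requests.get(url, stream=True)
with open('download.mp4', 'wb') as f:
    for chunk in response.iter_content(chunk_size=1024 * 64):  # 64 KB per read
        if chunk:  # filter out keep-alive chunks
            f.write(chunk)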
3. Advanced usage of requests:
'''
Certificate verification (most sites are HTTPS)
'''
import requests

# For an SSL request, the certificate is checked first; if it is not valid,
# an error is raised and the program terminates
response = requests.get('https://www.xiaohuar.com')
print(response.status_code)

# Improvement 1: suppress the error, but a warning message remains
import requests
response = requests.get('https://www.xiaohuar.com', verify=False)  # skip certificate verification; prints a warning, returns 200
print(response.status_code)

# Improvement 2: suppress both the error and the warning
import requests
import urllib3
urllib3.disable_warnings()  # silence the warning
response = requests.get('https://www.xiaohuar.com', verify=False)
print(response.status_code)

# Improvement 3: attach a certificate
# Many sites are HTTPS but can also be accessed without a certificate;
# in most cases the certificate is optional (Zhihu, Baidu and the like work
# either way). Sites with a hard requirement must have it: for example,
# targeted users only gain access to a particular site after obtaining the certificate.
import requests
import urllib3
# urllib3.disable_warnings()  # silence the warning
# pseudocode
response = requests.get(
    'https://www.xiaohuar.com',
    # verify=False,
    # the directory where the certificate lives: /path/server.crt, /path/key
    cert=('/path/server.crt', '/path/key'))
print(response.status_code)

'''
Timeout setting
'''
# There are two kinds of timeout: a float or a tuple
# timeout=0.1        # timeout for receiving data
# timeout=(0.1, 0.2) # 0.1 is the connect timeout, 0.2 is the read timeout
import requests
response = requests.get('https://www.baidu.com', timeout=0.0001)
# print(response.elapsed)
print(response.status_code)

'''
Proxy settings: the request is sent to the proxy first, which then forwards
it on your behalf (getting an ip blocked is a common thing)
'''
import requests
proxies = {
    # proxy with a username and password; the part before the @ is user:password
    # 'http': 'http://tank:123@localhost:9527',
    'http': 'http://localhost:9527',
    'https': 'https://localhost:9527',
}
response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)

'''
Crawling Xici free proxies:
    1. visit the Xici free proxy page
    2. parse and extract all proxies with the re module
    3. test the crawled proxies against an ip test site
    4. if test_ip throws an exception the proxy is invalid, otherwise it works
    5. use a working proxy for the real crawl

A sample table row to parse:
<tr class="odd">
    <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
    <td>112.85.131.99</td>
    <td>9999</td>
    <td><a href="/2019-05-09/jiangsu">江苏南通</a></td>
    <td class="country">高匿</td>
    <td>HTTPS</td>
    <td class="country">
        <div title="0.144秒" class="bar"><div class="bar_inner fast" style="width:88%"></div></div>
    </td>
    <td class="country">
        <div title="0.028秒" class="bar"><div class="bar_inner fast" style="width:97%"></div></div>
    </td>
    <td>6天</td>
    <td>19-05-16 11:20</td>
</tr>

re: <tr class="odd">(.*?)</td>.*?<td>(.*?)</td>
'''
import requests
import re
import time

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}


def get_index(url):
    time.sleep(1)
    response = requests.get(url, headers=HEADERS)
    return response


def parse_index(text):
    ip_list = re.findall('<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>', text, re.S)
    for ip_port in ip_list:
        ip = ':'.join(ip_port)
        yield ip


def test_ip(ip):
    try:
        print('Testing ip: %s' % ip)
        proxies = {'https': ip}
        # ip test site
        ip_url = 'https://www.ipip.net/'
        # visit the test site through the proxy; a 200 response means this ip works
        response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1)
        if response.status_code == 200:
            print(f'Useful ip: {ip}')
            return ip
    # an invalid proxy throws an exception
    except Exception as e:
        print(e)


# crawl the NBA site through the proxy
def spider_nba(good_ip):
    url = 'https://china.nba.com/'
    proxies = {'https': good_ip}
    response = requests.get(url, headers=HEADERS, proxies=proxies)
    print(response.status_code)
    print(response.text)


if __name__ == '__main__':
    base_url = 'https://www.xicidaili.com/nn/{}'
    for line in range(1, 3677):
        ip_url = base_url.format(line)
        response = get_index(ip_url)
        # parse each Xici page and extract the ip list
        ip_list = parse_index(response.text)
        # loop over the ips
        for ip in ip_list:
            # print(ip)
            # test each crawled ip
            good_ip = test_ip(ip)
            if good_ip:
                # start the real crawl with a working proxy
                spider_nba(good_ip)
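test_ip above treats every failure the same way; requests actually raises distinct exceptions for a refused proxy and a slow one, and catching them separately makes the log easier to read. A hedged variant under the assumption that httpbin.org/ip is reachable (it simply echoes the caller's IP, which also confirms the proxy is really being used):

import requests
from requests.exceptions import ProxyError, ConnectTimeout, ReadTimeout

def test_ip_strict(ip):
    proxies = {'https': ip}
    try:
        # httpbin.org/ip echoes the calling IP, so a 200 response also
        # shows which address the target server actually saw
        response = requests.get('http://httpbin.org/ip',
                                proxies=proxies, timeout=(1, 2))
        if response.status_code == 200:
            print(f'working proxy: {ip} -> {response.text.strip()}')
            return ip
    except (ProxyError, ConnectTimeout, ReadTimeout) as e:
        # refused connection, dead proxy, or too slow
        print(f'bad proxy: {ip} ({type(e).__name__})')

# usage:
# test_ip_strict('112.85.131.99:9999')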
'''
Authentication settings
'''
# test by accessing the GitHub API
import requests
from requests.auth import HTTPBasicAuth

url = 'https://api.github.com/user'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# Test 1: without authentication, 401 is returned
# response = requests.get(url, headers=HEADERS)
# print(response.status_code)  # 401
# print(response.text)
'''
Printed result:
{
    "message": "Requires authentication",
    "documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user"
}
'''

# Test 2: authenticate with HTTPBasicAuth from requests.auth;
# on success the user's information is returned
response = requests.get(url, headers=HEADERS,
                        auth=HTTPBasicAuth('tankjam', 'kermit46709394'))
print(response.text)

# Test 3: the auth parameter of requests.get defaults to HTTPBasicAuth,
# so a plain tuple works too; on success the user's information is returned
response = requests.get(url, headers=HEADERS, auth=('tankjam', 'kermit46709394'))
print(response.text)

'''
File upload
'''
import requests

# upload a text file; files is the fixed parameter name for POST uploads
files1 = {'file': open('user.txt', 'rb')}
response = requests.post('http://httpbin.org/post', files=files1)
print(response.status_code)  # 200
print(response.text)

# upload an image file
files2 = {'jpg': open('punch.jpg', 'rb')}
response = requests.post('http://httpbin.org/post', files=files2)
print(response.status_code)  # 200
print(response.text)

# upload a video file
files3 = {'movie': open('love_for_GD.mp4', 'rb')}
response = requests.post('http://httpbin.org/post', files=files3)
print(response.status_code)  # 200
print(response.text)
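requests also accepts a tuple per upload so the filename and content type can be set explicitly instead of being inferred from the file object. A small sketch against the same httpbin endpoint (the filenames are placeholders):

import requests

# (filename as the server sees it, file object, content type)
files = {'file': ('report.txt', open('user.txt', 'rb'), 'text/plain')}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)
print(response.json()['files'])  # httpbin echoes the upload back as JSON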
4. Basic use of selenium:
'''
The selenium module explained

1. What is selenium?
Originally an automated testing tool. It drives the browser and automatically
performs user-defined operations, such as executing JS code in the page, which
lets it skip past login authentication. We can use selenium to build crawlers.

2. Why use selenium?
Advantages:
    Logging in with the requests module requires analysing a large and
    complicated communication flow; with selenium it is easy to bypass
    login authentication.
Disadvantages:
    The browser loads css, js, images, videos... so crawling is less
    efficient than with the requests module.

3. How to use selenium?
Download the selenium module:
    pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium
Download the browser driver:
    http://npm.taobao.org/mirrors/chromedriver/2.38/
'''

# selenium, part one
from selenium import webdriver  # used to drive the browser
# By: ways of locating an attribute, e.g. By.ID, By.CSS_SELECTOR, By.CLASS_NAME
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # keyboard operations
# expected_conditions, aliased EC, is used together with WebDriverWait below
from selenium.webdriver.support import expected_conditions as EC
# wait for certain elements of the page to load
from selenium.webdriver.support.wait import WebDriverWait
import time

# open Chrome through the Chrome driver
# webdriver.Chrome(r'absolute path to chromedriver.exe')
# chrome = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')
# if chromedriver.exe lives in the python interpreter's Scripts folder,
# the path can be omitted; chrome is a driver object
chrome = webdriver.Chrome()

'''
Example 1
'''
# catch any exception with try
try:
    # send a GET request to tank's blog page
    # chrome.get('https://www.cnblogs.com/kermitjam/')

    # parameter 1: the driver object; parameter 2: the waiting time
    wait = WebDriverWait(chrome, 10)

    # 1. visit Baidu
    chrome.get('https://www.baidu.com/')

    # 2. find the input box
    input_tag = wait.until(
        # call EC.presence_of_element_located()
        EC.presence_of_element_located(
            # a tuple goes here
            # parameter 1: how to look the attribute up
            # parameter 2: the name of the attribute
            (By.ID, "kw")
        )
    )
    # the same call as a one-liner
    input_tag = wait.until(EC.presence_of_element_located((By.ID, "kw")))

    # 3. search for One Punch Man
    input_tag.send_keys('One Punch Man')

    # 4. press the Enter key
    input_tag.send_keys(Keys.ENTER)
    time.sleep(3)

# whatever happens, close the browser
finally:
    # close the browser
    chrome.close()

'''
Example 2
'''
chrome = webdriver.Chrome()  # reopen the browser; it was closed above
try:
    # send a GET request to tank's blog page
    # chrome.get('https://www.cnblogs.com/kermitjam/')

    # parameter 1: the driver object; parameter 2: the waiting time
    wait = WebDriverWait(chrome, 10)

    # 1. visit the JD home page
    chrome.get('https://www.jd.com/')

    # 2. find the input box
    input_tag = wait.until(EC.presence_of_element_located((By.ID, "key")))

    # 3. search for Three Hundred Tang Poems
    input_tag.send_keys('Three Hundred Tang Poems')

    # 4. find the search button by its class attribute name
    search_button = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'button')))

    # 5. click the search button
    search_button.click()
    time.sleep(3)

# whatever happens, close the browser
finally:
    # close the browser
    chrome.close()
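For crawling, a visible browser window is usually unnecessary; Chrome can run headless via Options. A minimal sketch, assuming chromedriver is on PATH and a selenium version that accepts the options keyword:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')     # no visible browser window
options.add_argument('--disable-gpu')  # often recommended for headless on Windows

driver = webdriver.Chrome(options=options)  # assumes chromedriver is on PATH
try:
    driver.get('https://www.baidu.com/')
    print(driver.title)  # proves the page loaded without a visible window
finally:
    driver.close()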
5. Basic selenium selectors:
from selenium import webdriver  # used to drive the browser
import time

'''
Implicit wait
'''
# get the driver object
driver = webdriver.Chrome()

try:
    # Explicit wait: waits for a specific element to load
    # parameter 1: the driver object; parameter 2: the waiting time
    # wait = WebDriverWait(driver, 10)

    driver.get('https://china.nba.com/')

    # Implicit wait: waits for all the elements of the page to load
    driver.implicitly_wait(10)
    news_tag = driver.find_element_by_class_name('nav-news')
    # get the tag object
    print(news_tag)
    # get the tag name
    print(news_tag.tag_name)

    # time.sleep(10)

finally:
    driver.close()


from selenium import webdriver  # used to drive the browser
import time

'''
=============== all methods ===================
element: find one tag
elements: find all matching tags

1. find_element_by_link_text          find by link text
2. find_element_by_id                 find by id
3. find_element_by_class_name
4. find_element_by_partial_link_text
5. find_element_by_name
6. find_element_by_css_selector
7. find_element_by_tag_name
'''
# get the driver object
driver = webdriver.Chrome()

try:
    # send a request to Baidu
    driver.get('https://www.baidu.com/')
    driver.implicitly_wait(10)

    # 1. find_element_by_link_text: find by the full link text
    # send_tag = driver.find_element_by_link_text('登录')  # the "login" link
    # send_tag.click()

    # 2. find_element_by_partial_link_text: find an a tag by part of its text
    login_button = driver.find_element_by_partial_link_text('登')
    login_button.click()
    time.sleep(1)

    # 3. find_element_by_class_name: find by class attribute name
    login_tag = driver.find_element_by_class_name('tang-pass-footerBarULogin')
    login_tag.click()
    time.sleep(1)

    # 4. find_element_by_name: find by name attribute
    username = driver.find_element_by_name('userName')
    username.send_keys('15622792660')
    time.sleep(1)

    # 5. find_element_by_id: find by id attribute name
    password = driver.find_element_by_id('TANGRAM__PSP_10__password')
    password.send_keys('*******')
    time.sleep(1)

    # 6. find_element_by_css_selector: find by selector
    # find the login button by its id
    login_submit = driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit')
    # driver.find_element_by_css_selector('.pass-submit-button')
    login_submit.click()

    # 7. find_element_by_tag_name: find by tag name
    div = driver.find_element_by_tag_name('div')
    print(div.tag_name)

    time.sleep(10)

finally:
    driver.close()
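Each find_element_by_* method has a find_elements_by_* counterpart (note the plural) that returns a list of every match rather than just the first one. A short sketch on the same Baidu page:

from selenium import webdriver

driver = webdriver.Chrome()
try:
    driver.get('https://www.baidu.com/')
    driver.implicitly_wait(10)

    # elements (plural) returns a list of all matching tags
    links = driver.find_elements_by_tag_name('a')
    print(len(links))        # how many a tags the page contains
    for link in links[:5]:   # first five links as a sample
        print(link.text, link.get_attribute('href'))
finally:
    driver.close()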