Python Training, Day 5

1. requests POST requests

2. requests advanced usage

3. the selenium module

1.

# POST request to log in to GitHub
# request url: https://github.com/login
# request method: POST
# request headers:
#     Referer
#     Cookie
#     User-Agent
# request body (only POST requests carry a body):
#     commit: Sign in
#     utf8: ✓
#     authenticity_token: .........
#     login: .....
#     password: ......
#     webauthn-support: supported


# step 1: get the token information
import requests
import re

headers = {'User-Agent': '......'}
response = requests.get(url='......', headers=headers)
login_cookies = response.cookies.get_dict()  # cookies issued with the login page, needed for the POST below
authenticity_token = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />', response.text, re.S)[0]
print(authenticity_token)
# step 2: splice the request headers and form data together
headers2 = {
    'Referer': '.....',
    'User-Agent': '......',
}
form_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": authenticity_token,
    "login": "..",
    "password": "....",
    "webauthn-support": "supported",
}
response2 = requests.post(url='...', data=form_data, headers=headers2, cookies=login_cookies)
print(response2.status_code)
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)
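
# the same flow reads more cleanly with requests.Session, which carries the cookies between
# the GET and the POST automatically; a minimal sketch keeping the placeholder credentials,
# and assuming the login form posts to https://github.com/session (check the form's action):
import requests
import re

session = requests.Session()  # a Session re-sends cookies across requests automatically

# step 1: fetch the login page and pull out the token
r1 = session.get('https://github.com/login', headers={'User-Agent': '......'})
token = re.findall('name="authenticity_token" value="(.*?)"', r1.text, re.S)[0]

# step 2: post the form; the session attaches the cookies from step 1 by itself
form_data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token,
    'login': '..',
    'password': '....',
    'webauthn-support': 'supported',
}
r2 = session.post('https://github.com/session',  # assumed endpoint
                  data=form_data,
                  headers={'Referer': 'https://github.com/login', 'User-Agent': '......'})
print(r2.status_code)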
2.
import requests

response = requests.get('https://www.baidu.com')
print(response.status_code)         # status code of the response
print(response.url)                 # url of the request
print(response.encoding)            # character encoding
response.encoding = 'utf-8'
print(response.text)                # response body as text
print(response.content)             # response body as a binary stream
print(response.headers)             # headers of the response page
print(response.history)             # previous responses in the redirect chain
print(response.cookies)             # cookies
print(response.cookies.get_dict())  # cookies converted to a dict
print(response.cookies.items())     # cookies as a list of (name, value) pairs
print(response.encoding)
print(response.elapsed)             # time the request took

# stream=True and iter_content()

import requests

url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'
response = requests.get(url, stream=True)
# print(response.content)  # careful: this would read the whole body into memory and defeat stream=True

with open('love_for_GD.mp4', 'wb') as f:
    for content in response.iter_content():
        f.write(content)
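
# iter_content() with no argument yields the body one byte at a time; passing chunk_size
# and wrapping the response in a with block (supported in recent requests versions) is a
# cleaner sketch of the same download, using the same url as above:
import requests

url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'

with requests.get(url, stream=True) as response:
    with open('love_for_GD.mp4', 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):  # read 1 KB at a time
            if chunk:  # skip keep-alive chunks
                f.write(chunk)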

# certificate verification (most sites are https)
import requests
# if the request verifies the ssl certificate and it is invalid, an error is raised and the program stops
response = requests.get('https://www.xiaohuar.com')
print(response.status_code)

# improvement 1: get rid of the error, but a warning is still printed
import requests
response = requests.get('https://www.xiaohuar.com', verify=False)
# the certificate is not verified; a warning is printed and 200 is returned
print(response.status_code)

# improvement 2: get rid of the error and silence the warning as well
import requests
import urllib3
urllib3.disable_warnings()  # silence the warning
response = requests.get('https://www.xiaohuar.com', verify=False)
print(response.status_code)

# improvement 3: attach a certificate
# many sites are https but can still be visited without a certificate; in most cases carrying one is optional
# e.g. Zhihu and Baidu work with or without one
# some sites make it mandatory: only targeted users who have obtained the certificate are allowed to visit
import requests
import urllib3
# urllib3.disable_warnings()  # silence the warning
response = requests.get(
    'https://www.xiaohuar.com',
    # verify=False,
    cert=('/path/server.crt', '/path/key'))
print(response.status_code)

# timeout settings
# timeout comes in two forms: a float or a tuple
# timeout=0.1         # timeout for receiving data
# timeout=(0.1, 0.2)  # 0.1 is the connect timeout, 0.2 is the read timeout

import requests

response = requests.get('https://www.baidu.com',
                        timeout=0.0001)
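
# the 0.0001-second timeout above is guaranteed to fail, so in real code the timeout is
# usually paired with exception handling; a minimal sketch:
import requests
from requests.exceptions import Timeout  # parent of ConnectTimeout and ReadTimeout

try:
    response = requests.get('https://www.baidu.com', timeout=(0.1, 0.2))
    print(response.status_code)
except Timeout as e:
    print('request timed out:', e)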
# authentication settings
'''
When logging in to some sites, a box pops up asking you to enter a username and password
(similar to an alert). At that point you have not reached an html page yet; you must be
authorized before you can get to the html page.

The requests module provides several authentication methods for us, basic auth among them.

The principle: the user is identified by the username and password they enter, a credential
is obtained, and the user is then authorized through a token.
Basic auth:
HTTP Basic Auth was proposed in HTTP/1.0. For each realm, the client authenticates by
providing a username and password; when authentication fails, the server returns 401 for
the client's request.
'''
import requests
# test it by accessing the github api
url = 'https://api.github.com/user'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# test 1: without auth, 401 is returned
response = requests.get(url, headers=HEADERS)
print(response.status_code)  # 401
print(response.text)


# test 2: authenticate with HTTPBasicAuth from requests.auth; the authenticated user's info is returned
from requests.auth import HTTPBasicAuth
response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('tankjam', 'kermit46709394'))
print(response.text)


# test 3: inside requests.get the auth parameter defaults to HTTPBasicAuth, so passing a plain
# tuple returns the authenticated user's info as well
response = requests.get(url, headers=HEADERS, auth=('tankjam', 'kermit46709394'))
print(response.text)
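
# to make the principle concrete: HTTPBasicAuth does nothing more than base64-encode
# "username:password" into an Authorization header; a sketch of the equivalent request
# built by hand, reusing the test credentials above:
import base64
import requests

user, password = 'tankjam', 'kermit46709394'
credentials = base64.b64encode(('%s:%s' % (user, password)).encode()).decode()

headers = dict(HEADERS)  # HEADERS is defined above
headers['Authorization'] = 'Basic ' + credentials

# equivalent to requests.get(url, headers=HEADERS, auth=(user, password))
response = requests.get('https://api.github.com/user', headers=headers)
print(response.status_code)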

3.
# what is selenium?
# originally an automated testing tool; we can use it to drive the browser and perform
# certain predefined operations, e.g. executing js code on a page or skipping past a login screen.

# why use selenium?
# 1. advantage:
#    analyzing the communication flow by hand with the requests module takes a lot of work;
#    selenium can easily skip login authentication, so the crawler is more efficient to write than with requests
# 2. disadvantage:
#    the browser loads css, js, images, videos... so the crawler runs less efficiently than with the requests module

# how to use selenium?
# 1. install the selenium module
# 2. download the browser driver

# the first time selenium is used, you may need to turn off the firewall if an error occurs
from selenium import webdriver
import time

chrome = webdriver.Chrome()
chrome.get('https://www.baidu.com/')
time.sleep(100)
# from selenium import webdriver  # used to drive the browser
# from selenium.webdriver import ActionChains  # can drag images, useful when cracking slider captchas
# from selenium.webdriver.common.by import By  # how to look elements up: By.ID, By.CSS_SELECTOR
# from selenium.webdriver.common.keys import Keys  # keyboard operations
# from selenium.webdriver.support import expected_conditions as EC  # used together with WebDriverWait below
# from selenium.webdriver.support.wait import WebDriverWait  # wait for certain elements of the page to load
# import time
# try:
#     driver = webdriver.Chrome()
#     driver.get('https://www.baidu.com')
#     wait = WebDriverWait(driver, 10)
#     input_tag = wait.until(EC.presence_of_element_located((By.ID, 'kw')))
#     input_tag.send_keys('beauty')
#     input_tag.send_keys(Keys.ENTER)
#     time.sleep(5)
# finally:
#     driver.close()
# from selenium import webdriver  # used to drive the browser
# from selenium.webdriver import ActionChains  # can drag images, useful when cracking slider captchas
# from selenium.webdriver.common.by import By  # how to look elements up: By.ID, By.CSS_SELECTOR
# from selenium.webdriver.common.keys import Keys  # keyboard controls
# from selenium.webdriver.support import expected_conditions as EC  # used together with WebDriverWait below
# from selenium.webdriver.support.wait import WebDriverWait  # wait for certain elements of the page to load
# import time
# driver = webdriver.Chrome()  # open the Chrome browser through its driver
# driver.get('https://www.baidu.com/')  # send a get request to Baidu
# driver.implicitly_wait(5)  # implicit wait: if an element has not loaded yet, wait up to 5 seconds for it;
#                            # set it once before any lookups and it applies to every element you look for
# # selenium's built-in element lookup
# try:
#     # =============== all lookup methods ===================
#     # element finds one tag
#     # elements finds all matching tags
#     # 1. find_element_by_id            find by id
#     # 2. find_element_by_link_text     find by exact link text
#     # 3. find_element_by_partial_link_text
#     # 4. find_element_by_tag_name
#     # 5. find_element_by_class_name
#     # 6. find_element_by_name
#     # 7. find_element_by_css_selector
#     # 8. find_element_by_xpath

#     # 1. find_element_by_id: find an element by its id
#     # input_tag = driver.find_element_by_id('kw')  # find the element whose id is kw
#     # input_tag.send_keys('kermit big baby')  # type "kermit big baby"
#     # input_tag.send_keys(Keys.ENTER)  # press the Enter key

#     # 2. find_element_by_link_text: match by exact link text
#     # login_button = driver.find_element_by_link_text('login')  # find the link whose text is exactly "login"
#     # the match is exact, so a partial text would not be found
#     # login_button.click()  # click the login button

#     # 3. find_element_by_partial_link_text: find a link by partial text match
#     # login = driver.find_element_by_partial_link_text('log')  # match any link whose text contains "log"
#     # login.click()  # click event

#     # 4. find_element_by_tag_name: find by tag name
#     # a = driver.find_element_by_tag_name('a')
#     # print(a)

#     # 5. find_element_by_class_name: find by class name
#     # login_tag = driver.find_element_by_class_name('tang-pass-footerBarULogin')  # find the login button by its class
#     # login_tag.click()  # click the login button

#     # 6. find_element_by_name: find by the name attribute
#     # username = driver.find_element_by_name('userName')
#     # password = driver.find_element_by_name('password')
#     # username.send_keys('15622792660')
#     # password.send_keys('k46709394')
#     # login_button = driver.find_element_by_id('TANGRAM__PSP_10__submit')
#     # login_button.click()

#     # 7. find_element_by_css_selector: find by css selector
#     # search = driver.find_element_by_css_selector('.s_ipt')
#     # search.send_keys('Male')  # type "Male" into the Baidu search box
#     # search.send_keys(Keys.ENTER)  # hit enter

#     # 8. find_element_by_xpath: find by xpath

#     # wait five seconds
#     time.sleep(5)
# finally:
#     driver.close()
# 1. selenium merely simulates the browser's behavior, and the browser needs time to parse
#    the page (executing css and js); some elements may take a while to load, so to guarantee
#    that an element can be found, you must wait

# 2. there are two ways to wait:
#    implicit wait: set before browser.get('xxx'); it applies to all elements
#    explicit wait: set after browser.get('xxx'); it applies only to one specific element
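
# a minimal sketch contrasting the two kinds of wait (assuming chromedriver is installed
# and Baidu's search box still has id 'kw'):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

driver = webdriver.Chrome()
try:
    driver.implicitly_wait(10)  # implicit: every element lookup retries for up to 10 seconds
    driver.get('https://www.baidu.com')
    input_tag = driver.find_element_by_id('kw')  # covered by the implicit wait

    # explicit: wait for one specific condition on one specific element
    wait = WebDriverWait(driver, 10)
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'kw')))
finally:
    driver.close()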
......
......
Crawling Xici (xicidaili) proxies:
'''
Crawling Xici free proxies:
1. visit the Xici free proxy page
2. parse out all the proxies with the re module
3. test each crawled proxy ip against an ip test site
4. if test_ip throws an exception the proxy is invalid; otherwise it is valid
5. use a valid proxy to run the crawl test

<tr class="odd">
  <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
  <td>112.85.131.99</td>
  <td>9999</td>
  <td>
    <a href="/2019-05-09/jiangsu">Nantong</a>
  </td>
  <td class="country">高匿</td>
  <td>HTTPS</td>
  <td class="country">
    <div title="0.144秒" class="bar">
      <div class="bar_inner fast" style="width:88%">

      </div>
    </div>
  </td>
  <td class="country">
    <div title="0.028秒" class="bar">
      <div class="bar_inner fast" style="width:97%">

      </div>
    </div>
  </td>

  <td>6天</td>
  <td>19-05-16 11:20</td>
</tr>
re:
<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>

'''
import requests
import re
import time

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}


def get_index(url):
    time.sleep(1)
    response = requests.get(url, headers=HEADERS)
    return response


def parse_index(text):
    ip_list = re.findall('<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>', text, re.S)
    for ip_port in ip_list:
        ip = ':'.join(ip_port)
        yield ip

def test_ip(ip):
    print('testing ip: %s' % ip)
    try:
        proxies = {
            'https': ip
        }

        # ip test site
        ip_url = 'https://www.ipip.net/'

        # visit the ip test site through the proxy; if 200 comes back, the current ip works
        response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1)

        if response.status_code == 200:
            return ip

    # an exception is thrown if the proxy ip is invalid
    except Exception as e:
        print(e)

# crawl the nba site through the proxy
def spider_nba(good_ip):
    url = 'https://china.nba.com/'

    proxies = {
        'https': good_ip
    }

    response = requests.get(url, headers=HEADERS, proxies=proxies)
    print(response.status_code)
    print(response.text)


if __name__ == '__main__':
    base_url = 'https://www.xicidaili.com/nn/{}'

    for line in range(1, 3677):
        ip_url = base_url.format(line)

        response = get_index(ip_url)

        ip_list = parse_index(response.text)
        for ip in ip_list:
            # print(ip)
            good_ip = test_ip(ip)

            if good_ip:
                # it really is a working proxy, start the test crawl
                spider_nba(good_ip)
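
# a quick sanity check of the parse_index pattern against the sample row from the
# docstring above (a hypothetical snippet, not part of the original script):
sample = '''<tr class="odd">
  <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
  <td>112.85.131.99</td>
  <td>9999</td>
</tr>'''
print(list(parse_index(sample)))  # ['112.85.131.99:9999']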



# official docs: http://docs.python-requests.org/en/master/user/advanced/#proxies

# proxy settings: the request is first sent to the proxy, which then forwards it on our behalf
# (getting an ip banned is a common occurrence)
import requests
proxies = {
    # a proxy with a username and password; they go in front of the @ symbol
    # (a dict cannot hold two 'http' keys, so only one of these forms can be active at a time)
    # 'http': 'http://tank:123@localhost:9527',
    'http': 'http://localhost:9527',
    'https': 'https://localhost:9527',
}
response = requests.get('https://www.12306.cn',
                        proxies=proxies)
print(response.status_code)
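
# one way to confirm the proxy is actually in the path is to ask an ip-echo service which
# address it sees; a sketch using httpbin.org, assuming a proxy really listens on localhost:9527:
import requests

proxies = {'https': 'https://localhost:9527'}
response = requests.get('https://httpbin.org/ip', proxies=proxies)
print(response.json())  # {'origin': ...} should show the proxy's ip, not your own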


# socks proxy support; install it with: pip install requests[socks]
import requests
proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port'
}
response = requests.get('https://www.12306.cn',
                        proxies=proxies)

print(response.status_code)

