Python Crawler Training Day

# The overall crawler process:
# 1. Send a request (request libraries)
#     requests
#     Selenium
# 2. Receive the response data (returned by the server)
# 3. Parse and extract the data (parsing libraries)
#     re (regular expressions)
#     BS4 (beautifulsoup4)
#     XPath
# 4. Save the data (storage)
#     MongoDB

# Crawler framework
#     Scrapy (object-oriented)
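These four steps compose end to end. A minimal sketch of one pass through them, assuming http://example.com as a stand-in URL and a simple title regex (both illustrative, not from the original notes):

import re
import requests

# 1. Send the request (stand-in URL)
response = requests.get('http://example.com')
# 2. The response body is the raw data returned by the server
html = response.text
# 3. Parse and extract with a regular expression
titles = re.findall('<title>(.*?)</title>', html, re.S)
# 4. Save the extracted data
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write(str(titles))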

# requests module usage details
# HTTP protocol:
#     request URL:
#
#     request method:
#         GET
#     request headers:
#         cookies: may be required (login state)
#         User-Agent: proves the client is a browser
# import requests
# response = requests.get(url)
# print(response.status_code)
# print(response.text)
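A runnable version of the commented snippet above; the target URL here is an assumed example, any reachable page will do:

import requests

# Fetch a page and inspect the common Response attributes
response = requests.get('https://www.cnblogs.com')   # assumed example URL
print(response.status_code)                   # e.g. 200
print(response.headers.get('Content-Type'))  # response headers
print(response.text[:200])                    # first 200 characters of the body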

# Accessing Zhihu with parameters carried in the request headers
# import requests
# # request headers dictionary
# headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
# }
# # add the User-Agent to the get request
# response = requests.get(url='https://www.zhihu.com/explore', headers=headers)
# print(response.status_code)
# with open('zhihu.html', 'w', encoding='utf-8') as f:
#     f.write(response.text)
#
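Sites like Zhihu tend to reject requests that do not look like a browser, which is why the User-Agent matters; a quick sketch to observe the difference (the exact status codes depend on the site and may change):

import requests

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

# Without a browser User-Agent the site usually answers with an error status
print(requests.get(url).status_code)                   # often 400 or 403
# With one, the page comes back normally
print(requests.get(url, headers=headers).status_code)  # 200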
# Carrying cookies
# Carrying cookies to get past the cnblogs login check
# request url: ...
# request method: ...
# request headers: ...
import requests

url = 'https://home.cnblogs.com/set/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Cookie': '_ga=GA1.2.736039140.1560498371; _gid=GA1.2.393939017.1560498371; __gads=ID=22042d69ef7c440a:T=1560498371:S=ALNI_MZjbBvbmYulhYR0hD7DDAvxO0aolQ; .Cnblogs.AspNetCore.Cookies=CfDJ8D8Q4oM3DPZMgpKI1MnYlrl5aDQbB7qHF12lN377FcJeizO5Dr4IA_1e7Aq8woZhTxdhKDrbe8NA3gDFqxX5fXn7Op4tblZ3WlqCLIBc9yYqTekcG0jfa9xAH-ur9i-QKr9dvFLlxL1TVSknTiV9iA9nxENBL_WJqnpg8Lo7M5DkfKd0hslNAvuFza9WE3InaBkqJom6ThPvt0z-LN0yviYk5duwVIT8HM1tfOHM2KT_ERkPqKSUTgVRKYGKWrMsG89yDtjKBL1lp0IjzQtzIzK0215tgd3fh0guFL2U994D-ZgHTQthJ0ZZErBUrZ3Z2aHMiJnHXVJLWW3NWAlRuk-R4snWbHpJt8diYsfn-P-q79Ms2SmCAKEg8Vqzf41Qb5lYT_qvGWw0vU3uZwglGwb6KycLuTwKVIXYcrrmgR_F5mFa6MnIoylo1ljVhgRROZgBVQz15SMONXFGTpaX8zI; .CNBlogsCookie=A7F62226302E1403835FB5491EFFE521C6FEB4D05375BC64EC3B87D308A75E4372DBFD7E26B197F93A52D7C4212BA3EF74F4A65A51B7CA92266DAA7F0365C3C7FE6BA6294557EF3FB7CAB11990D3E5723D5FEB51; _gat=1'
}
github_res = requests.get(url, headers=headers)

import requests

# Alternatively, pass the cookies separately via the cookies parameter
# (the analytics cookies _ga/_gid/__gads/_gat are not needed for the login check)
cookies = {
    '.Cnblogs.AspNetCore.Cookies': 'CfDJ8D8Q4oM3DPZMgpKI1MnYlrl5aDQbB7qHF12lN377FcJeizO5Dr4IA_1e7Aq8woZhTxdhKDrbe8NA3gDFqxX5fXn7Op4tblZ3WlqCLIBc9yYqTekcG0jfa9xAH-ur9i-QKr9dvFLlxL1TVSknTiV9iA9nxENBL_WJqnpg8Lo7M5DkfKd0hslNAvuFza9WE3InaBkqJom6ThPvt0z-LN0yviYk5duwVIT8HM1tfOHM2KT_ERkPqKSUTgVRKYGKWrMsG89yDtjKBL1lp0IjzQtzIzK0215tgd3fh0guFL2U994D-ZgHTQthJ0ZZErBUrZ3Z2aHMiJnHXVJLWW3NWAlRuk-R4snWbHpJt8diYsfn-P-q79Ms2SmCAKEg8Vqzf41Qb5lYT_qvGWw0vU3uZwglGwb6KycLuTwKVIXYcrrmgR_F5mFa6MnIoylo1ljVhgRROZgBVQz15SMONXFGTpaX8zI',
    '.CNBlogsCookie': 'A7F62226302E1403835FB5491EFFE521C6FEB4D05375BC64EC3B87D308A75E4372DBFD7E26B197F93A52D7C4212BA3EF74F4A65A51B7CA92266DAA7F0365C3C7FE6BA6294557EF3FB7CAB11990D3E5723D5FEB51',
}
github_res = requests.get(url, headers=headers, cookies=cookies)
# If the login cookie is valid, the account email appears in the settings page
print('[email protected]' in github_res.text)
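Instead of copying cookie strings by hand, a requests.Session stores any Set-Cookie it receives and sends it on later requests automatically; a minimal sketch, reusing the headers dict from above:

import requests

session = requests.Session()
session.headers.update(headers)  # reuse the browser User-Agent

# Cookies set by earlier responses are kept on the session
session.get('https://www.cnblogs.com')
res = session.get('https://home.cnblogs.com/set/')
print(res.status_code)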

Multithreaded crawler
# Note: the Python 3.6 interpreter reports an error at shutdown; use 3.7
import requests
import re
import uuid
from concurrent.futures import ThreadPoolExecutor  # import the thread pool module
pool = ThreadPoolExecutor(50)  # limit the thread pool to 50 threads
# The crawler trilogy

# 1. Send the request
def get_page(url):
    print(f'Starting asynchronous task: {url}')
    response = requests.get(url)
    return response

# 2. Parse the data
# Parse the home page to get the video IDs
def parse_index(res):
    response = res.result()
    id_list = re.findall('<a href="video_(.*?)"', response.text, re.S)
    for m_id in id_list:
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)

# Parse the detail page to get the video url
def parse_detail(res):
    response = res.result()
    movie_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
    pool.submit(get_page, movie_url).add_done_callback(save_movie)

# 3. Save the data
def save_movie(res):
    movie_res = res.result()
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(movie_res.content)
        print(f'Video download finished: {movie_res.url}')
        f.flush()

if __name__ == '__main__':
    url = 'https://www.pearvideo.com/'
    pool.submit(get_page, url).add_done_callback(parse_index)
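The pattern above chains futures: each add_done_callback fires when its task finishes and is handed the Future, whose .result() is the return value of the submitted function. A self-contained sketch of just that mechanism, with stand-in functions:

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(2)

def work(x):
    return x * 2                # stands in for get_page

def on_done(fut):
    print('got', fut.result())  # fut.result() is work()'s return value

pool.submit(work, 21).add_done_callback(on_done)
pool.shutdown(wait=True)        # wait for all tasks before the program exits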

Crawling information about some movies from Douban
'''
Home page:
    https://movie.douban.com/top250
    GET
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

re regular expression:
    # movie detail page url, image link, movie name, movie rating, number of reviewers
    <div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价
'''
import requests
import re

url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
# 1. Send the request to Douban Top250 and fetch the response data
response = requests.get(url, headers=headers)

# print(response.text)

# 2. Extract the data directly with a regular expression
# movie detail page url, image link, movie name, movie rating, number of reviewers
movie_content_list = re.findall(
    # regex pattern
    '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',

    # text to parse
    response.text,

    # matching mode
    re.S)

for movie_content in movie_content_list:
    # unpack the fields of each movie
    detail_url, movie_jpg, name, point, num = movie_content
    data = f'Movie title: {name}, detail page url: {detail_url}, image url: {movie_jpg}, rating: {point}, number of reviewers: {num}\n'
    print(data)

    # 3. Save the data: write the movie info to the file
    with open('douban.txt', 'a', encoding='utf-8') as f:
        f.write(data)
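The notes list MongoDB as the storage option; a sketch of saving the same parsed tuples there instead of a text file, assuming a local mongod and the pymongo package (both assumptions, not part of the original code):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)   # assumed local MongoDB instance
collection = client['douban']['top250']

for detail_url, movie_jpg, name, point, num in movie_content_list:
    collection.insert_one({
        'name': name,
        'detail_url': detail_url,
        'image_url': movie_jpg,
        'rating': point,
        'reviewers': num,
    })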
