Python training, day 4

Today is the fourth day of Python training; it covers some of the skills that are most commonly used.

A. Python crawlers:

1. Crawl all videos from the Pear Video homepage:

 
'''
Crawl Pear Video:
Request url:
    https://www.pearvideo.com/
     
Request method:
    GET
     
Request header:
    user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
'''
 
# import requests
# import re  # regular-expression module
#
# # 1. Send a request to the Pear Video homepage and fetch the response data
# response = requests.get(url='https://www.pearvideo.com/')
# print(response.status_code)
# print(response.text)
#
# # re.findall('regex pattern', 'text to parse', 'match mode')
# # re.S: global mode (match across the entire text, including newlines)
# # .  refers to any character at the current position
# # *  means repeat the match (find all occurrences)
# '''
# <a href="video_1543373"
# <a href="video_(.*?)"  # extracts 1543373
# '''
#
# # 2. Get the video detail-page IDs from the homepage
# res = re.findall('<a href="video_(.*?)"', response.text, re.S)
# print(res)
#
#
# for m_id in res:
#     # Build the detail-page url
#     detail_url = 'https://www.pearvideo.com/video_' + m_id
#     print(detail_url)
 
 
 
import requests
import re # regular module
# uuid.uuid4() generates a (practically) globally unique random string
import uuid
 
# The three steps of a crawler
 
# 1. Send the request
def get_page(url):
    response = requests.get(url)
    return response
 
# 2. Parse the data
# Parse the homepage to get the video detail-page IDs
def parse_index(text):
    res = re.findall('<a href="video_(.*?)"', text, re.S)
    # print(res)
 
    detail_url_list = []
    for m_id in res:
        # Build the detail-page url
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        detail_url_list.append(detail_url)
 
    # print(detail_url_list)
 
    return detail_url_list
 
# Parse the detail page to get the video url
def parse_detail(text):
    '''
        (.*?): extract the content inside the parentheses
        .*?:   match directly (non-greedy)
        <video webkit-playsinline="" playsinline="" x-webkit-airplay="" autoplay="autoplay" src="https://video.pearvideo.com/mp4/adshort/20190613/cont-1566073-14015522_adpkg-ad_hd.mp4" style="width: 100%; height: 100%;"></video>

    Regex: <video.*?src="(.*?)"

    # The regex above is not actually needed; use the one below instead

    Regex: srcUrl="(.*?)"
    '''
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url
 
 
# 3. Save the data
def save_movie(movie_url):
    response = requests.get(movie_url)
    # Write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()
 
if __name__ == '__main__':  # PyCharm shortcut: type main and press Enter
 
    # 1. Send a request to the homepage
    index_res = get_page(url='https://www.pearvideo.com/')
 
    # 2. Parse the homepage to get the detail-page IDs
    detail_url_list = parse_index(index_res.text)
    # print(detail_url_list)
 
    # 3. Send a request to each detail-page url
    for detail_url in detail_url_list:
        detail_res = get_page(url=detail_url)
        print(detail_res.text)
 
        # 4. Parse the detail page to get the video url
        movie_url = parse_detail(detail_res.text)
        print(movie_url)
 
        # 5. Save the video
        save_movie(movie_url)
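
save_movie above reads the whole video into memory via response.content before writing it. For larger files, a minimal streamed variant (a sketch using requests' stream=True and iter_content; the chunk size is an arbitrary choice) writes the file piece by piece:

import uuid
import requests

def save_movie_streamed(movie_url):
    # Stream the body instead of loading it into memory all at once
    response = requests.get(movie_url, stream=True)
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1 MB chunks
            if chunk:
                f.write(chunk)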

2. A higher-performance version of the crawler above:

import requests
import re  # regular-expression module
# uuid.uuid4() generates a (practically) globally unique random string
import uuid
# Import the thread-pool module
from concurrent.futures import ThreadPoolExecutor
# Thread pool limited to 50 threads
pool = ThreadPoolExecutor(50)
 
# The three steps of a crawler
 
# 1. Send the request
def get_page(url):
    print(f'Starting async task: {url}')
    response = requests.get(url)
    return response
 
 
# 2. Parse the data
# Parse the homepage to get the video detail-page IDs
def parse_index(res):
 
    response = res.result()
    # Extract all detail-page IDs from the homepage
    id_list = re.findall('<a href="video_(.*?)"', response.text, re.S)
    # print(res)
 
    # Loop over the ID list
    for m_id in id_list:
        # Build the detail-page url
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        # Submit the detail-page url to get_page asynchronously
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)
 
 
# Parse the detail page to get the video url
def parse_detail(res):
    response = res.result()
    movie_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
    # Asynchronously submit the video url to get_page; its result is then passed to save_movie
    pool.submit(get_page, movie_url).add_done_callback(save_movie)
 
 
# 3. Save the data
def save_movie(res):
 
    movie_res = res.result()
 
    # Write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(movie_res.content)
        print(f'Finished downloading video: {movie_res.url}')
        f.flush()
 
 
if __name__ == '__main__':  # PyCharm shortcut: type main and press Enter
 
    # Asynchronously submit the homepage request to get_page; the result is passed on to parse_index
    url = 'https://www.pearvideo.com/'
    pool.submit(get_page, url).add_done_callback(parse_index)
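
For comparison, the same fan-out can be written without callback chaining by collecting the futures and iterating them with as_completed. A minimal sketch that reuses the homepage regex above and only prints the detail-page status codes:

from concurrent.futures import ThreadPoolExecutor, as_completed
import re
import requests

def fetch(url):
    return requests.get(url)

with ThreadPoolExecutor(50) as pool2:
    index_page = fetch('https://www.pearvideo.com/')
    id_list = re.findall('<a href="video_(.*?)"', index_page.text, re.S)
    # Submit one request per detail page and handle results as they finish
    futures = [pool2.submit(fetch, 'https://www.pearvideo.com/video_' + m_id) for m_id in id_list]
    for future in as_completed(futures):
        print(future.result().status_code)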

3. Using requests in detail:

# GET requests explained
'''
User-Agent
# Access Zhihu Explore
Request url:
    https://www.zhihu.com/explore

Request method:
    GET

Request header:
    user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

    cookies
'''
 
# Access Zhihu without headers
# import requests
# response = requests.get(url='https://www.zhihu.com/explore')
# print(response.status_code)  # 400
# print(response.text)  # returns an error page
 
 
# Access Zhihu carrying the request-header parameters:
import requests
 
# Request header dictionary
# headers = {
#     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
# }
# Add the user-agent to the GET request
# response = requests.get(url='https://www.zhihu.com/explore', headers=headers)
# print(response.status_code)  # 200
# # print(response.text)
# with open('zhihu.html', 'w', encoding='utf-8') as f:
#     f.write(response.text)
 
 
'''
Request parameter: params
URLs for a Baidu search for Anhui Polytechnic University (安徽工程大学):
https://www.baidu.com/s?wd=安徽工程大学&pn=10
https://www.baidu.com/s?wd=安徽工程大学&pn=20
'''
from urllib.parse import urlencode
# url = 'https://www.baidu.com/s?wd=%E8%94%A1%E5%BE%90%E5%9D%A4'
# url = 'https://www.baidu.com/s?' + urlencode({"wd": "蔡徐坤"})
url = 'https://www.baidu.com/s?'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
# print(url)
# Pass the search parameters via params in the GET request
# response = requests.get(url, headers=headers, params={"wd": "安徽工程大学"})
response = requests.get(url, headers=headers, params={"wd": "安徽工程大学", "pn": "20"})
# print(response.text)
with open('gongcheng2.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
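
Passing params this way is equivalent to percent-encoding the query string yourself. A minimal sketch showing that urlencode builds the same wd/pn query as the commented-out line above (the printed URL is only for illustration):

from urllib.parse import urlencode

# Build the same query string that params={"wd": ..., "pn": ...} would generate
query = urlencode({"wd": "安徽工程大学", "pn": "20"})
manual_url = 'https://www.baidu.com/s?' + query
print(manual_url)  # the Chinese characters appear percent-encoded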
 
 
'''
Carrying cookies
Bypass GitHub's login check by carrying the login-session cookies

Request url:
    https://github.com/settings/emails

Request method:
    GET

Request header:
    User-Agent
     
    Cookie: has_recent_activity=1; _ga=GA1.2.1416117396.1560496852; _gat=1; tz=Asia%2FShanghai; _octo=GH1.1.1728573677.1560496856; _device_id=1cb66c9a9599576a3b46df2455810999; user_session=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; __Host-user_session_same_site=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; logged_in=yes; dotcom_user=TankJam; _gh_sess=ZS83eUYyVkpCWUZab21lN29aRHJTUzgvWjRjc2NCL1ZaMHRsdGdJeVFQM20zRDdPblJ1cnZPRFJjclZKNkcrNXVKbTRmZ3pzZzRxRFExcUozQWV4ZG9kOUQzZzMwMzA2RGx5V2dSaTMwaEZ2ZDlHQ0NzTTBtdGtlT2tVajg0c0hYRk5IOU5FelYxanY4T1UvVS9uV0YzWmF0a083MVVYVGlOSy9Edkt0aXhQTmpYRnVqdFAwSFZHVHZQL0ZyQyt0ZjROajZBclY4WmlGQnNBNTJpeEttb3RjVG1mM0JESFhJRXF5M2IwSlpHb1Mzekc5M0d3OFVIdGpJaHg3azk2aStEcUhPaGpEd2RyMDN3K2pETmZQQ1FtNGNzYnVNckR4aWtibkxBRC8vaGM9LS1zTXlDSmFnQkFkWjFjanJxNlhCdnRRPT0%3D--04f6f3172b5d01244670fc8980c2591d83864f60
     
'''
import requests
 
# Request url
url = 'https://github.com/settings/emails'
 
# Request header
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
    # Cookies spliced directly into the request header
    # 'Cookie': 'has_recent_activity=1; _ga=GA1.2.1416117396.1560496852; _gat=1; tz=Asia%2FShanghai; _octo=GH1.1.1728573677.1560496856; _device_id=1cb66c9a9599576a3b46df2455810999; user_session=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; __Host-user_session_same_site=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; logged_in=yes; dotcom_user=TankJam;_gh_sess ZS83eUYyVkpCWUZab21lN29aRHJTUzgvWjRjc2NCL1ZaMHRsdGdJeVFQM20zRDdPblJ1cnZPRFJjclZKNkcrNXVKbTRmZ3pzZzRxRFExcUozQWV4ZG9kOUQzZzMwMzA2RGx5V2dSaTMwaEZ2ZDlHQ0NzTTBtdGtlT2tVajg0c0hYRk5IOU5FelYxanY4T1UvVS9uV0YzWmF0a083MVVYVGlOSy9Edkt0aXhQTmpYRnVqdFAwSFZHVHZQL0ZyQyt0ZjROajZBclY4WmlGQnNBNTJpeEttb3RjVG1mM0JESFhJRXF5M2IwSlpHb1Mzekc5M0d3OFVIdGpJaHg3azk2aStEcUhPaGpEd2RyMDN3K2pETmZQQ1FtNGNzYnVNckR4aWtibkxBRC8vaGM9LS1zTXlDSmFnQkFkWjFjanJxNlhCdnRRPT0% = 3D - 04f6f3172b5d01244670fc8980c2591d83864f60 '
}
# github_res = requests.get(url, headers=headers)
 
import requests
cookies = {
    'Cookie': 'has_recent_activity=1; _ga=GA1.2.1416117396.1560496852; _gat=1; tz=Asia%2FShanghai; _octo=GH1.1.1728573677.1560496856; _device_id=1cb66c9a9599576a3b46df2455810999; user_session=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; __Host-user_session_same_site=1V8n9QfKpbgB-DhS4A7l3Tb3jryARZZ02NDdut3J2hy-8scm; logged_in=yes; dotcom_user=TankJam;_gh_sess ZS83eUYyVkpCWUZab21lN29aRHJTUzgvWjRjc2NCL1ZaMHRsdGdJeVFQM20zRDdPblJ1cnZPRFJjclZKNkcrNXVKbTRmZ3pzZzRxRFExcUozQWV4ZG9kOUQzZzMwMzA2RGx5V2dSaTMwaEZ2ZDlHQ0NzTTBtdGtlT2tVajg0c0hYRk5IOU5FelYxanY4T1UvVS9uV0YzWmF0a083MVVYVGlOSy9Edkt0aXhQTmpYRnVqdFAwSFZHVHZQL0ZyQyt0ZjROajZBclY4WmlGQnNBNTJpeEttb3RjVG1mM0JESFhJRXF5M2IwSlpHb1Mzekc5M0d3OFVIdGpJaHg3azk2aStEcUhPaGpEd2RyMDN3K2pETmZQQ1FtNGNzYnVNckR4aWtibkxBRC8vaGM9LS1zTXlDSmFnQkFkWjFjanJxNlhCdnRRPT0% = 3D - 04f6f3172b5d01244670fc8980c2591d83864f60 '
}
 
github_res = requests.get(url, headers=headers, cookies=cookies)
 
print('15622792660' in github_res.text)
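
As an alternative to passing the cookies dict on every call, requests can keep cookies (and headers) on a Session object so they are sent automatically with each request. A minimal sketch, with placeholder cookie values rather than the real ones above:

import requests

session = requests.Session()
session.headers.update({
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
})
# Placeholder values; a real run would copy the cookie values from the browser
session.cookies.update({'user_session': '<user_session cookie>', 'logged_in': 'yes'})

session_res = session.get('https://github.com/settings/emails')
print(session_res.status_code)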

4. Example: crawling Douban Top 250 movie information:

'''
Homepage:
    https://movie.douban.com/top250
    GET
    User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
     
Regex:
    # movie detail-page url, image link, movie name, rating, number of reviewers
    <div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价
'''
import requests
import re
url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
# 1. Send a request to the Douban Top 250 page and fetch the response data
response = requests.get(url, headers=headers)
 
# print(response.text)
 
# 2. Parse and extract the data with a regex
# Movie detail-page url, image link, movie name, rating, number of reviewers
movie_content_list = re.findall(
    # Regex pattern
    '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
 
    # Text to parse
    response.text,
 
    # Match mode
    re.S)
 
for movie_content in movie_content_list:
    # Unpack each movie tuple
    detail_url, movie_jpg, name, point, num = movie_content
    data = f'Movie name: {name}, detail-page url: {detail_url}, image url: {movie_jpg}, rating: {point}, number of reviewers: {num}\n'
    print(data)
 
    # 3. Save the data: write each movie's information to a file
    with open('douban.txt', 'a', encoding='utf-8') as f:
        f.write(data)
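
The code above only fetches the first page (25 movies). Assuming the listing paginates through a start offset parameter (start=0, 25, ..., 225), a sketch that walks all ten pages with the same regex would look like this:

import re
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
pattern = ('<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>'
           '.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价')

# Assumption: each page holds 25 entries and is selected by the 'start' parameter
for start in range(0, 250, 25):
    page_res = requests.get('https://movie.douban.com/top250', headers=headers, params={'start': start})
    for detail_url, movie_jpg, name, point, num in re.findall(pattern, page_res.text, re.S):
        print(f'Movie name: {name}, rating: {point}, reviewers: {num}')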

Source: www.cnblogs.com/jacob1998/p/11025643.html