Getting to Know Python: Day 03

# 1. The requests request library
# Install and use:
# 1. Open cmd
# 2. Run: pip3 install requests
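# A quick sanity check (my addition, not in the original notes): if the
# install succeeded, importing the library and printing its version works.
# import requests
# print(requests.__version__)  # any version string means the install is OK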

# import requests  # import the requests library
#
# # Send a request to the Baidu home page and get a response object
# response = requests.get(url='https://www.baidu.com/')
#
# # Set the character encoding to utf-8
# response.encoding = 'utf-8'
#
# # Print the response text
# print(response.text)
#
# # Write the response text to a local file
# with open('baidu.html', 'w', encoding='utf-8') as f:
#     f.write(response.text)


# Video download
# import requests  # import the requests library
# response = requests.get('https://video.pearvideo.com/mp4/third/20190612/cont-1565462-11308777-161601-hd.mp4')
# print(response.content)
# with open('video.mp4', 'wb') as f:
#     f.write(response.content)
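# For big files, response.content loads the whole body into memory first. A
# hedged alternative (my sketch, using requests' documented stream=True mode)
# writes the video to disk in chunks instead:
# import requests
#
# response = requests.get(
#     'https://video.pearvideo.com/mp4/third/20190612/cont-1565462-11308777-161601-hd.mp4',
#     stream=True)  # defer downloading the body
# with open('video.mp4', 'wb') as f:
#     for chunk in response.iter_content(chunk_size=1024 * 64):  # 64 KB chunks
#         f.write(chunk)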


'''
1. Send a request to the home page,
   then parse out the ids of all the videos.
   Example:
       video_1570302
   Use re.findall().

2. Build each video's detail-page URL by splicing the id onto the base address.
'''
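# A minimal re.findall() demo (my example; the HTML snippet is made up to
# mirror the pear-video link format, it is not copied from the real page):
# import re
#
# html = '<a href="video_1570302"></a> <a href="video_1570107"></a>'
# ids = re.findall('<a href="video_(.*?)"', html, re.S)
# print(ids)  # ['1570302', '1570107']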
# import requests
# import re  # regular expressions, used to parse the text data
#
# # 1. Send a request to the home page
# response = requests.get('https://www.pearvideo.com/')
# print(response.content)
#
# # Regex-match all the video ids
# # Parameter 1: the regex pattern
# # Parameter 2: the text to parse
# # Parameter 3: the matching mode
# res_list = re.findall('<a href="video_(.*?)"', response.text, re.S)
# print(res_list)
#
# # 2. Splice together each video's detail-page URL
# for v_id in res_list:
#     detail_url = 'https://www.pearvideo.com/video_' + v_id
#     print(detail_url)
#
#     # Send a request to each detail page to get the video resource
#     response = requests.get(url=detail_url)
#     print(response.text)
#
#     # Parse and extract the video URL from the detail page
#     video_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
#     print(video_url)
#
#     # Video title
#     video_name = re.findall('<h1 class="video_tt">(.*?)</h1>', response.text, re.S)[0]
#     print(video_name)
#
#     # Send a request to the video URL for the binary video stream
#     v_response = requests.get(video_url)
#
#     with open('%s.mp4' % video_name, 'wb') as f:
#         f.write(v_response.content)
#         print(video_name, 'video crawled successfully')


# 3. Packet capture
#    Open the browser's developer tools (Inspect) -> select the Network tab
#
# 1. Request URL
# 2. Request method
#    GET:
#       sends the request directly, e.g.
#       https://www.cnblogs.com/kermitjam/articles/9697851.html
#    POST:
#       must carry user data to the target address, e.g.
#       https://www.cnblogs.com/login
#       {'user': 'qing',
#        'pwd': '123'}
# 3. Response status codes:
#    2xx: success
#    3xx: redirection
#    4xx: client error (e.g. resource not found)
#    5xx: server error
# 4. Request headers:
#    User-Agent: the user agent (proves the request was sent by a real browser on a real machine)
#    Cookie: logged-in user info (proves you are a user of the target site)
#    Referer: the URL of the previous page (proves you came from within the target site)
# 5. Request body:
#    only POST requests carry a request body, e.g.
#       Form Data
#       {'user': 'qing',
#        'pwd': '123'}
#
# (A sketch of sending these headers and a form body with requests follows below.)
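# A hedged sketch of how these pieces look in requests code (my example; the
# header values and the login endpoint are illustrative, not taken from a real
# capture, and cnblogs' actual login flow may differ):
# import requests
#
# # GET carrying the three request headers described above
# headers = {
#     'User-Agent': 'Mozilla/5.0',            # pretend to be a browser
#     'Cookie': 'session=...',                # placeholder for a real login cookie
#     'Referer': 'https://www.cnblogs.com/',  # pretend we navigated from the site
# }
# response = requests.get('https://www.cnblogs.com/kermitjam/articles/9697851.html',
#                         headers=headers)
# print(response.status_code)  # 2xx on success, per the table above
#
# # POST carrying a form body (requests sends a dict passed via data= as Form Data)
# response = requests.post('https://www.cnblogs.com/login',
#                          data={'user': 'qing', 'pwd': '123'})
# print(response.status_code)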


# Crawler steps
# 1. Send the request
import requests
import re


def get_page(base_url):
    response = requests.get(base_url)
    return response


# 2. Parse the text
def parse_index(text):
    res = re.findall(
        '<div class="item">.*?<em class="">(.*?)</em>.*?<a href="(.*?)">'
        '.*?<span class="title">(.*?)</span>.*?导演:(.*?)</p>'
        '.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价</span>'
        '.*?<span class="inq">(.*?)</span>',
        text, re.S)
    return res
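# A quick check of parse_index() on a made-up snippet (my example; the markup
# below only mirrors douban's structure, it is not the real page):
# sample = ('<div class="item"><em class="">1</em>'
#           '<a href="https://movie.douban.com/subject/1292052/">'
#           '<span class="title">肖申克的救赎</span></a>'
#           '<p class="">导演:Frank Darabont</p>'
#           '<span class="rating_num" property="v:average">9.6</span>'
#           '<span>100人评价</span><span class="inq">Hope.</span></div>')
# print(parse_index(sample))
# # -> [('1', 'https://movie.douban.com/subject/1292052/', '肖申克的救赎',
# #      'Frank Darabont', '9.6', '100', 'Hope.')]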


# 3. Save the data
def save_data(data):
    with open('douban.txt', 'a', encoding='utf-8') as f:
        f.write(data)


# main + Enter key (editor shortcut that expands to this block)
if __name__ == '__main__':
    # num = 10
    # base_url = 'https://movie.douban.com/top250?start={}&filter='.format(num)
    num = 0
    for line in range(10):
        base_url = f'https://movie.douban.com/top250?start={num}&filter='
        num += 25
        print(base_url)

        # 1. Send the request (call the function)
        response = get_page(base_url)
        # 2. Parse the text
        movie_list = parse_index(response.text)
        # 3. Save the data
        # Data format:
        for movie in movie_list:
            # Unpacking assignment:
            # rank, movie URL, movie name, director/cast/genre, rating, number of reviewers, synopsis
            v_top, v_url, v_name, v_daoyan, v_point, v_num, v_desc = movie
            # v_top = movie[0]
            movie_content = f'''
            Movie rank: {v_top}
            Movie URL: {v_url}
            Movie name: {v_name}
            Director and cast: {v_daoyan}
            Movie rating: {v_point}
            Number of reviewers: {v_num}
            Movie synopsis: {v_desc}
            \n
            '''
            print(movie_content)
            # Save the data
            save_data(movie_content)
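# Note (my addition, not in the original): douban may answer a bare
# requests.get() with an empty body or an error status when no browser
# User-Agent header is sent. If the output above comes back empty, a hedged
# fix is to pass headers inside get_page(), e.g.:
# response = requests.get(base_url, headers={'User-Agent': 'Mozilla/5.0'})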

 
