# requests library
# Installation:
#   1. open cmd
#   2. pip3 install requests
# import requests  # import the requests library

# Send a request to the Baidu home page and get the response object
# response = requests.get(url='https://www.baidu.com/')
# # set the character encoding to utf-8
# response.encoding = 'utf-8'
# # print the response text
# print(response.text)
# # write the response text to a local file
# with open('baidu.html', 'w', encoding='utf-8') as f:
#     f.write(response.text)

# Video download example
# import requests
# response = requests.get('https://video.pearvideo.com/mp4/third/20190612/cont-1565462-11308777-161601-hd.mp4')
# print(response.content)
# with open('video.mp4', 'wb') as f:
#     f.write(response.content)

'''
1. Send a request to the home page and parse out all video ids
   (e.g. video_1570302) with re.findall()
2. Visit each video detail page (url) and download the video
'''
# import requests
# import re  # regular expressions, used to parse text data

# # 1. send a request to the home page
# response = requests.get('https://www.pearvideo.com/')
# print(response.content)

# # regex-match all the video ids
# # param 1: regex pattern
# # param 2: text to parse
# # param 3: matching mode
# res_list = re.findall('<a href="video_(.*?)"', response.text, re.S)
# print(res_list)

# # 2. build each video detail page url
# for v_id in res_list:
#     detail_url = 'https://www.pearvideo.com/video_' + v_id
#     print(detail_url)
#
#     # send a request to each detail page to get the video resource
#     response = requests.get(url=detail_url)
#     print(response.text)
#
#     # parse the video url out of the detail page
#     video_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
#     print(video_url)
#
#     # video title
#     video_name = re.findall('<h1 class="video-tt">(.*?)</h1>', response.text, re.S)[0]
#     print(video_name)
#
#     # send a request to the video url for the binary video stream
#     v_response = requests.get(video_url)
#     with open('%s.mp4' % video_name, 'wb') as f:
#         f.write(v_response.content)
#     print(video_name, 'video crawled successfully')

# 3. Packet-capture notes:
#    open browser developer tools (Inspect) -> Network tab
#    1. request URL
#    2. request method
#       GET: send the request directly
#            e.g. https://www.cnblogs.com/kermitjam/articles/9697851.html
#       POST: carries user data to the target address
#            e.g. https://www.cnblogs.com/login
#            {'user': 'qing', 'pwd': '123'}
#    3. response status codes:
#       2xx: success   3xx: redirect   4xx: resource not found   5xx: server error
#    4. request headers:
#       User-Agent: user agent (identifies the browser/device sending the request)
#       Cookie: logged-in user info (proves you are a user of the target site)
#       Referer: the last URL visited (proves you navigated from within the site)
#    5. request body:
#       only POST requests have a body, e.g. form data
#       {'user': 'qing', 'pwd': '123'}

# Crawler steps:
import requests
import re  # regex, used to parse the page text


# 1. send a request
def get_page(base_url):
    """Send a GET request to *base_url* and return the Response object."""
    response = requests.get(base_url)
    return response


# 2. parse the text
def parse_index(text):
    """Extract movie records from a douban top250 page.

    Returns a list of 7-tuples:
    (rank, url, name, director/cast, rating, vote count, synopsis)
    — exactly the 7 fields unpacked by the caller. The original regex had
    only partial capture groups (missing rank/name groups and a closing
    quote after href), which made the 7-value unpack fail.
    """
    res = re.findall(
        '<div class="item">.*?<em class="">(.*?)</em>.*?'
        '<a href="(.*?)">.*?<span class="title">(.*?)</span>.*?'
        '导演:(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?'
        '<span>(.*?)人评价</span>.*?<span class="inq">(.*?)</span>',
        text,
        re.S,  # DOTALL: the fields span multiple lines of HTML
    )
    return res


# 3. save the data
def save_data(data):
    """Append *data* to douban.txt (utf-8)."""
    with open('douban.txt', 'a', encoding='utf-8') as f:
        f.write(data)


if __name__ == '__main__':
    num = 0
    for _ in range(10):  # 10 pages, 25 movies per page
        # f-string fix: the original used a plain string literal, so
        # '{num}' was sent verbatim and every page was the same.
        base_url = f'https://movie.douban.com/top250?start={num}&filter='
        num += 25
        print(base_url)

        # 1. send the request
        response = get_page(base_url)

        # 2. parse the text
        movie_list = parse_index(response.text)

        # 3. save the data
        for movie in movie_list:
            # unpack: rank, url, name, director/cast, rating, votes, synopsis
            v_top, v_url, v_name, v_daoyan, v_point, v_num, v_desc = movie
            movie_content = f'''
movie rank: {v_top}
movie url: {v_url}
movie name: {v_name}
director/cast: {v_daoyan}
movie rating: {v_point}
vote count: {v_num}
movie synopsis: {v_desc}
'''
            print(movie_content)
            save_data(movie_content)