Crawling Movie Heaven (dytt8.net) — a two-page crawl

# Address
  Movie Heaven — "2019 new films" listing pages
# target
  movie name, download link

# Analysis
********* Page one (listing page) to crawl *********  1. movie name   2. movie detail link
********* Page two (detail page) to crawl *********  1. download link
   



    

 

Implementation steps

1. Confirm that the response content actually contains the data we need to scrape

 

2. Find the URL pattern

Page 1: https://www.dytt8.net/html/gndy/dyzz/list_23_1.html
Page 2: https://www.dytt8.net/html/gndy/dyzz/list_23_2.html
Page n: https://www.dytt8.net/html/gndy/dyzz/list_23_n.html

 

3, write regular expressions

1. Page-one (listing page) regular expression
    <table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">(.*?)</a>.*?</table>
2. Page-two (detail page) regular expression
    <td style="WORD-WRAP.*?>.*?>(.*?)</a>

 

4, code implementation

from urllib import request
import re
from useragents import ua_list
import time
import random

class FilmSkySpider(object):
    """Scrape movie names and download links from dytt8.net (Movie Heaven).

    Page one (the listing page) yields (detail-link, name) pairs; page two
    (the detail page) yields the download link.
    """

    def __init__(self):
        # Listing-page URL template; {} is the 1-based page number
        self.url = ('https://www.dytt8.net/html/gndy'
                    '/dyzz/list_23_{}.html')

    def get_html(self, url):
        """Fetch *url* and return its HTML decoded as text."""
        headers = {
            # Rotate the User-Agent on every request to look less like a bot
            'User-Agent': random.choice(ua_list),
        }
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        # The site's page source declares charset=gb2312; some bytes may not
        # decode cleanly, so ignore undecodable characters instead of raising.
        html = res.read().decode('gb2312', 'ignore')
        return html

    def re_func(self, re_bds, html):
        """Apply regex *re_bds* to *html* with re.S and return all matches."""
        pattern = re.compile(re_bds, re.S)
        return pattern.findall(html)

    def parse_page(self, html):
        """Parse one listing page: extract films, then resolve each download link.

        *html* is the response content of a listing page.
        """
        re_bds = r'<table width="100%".*?<td width="5%".*?<a href="(.*?)".*?ulink">(.*?)</a>.*?</table>'
        # one_page_list: [('/html/xxx', '幸福猎人'), ...]
        one_page_list = self.re_func(re_bds, html)
        for film in one_page_list:
            # Fresh dict per film so each printed record is independent
            # (the original reused one dict, mutating it every iteration).
            item = {}
            item['name'] = film[1].strip()
            # film[0] is a site-relative path; join with no embedded spaces
            link = 'https://www.dytt8.net' + film[0]
            item['download'] = self.parse_two_page(link)
            # Be polite: random pause after each detail-page fetch
            time.sleep(random.uniform(1, 3))
            print(item)

    def parse_two_page(self, link):
        """Fetch a detail page and return its download link ('' if none found)."""
        html = self.get_html(link)
        re_bds = r'<td style="WORD-WRAP.*?>.*?>(.*?)</a>'
        # two_page_list: ['ftp://xxxx.mkv']
        two_page_list = self.re_func(re_bds, html)
        # Guard against pages where the pattern matches nothing
        return two_page_list[0].strip() if two_page_list else ''

    def main(self):
        """Crawl listing pages 1 through 10."""
        for page in range(1, 11):
            url = self.url.format(page)
            html = self.get_html(url)
            self.parse_page(html)
            # Random pause between listing pages
            time.sleep(random.uniform(1, 3))

if __name__ == '__main__':
    # Entry point: build the spider and start crawling.
    FilmSkySpider().main()

Guess you like

Origin www.cnblogs.com/hooo-1102/p/12159824.html