Scraping the Maoyan movie rankings

A bit frustrating, not much to say. This took a whole afternoon and most of it was copied; the regular expression was really hard to get right, and I'm still not very familiar with the other third-party libraries.

Still pretty rough. Keep at it.

import json
import re
import time

import requests
from requests.exceptions import RequestException


def get_one_page(url):
    """Fetch one page of the Maoyan TOP100 board; return its HTML, or None on failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Pull rank, poster, title, actors, release time and score out of one page with a regex."""
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'rank': item[0],
            'image': item[1],
            'title': item[2],
            'actors': item[3].strip()[3:],        # drop the "主演：" prefix
            'release_time': item[4].strip()[5:],  # drop the "上映时间：" prefix
            'score': item[5] + item[6],           # integer part + fraction part
        }


def write_to_file(content):
    """Append one movie dict to the output file as a JSON line."""
    with open(r'F:\猫眼.txt', 'a+', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == "__main__":
    # The board lists 100 movies, 10 per page; pause between requests to be polite.
    for i in range(10):
        main(offset=i * 10)
        time.sleep(1)
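As a small follow-up (not part of the original post): write_to_file() produces one JSON object per line, so the results can be loaded back into Python for further processing. A minimal sketch, assuming the same output path as the script above:

import json

# Read the JSON-lines file written by write_to_file() back into a list of dicts.
with open(r'F:\猫眼.txt', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f if line.strip()]

print(len(movies))   # 100 entries if all ten pages were fetched
print(movies[0])     # the top-ranked movie, e.g. {'rank': '1', ...}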

The main goal here was to get familiar with using regular expressions; a quick demonstration of the pattern follows below.
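To make the pattern more concrete, here is a minimal sketch (not from the original post) that runs the same regex against a small hand-written snippet imitating Maoyan's <dd> markup; the sample HTML and its values are made up for illustration only.

import re

# Same pattern as in parse_one_page above.
pattern = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)

# Hand-written sample imitating one <dd> entry on the board page (illustrative only).
sample_html = '''
<dd>
  <i class="board-index board-index-1">1</i>
  <img data-src="http://example.com/poster.jpg" alt="">
  <p class="name"><a href="/films/1203">霸王别姬</a></p>
  <p class="star">
      主演：张国荣,张丰毅,巩俐
  </p>
  <p class="releasetime">上映时间：1993-01-01</p>
  <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
</dd>
'''

for rank, image, title, star, release, integer, fraction in re.findall(pattern, sample_html):
    # strip()[3:] drops the "主演：" prefix, strip()[5:] drops the "上映时间：" prefix.
    print(rank, title, star.strip()[3:], release.strip()[5:], integer + fraction)
# Prints: 1 霸王别姬 张国荣,张丰毅,巩俐 1993-01-01 9.5

With re.S the dot also matches newlines, which is what lets one pattern span the whole multi-line <dd> block; the lazy .*? quantifiers keep each group from swallowing the rest of the page.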

Source code:

https://github.com/Python3WebSpider/MaoYan/blob/master/spider.py

Reposted from www.cnblogs.com/kangdong/p/9052956.html