import json
import re
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10

# Matches one <dd>...</dd> entry of the board page. Capture groups, in order:
# board index, title attribute, data-src attribute, "star" paragraph text,
# "releasetime" paragraph text, "integer" score part, "fraction" score part.
# re.S lets ".*?" span the line breaks inside an entry.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?title="(.*?)".*?data-src="(.*?)".*?'
    r'star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>'
    r'.*?</dd>',
    re.S,
)


def page_one_html(url):
    """Download one page and return its HTML text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails with a
        ``RequestException`` (including a timeout).
    """
    try:
        # Without a timeout a stalled connection would block the worker forever.
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page_html(content):
    """Yield one dict per ``<dd>`` movie entry found in *content*.

    Args:
        content: HTML text of a board page. ``None`` or an empty string
            yields nothing instead of raising.

    Yields:
        Dicts with the string-valued keys ``index``, ``title``, ``image``,
        ``actor``, ``createTime`` and ``score``.
    """
    if not content:
        return
    for match in _MOVIE_PATTERN.finditer(content):
        index, title, image, star, release, integer, fraction = match.groups()
        yield {
            "index": index,
            "title": title,
            "image": image,
            # The slices drop a fixed-width leading label from the stripped
            # text (presumably "主演：" / "上映时间：" -- confirm against the page).
            "actor": star.strip()[3:],
            "createTime": release.strip()[4:],
            # The score is split across two tags, e.g. "9." and "5".
            "score": integer + fraction,
        }


def write_text(item):
    """Append *item* to ``result.txt`` as a single JSON line (UTF-8).

    Args:
        item: JSON-serialisable object, normally a dict produced by
            ``parse_page_html``.
    """
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    # The ``with`` block closes the file; no explicit close() is needed.
    with open("result.txt", "a", encoding="utf-8") as f:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")


def main(offset):
    """Fetch the board page at *offset* and append its entries to the result file.

    Args:
        offset: Value of the ``offset`` query parameter of the board URL.
    """
    url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = page_one_html(url)
    if html is None:
        # Download failed or returned a non-200 status: nothing to parse.
        return
    for item in parse_page_html(html):
        write_text(item)


if __name__ == "__main__":
    # Pages are fetched by separate worker processes that all append to the
    # same file, so the order of records across pages is not guaranteed.
    with Pool() as pool:
        pool.map(main, range(0, 100, 10))
from requests.exceptions import RequestException  # exception handling is important
import re
import json
from multiprocessing import Pool  # process pool (not a thread pool); makes the downloads much faster