python爬虫——requests抓取某电影网站top100

今天闲得没事,学习了一下爬虫方面的知识,然后用requests库实现了抓取猫眼网站Top 100电影,还是挺有意思的。

最近用到python比较多,也算是加强了python的运用吧 :-)

import  requests
from requests.exceptions import RequestException
import re
import json
from multiprocessing import pool  # 引入进程池,多进程抓取

def get_one_page(url, timeout=10):
    """Fetch a page and return its HTML text, or None on failure.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise None (non-200 status, or a ``RequestException`` such as a
        connection error or timeout).
    """
    # Send a browser-like User-Agent header with the request.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    # Only a 200 response is treated as success.
    if response.status_code == 200:
        return response.text
    return None

# match movie info in each HTML page
def parse_source_page(html):
    """Extract movie records from the HTML of one board page.

    Args:
        html: HTML text of the page, or None/empty when the download failed.

    Yields:
        One dict per matched ``<dd>`` entry with the keys ``index``,
        ``image_url``, ``movie_name``, ``stars``, ``show_time`` and ``rate``;
        every value is a string. Nothing is yielded for None or empty input.
    """
    if not html:
        # Nothing to parse (e.g. the fetch failed); re.findall(None) would
        # raise TypeError.
        return
    # Raw strings keep "\d" from being read as a string escape.
    # re.S lets "." match newlines, so one entry may span several lines.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for item in pattern.findall(html):
        index, image_url, movie_name, stars, show_time, integer, fraction = item
        yield {
            "index": index,
            "image_url": image_url,
            "movie_name": movie_name,
            # Trim whitespace, then drop the first 3 characters -- presumably a
            # fixed label prefix on the page; verify against the live markup.
            "stars": stars.strip()[3:],
            # Same idea with a 5-character prefix -- verify against the page.
            "show_time": show_time.strip()[5:],
            # The score is split into integer and fraction parts in the markup.
            "rate": integer + fraction,
        }

# content是包含n个dict类型的列表,需要用json.dumps()转为str类型
def save_info_to_file(content, filename):
    """Append each item of ``content`` to ``filename`` as one JSON line.

    Args:
        content: Iterable of JSON-serializable objects (e.g. a list of dicts).
        filename: Path of the output file; opened in append mode as UTF-8.
    """
    # "with" closes the file even if serialization or a write fails.
    with open(filename, "a", encoding="utf-8") as file:
        for item in content:
            # ensure_ascii=False writes non-ASCII text as-is rather than as
            # \uXXXX escapes.
            file.write(json.dumps(item, ensure_ascii=False) + "\n")

def main(offset):
    """Download one board page, print its movies and append them to a file.

    Args:
        offset: Value placed in the ``offset`` query parameter of the board
            URL.
    """
    url = "https://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a non-200 status or a request error;
        # skip this page instead of crashing inside the parser.
        print("failed to fetch " + url)
        return
    item_list = []
    for item in parse_source_page(html):
        print(item)
        item_list.append(item)
    # Results are appended as JSON lines to movies_top.txt.
    save_info_to_file(item_list, "movies_top.txt")

if __name__ == "__main__":
    # Fetch the ten pages one after another (offsets 0, 10, ..., 90).
    # A pool from multiprocessing could fetch them concurrently, but the
    # original note warns that the fetched content is then not guaranteed to
    # come out as rank 1-100:
    #     spider_pool = pool.Pool()
    #     spider_pool.map(main, [i * 10 for i in range(10)])
    for offset in range(0, 100, 10):
        main(offset)

抓取记录

猜你喜欢

转载自blog.csdn.net/qq_37174526/article/details/80043501