Python 爬虫:爬取猫眼电影数据

# 定义一个函数获取猫眼电影的数据

import requests

def main():
    """Fetch the first Maoyan board page (offset 0) and print its raw HTML."""
    url = 'http://maoyan.com/board/4?offset=0'
    # A timeout keeps the script from hanging forever if the server never
    # answers; requests applies no timeout by default.
    html = requests.get(url, timeout=10).text
    print(html)



# Call main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

# 利用正则匹配,获得我们想要的信息

"""
< dd >
< i class ="board-index board-index-10">10</i>
< a href = "/films/2760" title = "魂断蓝桥" class ="image-link" data-act="boarditem-click" 
data-val="{movieId:2760}" >
< img src = "//ms0.meituan.net/mywww/image/loading_2.e3d934bf.png" alt = "" class ="poster-default" / >
< img data - src = "http://p0.meituan.net/movie/46c29a8b8d8424bdda7715e6fd779c66235684.jpg@160w_220h_1e_1c" 
alt = "魂断蓝桥" class ="board-img" / >< / a >
< div class ="board-item-main" >
< div class ="board-item-content" >
< div class ="movie-item-info" >
< p class ="name" > < a href="/films/2760" title="魂断蓝桥" 
data-act="boarditem-click" data-val="{movieId:2760}" > 魂断蓝桥 < / a > < / p >
< p class ="star" >主演:费雯·丽, 罗伯特·泰勒, 露塞尔·沃特森< / p >
< p class ="releasetime" > 上映时间:1940-05-17(美国) < / p > < / div >
< div class ="movie-item-number score-num" >
< p class ="score" > < i class ="integer" > 9. < / i > < i class ="fraction" > 2 < / i > < / p >
< / div >< / div >< / div >
< / dd >
"""
import re


# One capture group per field inside a <dd> entry, in this order: the text of
# the first <i> tag, the data-src URL, the title attribute, the text after
# "主演:", the text after "上映时间:", and the contents of the "integer" and
# "fraction" <i> tags. re.S lets '.' match newlines so a single match can
# span a multi-line entry.
reg = re.compile(
    r'<dd>.*?>(.*?)</i>.*?data-src="(.*?)".*?title="(.*?)".*?主演:(.*?)</p>.*?'
    r'上映时间:(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?',
    re.S,
)
# findall yields one 7-tuple of captured strings per match.
items = reg.findall(html)
print(items)

# 循环遍历列表并且把列表转换为字典

# Turn every matched tuple into a dict keyed by field name and print it.
for item in items:
    index, image, title, actor, time = item[:5]
    # The last two captures are concatenated to form the score string.
    score = item[5] + item[6]
    dict1 = {
        'index': index,
        'image': image,
        'title': title,
        'actor': actor,
        'time': time,
        'score': score,
    }
    print(dict1)

# 把获得的数据保存在文件中

import json


# Append the record to result.txt as one JSON object per line. Without the
# trailing newline, successive appends run together as "{...}{...}" and the
# file can no longer be split back into individual records.
# ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
with open('result.txt', 'a', encoding='utf-8') as f:
    f.write(json.dumps(dict1, ensure_ascii=False) + '\n')

# 利用循环获取猫眼电影所有数据

def main():
    """Build the URL of each of the ten board pages (offsets 0, 10, ..., 90).

    This sketch only constructs the URLs; it does not fetch them.
    """
    for offset in range(0, 100, 10):
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        

# 最后代码整理如下

import json
import re
from time import sleep
import requests


def main():
    """Scrape ten Maoyan board pages and append every movie to result.txt.

    For each page offset (0, 10, ..., 90) the page HTML is downloaded and a
    regular expression pulls seven captures out of every ``<dd>`` entry.
    They are stored under the keys ``index``, ``image``, ``title``,
    ``actor``, ``time`` and ``score`` (the last two captures concatenated)
    and appended to ``result.txt`` as one UTF-8 JSON object per line.
    """
    # Compiled once because the pattern is identical for every page. The
    # adjacent literals are wrapped in parentheses so they are joined into a
    # single pattern; re.S lets '.' match newlines inside an entry.
    pattern = re.compile(
        r'<dd>.*?>(.*?)</i>.*?data-src="(.*?)".*?title="(.*?)"'
        r'.*?主演:(.*?)</p>.*?上映时间:(.*?)</p>.*?integer.*?>'
        r'(.*?)</i>.*?fraction.*?>(.*?)</i>.*?',
        re.S,
    )

    for i in range(10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        # The timeout stops a stalled connection from blocking the run
        # forever; requests applies no timeout by default.
        html = requests.get(url, timeout=10).text

        # Open the output file once per page instead of once per record.
        with open('result.txt', 'a', encoding='utf-8') as f:
            for index, image, title, actor, time, integer, fraction in \
                    pattern.findall(html):
                dict1 = {'index': index, 'image': image, 'title': title,
                         'actor': actor, 'time': time,
                         'score': integer + fraction}
                # One JSON object per line so the file can be read back
                # record by record.
                f.write(json.dumps(dict1, ensure_ascii=False) + '\n')

        # Pause between page requests (not between records) to throttle
        # the load placed on the server.
        sleep(1)


# Call main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/yunfeiyang520/article/details/81453369