使用 requests 和正则表达式实现一个简单的爬虫,抓取猫眼电影榜单页(maoyan.com/board/4)上的电影信息

Python 代码如下:

import requests
from requests.exceptions import RequestException
import re
import json

def get_one_page(url, timeout=10):
    """Download ``url`` with an HTTP GET and return the response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200. ``None`` for any
        other status code, or when ``requests`` raises a
        ``RequestException`` (which includes timeouts).
    """
    try:
        res = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if res.status_code == 200:
        return res.text
    return None
def parse_one_page(html):
    """Yield one dict per ``<dd>`` movie entry found in ``html``.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example after a failed download) produces no items instead of
            raising ``TypeError`` inside ``re``.

    Yields:
        Dicts with the string-valued keys ``index``, ``image``, ``title``,
        ``actor``, ``time`` and ``score``.
    """
    if not html:
        return
    # Raw string literals keep ``\d`` a regex escape rather than an invalid
    # string escape; re.S lets ``.`` span the newlines inside each entry.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for index, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Drops the first 3 characters after trimming whitespace --
            # presumably a label prefix in the page markup; verify against
            # the live page.
            'actor': star.strip()[3:],
            # Same idea with a 5-character prefix (presumably a label).
            'time': release.strip()[5:],
            # The score is split across two tags (integer and fraction).
            'score': integer + fraction,
        }
def main():
    """Fetch the board page and print each parsed movie as indented JSON.

    Raises:
        SystemExit: If the page could not be downloaded, so the script
            stops with a readable message instead of failing later inside
            the parser.
    """
    url = 'http://maoyan.com/board/4'
    html = get_one_page(url)
    if html is None:
        raise SystemExit('Failed to fetch ' + url)
    for item in parse_one_page(html):
        # json.dumps has no ``encoding`` parameter on Python 3 (passing it
        # raises TypeError); ensure_ascii=False keeps non-ASCII text
        # readable instead of emitting \uXXXX escapes.
        print(json.dumps(item, ensure_ascii=False, sort_keys=False, indent=4))


if __name__ == '__main__':
    main()

运行结果(示例输出,键的顺序和空白字符可能因 Python 版本而异):

{
    "index": "1", 
    "title": "霸王别姬", 
    "image": "http://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg@160w_220h_1e_1c", 
    "actor": "张国荣,张丰毅,巩俐", 
    "score": "9.6", 
    "time": "1993-01-01(中国香港)"
}
{
    "index": "2", 
    "title": "肖申克的救赎", 
    "image": "http://p0.meituan.net/movie/__40191813__4767047.jpg@160w_220h_1e_1c", 
    "actor": "蒂姆·罗宾斯,摩根·弗里曼,鲍勃·冈顿", 
    "score": "9.5", 
    "time": "1994-10-14(美国)"
}
{
    "index": "3", 
    "title": "罗马假日", 
    "image": "http://p0.meituan.net/movie/23/6009725.jpg@160w_220h_1e_1c", 
    "actor": "格利高利·派克,奥黛丽·赫本,埃迪·艾伯特", 
    "score": "9.1", 
    "time": "1953-09-02(美国)"
}
{
    "index": "4", 
    "title": "这个杀手不太冷", 
    "image": "http://p0.meituan.net/movie/fc9d78dd2ce84d20e53b6d1ae2eea4fb1515304.jpg@160w_220h_1e_1c", 
    "actor": "让·雷诺,加里·奥德曼,娜塔莉·波特曼", 
    "score": "9.5", 
    "time": "1994-09-14(法国)"
}
{
    "index": "5", 
    "title": "教父", 
    "image": "http://p0.meituan.net/movie/92/8212889.jpg@160w_220h_1e_1c", 
    "actor": "马龙·白兰度,阿尔·帕西诺,詹姆斯·凯恩", 
    "score": "9.3", 
    "time": "1972-03-24(美国)"
}
{
    "index": "6", 
    "title": "泰坦尼克号", 
    "image": "http://p0.meituan.net/movie/11/324629.jpg@160w_220h_1e_1c", 
    "actor": "莱昂纳多·迪卡普里奥,凯特·温丝莱特,比利·赞恩", 
    "score": "9.5", 
    "time": "1998-04-03"
}
{
    "index": "7", 
    "title": "龙猫", 
    "image": "http://p0.meituan.net/movie/99/678407.jpg@160w_220h_1e_1c", 
    "actor": "日高法子,坂本千夏,糸井重里", 
    "score": "9.2", 
    "time": "1988-04-16(日本)"
}
{
    "index": "8", 
    "title": "唐伯虎点秋香", 
    "image": "http://p0.meituan.net/movie/62/109878.jpg@160w_220h_1e_1c", 
    "actor": "周星驰,巩俐,郑佩佩", 
    "score": "9.2", 
    "time": "1993-07-01(中国香港)"
}
{
    "index": "9", 
    "title": "千与千寻", 
    "image": "http://p0.meituan.net/movie/9bf7d7b81001a9cf8adbac5a7cf7d766132425.jpg@160w_220h_1e_1c", 
    "actor": "柊瑠美,入野自由,夏木真理", 
    "score": "9.3", 
    "time": "2001-07-20(日本)"
}
{
    "index": "10", 
    "title": "魂断蓝桥", 
    "image": "http://p0.meituan.net/movie/12/8506449.jpg@160w_220h_1e_1c", 
    "actor": "费雯·丽,罗伯特·泰勒,露塞尔·沃特森", 
    "score": "9.2", 
    "time": "1940-05-17(美国)"
}

猜你喜欢

转载自blog.csdn.net/xf_87/article/details/79362429
今日推荐