爬虫大麦网

爬取大麦网的思路总体上与上一篇博客爬取豆瓣电影类似，只是大麦网的筛选项较多，例如地点、活动类型等。

本文章与爬豆瓣相比额外使用了如下功能:

docopt从终端获取参数
prettytable整理打印格式

具体代码如下：

shows.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Usage:
    shows <city> <type>
"""
import os
import re
import csv
import sys
from prettytable import PrettyTable
from urllib.request import urlopen
from bs4 import BeautifulSoup
from docopt import docopt

sys.path.append('/home/han/PycharmProjects/TinyPythonProject/Beginner/shows')
from cities import cities

# Damai.cn event-listing endpoint
DAMAI_BASE_URL = "https://www.damai.cn/projectlist.do"
# Error messages shown to the user
QUERY_DAYS_INVALID = 'Invalid days.'
CITY_NOT_FOUND = 'Sorry, your city is not supported.'
SHOW_NOT_FOUND = 'No result.'

# Event categories accepted on the command line.
# Each entry maps a category name to the `mcid` (main category) and `ccid`
# (sub-category) query parameters of the listing URL.  An empty `ccid` means
# "no sub-category filter".  All `ccid` values are strings so every entry has
# the same shape; they are only ever interpolated into the URL, so the
# generated query string is the same as with integer values.
SHOW_TYPES = {
    '演唱会': {'mcid': 1, 'ccid': ''},
    '流行': {'mcid': 1, 'ccid': '9'},
    '摇滚': {'mcid': 1, 'ccid': '10'},
    '民族': {'mcid': 1, 'ccid': '11'},
    '音乐节': {'mcid': 1, 'ccid': '12'},

    '音乐会': {'mcid': 2, 'ccid': ''},

    '话剧歌剧': {'mcid': 3, 'ccid': ''},
    '话剧': {'mcid': 3, 'ccid': '19'},
    '歌剧': {'mcid': 3, 'ccid': '20'},
    '歌舞剧': {'mcid': 3, 'ccid': '21'},
    '音乐剧': {'mcid': 3, 'ccid': '22'},
    '儿童剧': {'mcid': 3, 'ccid': '23'},

    '舞蹈': {'mcid': 4, 'ccid': '24'},
    '芭蕾': {'mcid': 4, 'ccid': '25'},
    '舞剧': {'mcid': 4, 'ccid': '26'},

    '相声': {'mcid': 5, 'ccid': '27'},
    '魔术': {'mcid': 5, 'ccid': '28'},
    '马戏': {'mcid': 5, 'ccid': '29'},
    '杂技': {'mcid': 5, 'ccid': '30'},
    '戏曲': {'mcid': 5, 'ccid': '31'},

    '比赛': {'mcid': 6, 'ccid': ''},
}


# Scrape event titles
def getTitle(html):
    """Return the event titles found in a listing page.

    The title is taken from the ``title`` attribute of the poster ``<img>``
    inside each ``<p class="img">`` element.  Matches containing any of the
    substrings "js", "css", "dale" or "icon" are dropped.

    :param html: decoded page source (str)
    :return: list of title strings, in page order
    """
    pattern = r'<p class="img"><a href=".*?".*?target="_blank"><img.*?src=".*?".*?alt=".*?".*?title="(.*?)" /></a></p>'
    unwanted = ("js", "css", "dale", "icon")
    return [
        title
        for title in re.findall(pattern, html, re.S)
        if not any(marker in title for marker in unwanted)
    ]


# Scrape event detail-page links
def getDetial(html):
    """Return the detail-page links found in a listing page.

    Each link is the ``href`` of the anchor that directly follows a
    ``<p class="img">`` opening tag.

    :param html: decoded page source (str)
    :return: list of href strings exactly as they appear in the page
    """
    link_pattern = r'<p class="img"><a href="(.*?)" target="_blank">'
    return [match.group(1) for match in re.finditer(link_pattern, html, re.S)]


# Scrape poster image links
def getImg(html):
    """Return the poster image URLs found in a listing page.

    The URL is the ``src`` attribute of the ``<img>`` inside each
    ``<p class="img">`` element.  URLs containing any of the substrings
    "js", "css", "dale" or "icon" are dropped.

    :param html: decoded page source (str)
    :return: list of image URL strings, in page order
    """
    pattern = r'<p class="img"><a href=".*?".*?target="_blank"><img.*?src="(.*?)".*?alt=".*?".*?title=".*?" /></a></p>'
    kept = []
    for src in re.findall(pattern, html, re.S):
        if "js" in src or "css" in src or "dale" in src or "icon" in src:
            continue
        kept.append(src)
    return kept


# Scrape event times
def getTime(html):
    """Return the event time strings found in a listing page.

    Captures the text between the "时间：" label inside a
    ``<p class="mt5">`` element and the following ``<span ... class="ml20">``.

    :param html: decoded page source (str)
    :return: list of time strings, in page order
    """
    time_re = re.compile(r'<p class="mt5">.*?时间：(.*?)<span.*?class="ml20">', re.S)
    return time_re.findall(html)


# Scrape venues
def getPlace(html):
    """Return the venue names found in a listing page.

    Captures the anchor text that follows each "场馆：" label.

    :param html: decoded page source (str)
    :return: list of venue names, in page order
    """
    venue_pattern = r'场馆：<a href=".*?" target="_blank">(.*?)</a>.*?</span>'
    venues = []
    venues.extend(re.findall(venue_pattern, html, re.S))
    return venues


# Scrape ticket prices
def getPrice(html):
    """Return the ticket price strings found in a listing page.

    Captures the contents of every ``<span class="price-sort">`` element.

    :param html: decoded page source (str)
    :return: list of price strings, in page order
    """
    return [
        found.group(1)
        for found in re.finditer(r'<span class="price-sort">(.*?)</span>', html, re.S)
    ]


# Scrape sale status
def getStatus(html):
    """Return the sale-status strings found in a listing page.

    Captures the text after "状态: " in every ``<p>状态: ...</p>`` element.

    :param html: decoded page source (str)
    :return: list of status strings, in page order
    """
    status_re = re.compile(r'<p>状态: (.*?)</p>', re.S)
    return status_re.findall(html)


# Fetch the raw page body for a URL
def getHtml(url):
    """Download *url* and return the response body as bytes.

    Any failure is reported on stdout and an empty ``bytes`` object is
    returned, so the result always has the same type: callers can call
    ``.decode()`` on it unconditionally and then test for an empty string.
    (Previously the failure path returned the ``str`` ``""``, which made
    ``getHtml(url).decode(...)`` raise ``AttributeError``.)

    :param url: address to fetch
    :return: response body (bytes); ``b""`` if the request failed
    """
    try:
        # The context manager closes the connection even if read() fails.
        with urlopen(url) as page:
            return page.read()
    except Exception as e:
        # Best-effort by design: one bad page must not abort the whole crawl.
        print("failed to geturl:", e)
        return b""


# Save the scraped records to a CSV file
def saveInfo(infoList,
             path='/home/han/PycharmProjects/TinyPythonProject/Beginner/shows/shows_scraper.csv'):
    """Write the scraped records to a UTF-8 CSV file.

    A header row is written first, followed by one row per record.  An
    existing file at *path* is overwritten.

    :param infoList: iterable of rows; each row holds the seven fields
        title, detail link, image link, time, venue, price, status
    :param path: destination file; defaults to the original hard-coded
        location so existing callers keep working
    """
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp, delimiter=',')  # fields are comma-separated
        writer.writerow(['活 动', '活动链接', '图 片', '时 间', '场 馆', '价 格', '状 态'])
        writer.writerows(infoList)
        print('保存完毕')


# Print the records as a formatted table
def pretty_print(infoList):
    """Print the scraped records to stdout as a PrettyTable.

    :param infoList: iterable of rows; each row holds the seven fields
        title, detail link, image link, time, venue, price, status
    """
    pt = PrettyTable()
    # Use the public `field_names` property rather than the private
    # `_set_field_names` method it wraps.
    pt.field_names = ['活 动', '活动链接', '图 片', '时 间', '场 馆', '价 格', '状 态']
    for info in infoList:
        pt.add_row(info)
    print(pt)


# Accumulators: one entry per scraped event, kept in page order
titles = []
details = []
imgs = []
times = []
places = []
prices = []
status = []

allInfo = []

arguments = docopt(__doc__)

# Reject unknown arguments up front instead of crashing later with a
# TypeError when the lookup result (None) is subscripted.
city = cities.get(arguments['<city>'])
if city is None:
    sys.exit(CITY_NOT_FOUND)
show_type = SHOW_TYPES.get(arguments['<type>'])  # not `type`: that shadows the builtin
if show_type is None:
    sys.exit('Sorry, your show type is not supported.')

list_url = '{}?cityID={}&mcid={}&ccid={}'.format(
    DAMAI_BASE_URL, city, show_type['mcid'], show_type['ccid']
)

# The first request is only used to learn how many result pages exist.
url = list_url
with urlopen(url) as response:
    bsObj = BeautifulSoup(response, 'html.parser')
page_spans = bsObj.find_all("span", {"class": "ml10"})
# The pager text is expected to contain "共N页"; if the span or the number
# is missing, treat the query as having no results.
page_match = re.search(r'共\s*(\d+)\s*页', page_spans[0].get_text()) if page_spans else None
if page_match is None:
    sys.exit(SHOW_NOT_FOUND)
page_num = int(page_match.group(1))
print("共%d页" % page_num)  # total number of result pages

for page in range(1, page_num + 1):
    url = '{}&pageIndex={}'.format(list_url, page)
    print("page:%d,url:%s" % (page, url))
    raw = getHtml(url)
    # getHtml signals failure with an empty value; only decode real bytes.
    html = raw.decode("UTF-8") if isinstance(raw, bytes) else raw
    if not html:
        # Skip pages that could not be fetched.  (The old code called
        # list.extend('none'), which appended the four characters
        # 'n', 'o', 'n', 'e' as four bogus records.)
        continue
    titles.extend(getTitle(html))
    details.extend(getDetial(html))
    imgs.extend(getImg(html))
    times.extend(getTime(html))
    places.extend(getPlace(html))
    prices.extend(getPrice(html))
    status.extend(getStatus(html))

columns = (titles, details, imgs, times, places, prices, status)
for column in columns:
    print(len(column))
if len({len(column) for column in columns}) > 1:
    print('warning: field counts differ; extra entries are dropped', file=sys.stderr)

# zip() stops at the shortest list, so a field the regexes missed on some
# event can no longer raise IndexError while the rows are assembled.
for title, detail, img, when, place, price, state in zip(*columns):
    # Detail and image links are scraped without a scheme, so add one.
    allInfo.append([title, 'https:' + detail, 'https:' + img, when, place, price, state])

saveInfo(allInfo)  # save as a CSV file
pretty_print(allInfo)  # print as a formatted table

cities.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Maps a city name (as typed on the command line) to the value sent as the
# `cityID` query parameter of the Damai listing URL.
# NOTE(review): '南宁' and '徐州' both map to '2024'; one of the two is
# presumably a copy/paste mistake -- confirm against the site.
cities = {
    '北京': '852',
    '上海': '872',
    '广州': '893',
    '深圳': '906',
    '武汉': '586',
    '苏州': '1087',
    '成都': '1377',
    '重庆': '200',
    '长沙': '702',
    '南京': '1038',
    '杭州': '1580',
    '沈阳': '1703',
    '无锡': '1052',
    '宁波': '1597',
    '郑州': '2148',
    '天津': '1209',
    '大连': '1725',
    '南昌': '465',
    '西安': '3250',
    '常州': '1077',
    '昆明': '1229',
    '桂林': '2103',
    '厦门': '372',
    '太原': '2984',
    '福州': '356',
    '温州': '1612',
    '合肥': '2520',
    '珠海': '913',
    '中山': '947',
    '石家庄': '2495',
    '佛山': '923',
    '南宁': '2024',
    '长春': '2812',
    '哈尔滨': '2648',
    '香港': '848',
    '青岛': '1847',
    '澳门': '850',
    '贵阳': '242',
    '济南': '1835',
    '东莞': '917',
    '呼和浩特': '3167',
    '银川': '54',
    '海外': '76',
    '柳州': '2037',
    '徐州': '2024',
    '绍兴': '1643'
}

运行格式为 python3 shows.py [地点] [演出类型]

例如：python3 shows.py 北京 演唱会

注意：地点必须是cities.py中的地点，类型必须为 SHOW_TYPES中的活动类型

爬虫 大麦网

猜你喜欢