# 14. 爬虫

# 1.明确目的
# 2.找到数据对应的网页
# 3.分析网页的结构找到数据所在的标签的位置
# 4.模拟HTTP请求,向服务器发送这个请求,获取到服务器返回给我们的HTML
# 5.用正则表达式提取我们的数据

import re
from urllib import request

# <div class="video-info">
#     <span class="video-title" title="LPL半决赛 IG vs RNG">LPL半决赛 IG vs RNG</span>
#     <span class="video-nickname" title="LPL英文解说室">
#         <i class="icon-host-level icon-host-level-2" data-level="2"></i>
#         LPL英文解说室                      
#     </span>
#     <span class="video-number">32.7万</span>
#     <span class="video-station-info">
#         <i class="video-station-num">1人</i>
#     </span>
# </div>
class Splider():
    """Crawl a panda.tv category page and print streamers ranked by viewer count.

    The pipeline run by ``go`` is: fetch the HTML, extract the name and the
    viewer-count text of every ``video-info`` block with regular expressions,
    clean the extracted values, sort them in descending order of viewer count
    and print the ranking.
    """

    # Page that is crawled.
    url = 'https://www.panda.tv/cate/lol'
    # Raw strings: \s, \S and \d must reach the regex engine untouched
    # (in a normal string literal they are invalid escape sequences).
    # One match per <div class="video-info"> ... </div> block.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    # Text between the level icon's closing </i> and the closing </span>.
    name_pattern = r'</i>([\s\S]*?)</span>'
    # Text inside <span class="video-number"> ... </span>.
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download ``Splider.url`` and return the body decoded as UTF-8 text."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Splider.url) as response:
            raw = response.read()
        return str(raw, encoding='utf-8')

    def __analysis(self, htmls):
        """Extract the raw name/number matches from every video-info block.

        Args:
            htmls: The page HTML as a string.

        Returns:
            A list of dicts ``{'name': [...], 'number': [...]}`` where each
            value is the (possibly empty) list of regex matches in one block.
        """
        return [
            {
                'name': re.findall(Splider.name_pattern, html),
                'number': re.findall(Splider.number_pattern, html),
            }
            for html in re.findall(Splider.root_pattern, htmls)
        ]

    def __refine(self, anchors):
        """Reduce each raw match list to a single cleaned value.

        Blocks in which the name or the number pattern did not match are
        skipped instead of raising ``IndexError``.

        Args:
            anchors: The list produced by ``__analysis``.

        Returns:
            A list of dicts ``{'name': str, 'number': str}``; the name has
            its surrounding whitespace stripped.
        """
        return [
            {'name': anchor['name'][0].strip(), 'number': anchor['number'][0]}
            for anchor in anchors
            if anchor['name'] and anchor['number']
        ]

    def __sort(self, anchors):
        """Return the anchors sorted by viewer count, largest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __show(self, anchors):
        """Print one ``rank  N:name    number`` line per anchor, starting at rank 1."""
        for rank, anchor in enumerate(anchors, 1):
            print('rank  ' + str(rank) + ':' + anchor['name'] + '    ' + anchor['number'])

    def __sort_seed(self, anchor):
        """Convert an anchor's viewer-count text into a number used as sort key.

        The text is either a plain integer such as ``8932`` or a decimal
        followed by the unit ``万`` (ten thousand) such as ``32.7万``.

        Args:
            anchor: A dict with a ``'number'`` string.

        Returns:
            The viewer count as a float; ``0.0`` when the text contains no
            digits.
        """
        # Match the whole decimal number; a bare \d* would drop the fraction
        # ("32.7" -> "32") and may yield an empty string.
        match = re.search(r'\d+(?:\.\d+)?', anchor['number'])
        if match is None:
            return 0.0
        number = float(match.group())
        # Only counts expressed in units of ten thousand are scaled up.
        if '万' in anchor['number']:
            number *= 10000
        return number

    def go(self):
        """Run the complete crawl: fetch, analyse, refine, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)

# Script entry point: create the crawler and run the whole pipeline once
# (fetch the page, extract and clean the entries, sort them, print the ranking).
splider = Splider()
splider.go()    
# rank  1:    1715.0万
# rank  2:LPL英文解说室    38.8万
# rank  3:魔兽后裔    16.9万
# rank  4:守卫者    12.4万
# rank  5:药水哥s    5.2万
# rank  6:2d战衣托儿索    3.5万
# rank  7:三枪赵信    3.4万
# rank  8:一只呆萌娜    8932
# rank  9:小黑胖砸    8387
# rank  10:阿涛皎月Carry    6960
# rank  11:熊猫TV丶油菜花1    6700
# rank  12:熊猫丶蛮神    6398
# rank  13:小星k95    6343
# rank  14:老头一必诚    4347
# rank  15:狐狸酱大魔王    4127
# rank  16:杀鸡菜逼俱乐部    3367
# rank  17:一个很C的稻草人    3211
# rank  18:筱筱玉丶微服私访    3080
# rank  19:魔剑神无敌    3007
# rank  20:金三炮丶丶    2881
# rank  21:刀锋秀秀QAQ    2780
# rank  22:我是小二阿    2498
# rank  23:Roumm    2140
# rank  24:冰雪丶狐狸    2085
# rank  25:_木木不酷    1947
# rank  26:有毒i吸血鬼    1846
# rank  27:东北小伙_    1784
# rank  28:琳琪baby    1780
# rank  29:皮小胖QAQ    1685
# rank  30:阿佑any    1668
# rank  31:美丽可爱栗子哟    1654
# rank  32:大表哥王者蛇女    1645
# rank  33:冷面寒枪人马神    1643
# rank  34:琦玉啊zzz    1639
# rank  35:熊猫TV老泽拉斯    1638
# rank  36:熊猫TV灬美猴王    1635
# rank  37:不会98K的传海    1619
# rank  38:分手何必把锅背走    1604
# rank  39:可乐解忧杂货铺    1569
# rank  40:二言是只喵c    1559
# rank  41:God_of_War龙宝宝    1553
# rank  42:骠骑大将军1    1546
# rank  43:暴力美学小毅    1536
# rank  44:可乐可乐大雪碧    1521
# rank  45:国际女特工    1520
# rank  46:醉梦醒酒    1500
# rank  47:阿毛君2    1499
# rank  48:请叫我越塔怪    1497
# rank  49:社会我墙哥丶    1494
# rank  50:Panda丶冰冰    1493
# rank  51:熊猫丶大风6    1493
# rank  52:初柔_    1491
# rank  53:熊猫TV天倪    1467
# rank  54:芥子喵i    1464
# rank  55:或许这就是离岛吧    1460
# rank  56:熊猫尼古拉斯胖虎    1451
# rank  57:雁回阿    1446
# rank  58:这个人帅到没朋友    1446
# rank  59:西灬瓜酱    1438
# rank  60:长路漫漫剑圣作伴    1430
# rank  61:幼稚凯卡特    1430
# rank  62:李阿特    1403
# rank  63:小白菜嗷呜    1398
# rank  64:盒你相约_张小白    1388
# rank  65:小哇D卡    1349
# rank  66:武媚儿丶    1340
# rank  67:暴走的青蛙队长    1313
# rank  68:2017英雄联盟全明星    1309
# rank  69:熊猫TV木灵符    1306
# rank  70:酥软迷妹小慢慢Zz    1304
# rank  71:言希cc    1292
# rank  72:熊猫TV丶黑默丁宇    1274
# rank  73:零七Se7en丶    1270
# rank  74:疯疯大魔王    1270
# rank  75:笨蛋真嗣    1259
# rank  76:熊猫TVsao马    1251
# rank  77:诗琪baby_    1249
# rank  78:小姑爷爷丶    1231
# rank  79:Sye钰神    1227
# rank  80:唐羽菲    1217
# rank  81:甜崽__    1204
# rank  82:冠胖又帅又皮    1192
# rank  83:繁星yer    1184
# rank  84:顺顺套路王    1181
# rank  85:Panda丶浅唱小生    1175
# rank  86:你的温岚    1169
# rank  87:苏璞呀丶    1159
# rank  88:涛涛段r    1121
# rank  89:糖小鱼丶丶    1110
# rank  90:我就是神仙丶丶    1092
# rank  91:熊猫TV丶sao白    1031
# rank  92:丨空城忆旧巷丨    1019
# rank  93:愿卿好    951
# rank  94:C哥哔哩罢了丶    943
# rank  95:周龍丶    925
# rank  96:熊猫TV桂林168    902
# rank  97:迟到不准时的岛屿    900
# rank  98:嘿we狗丶老污狗    896
# rank  99:努力的杰欧巴    886
# rank  100:我真的没有钱畫訫    841
# rank  101:裙裙裙子    811
# rank  102:Dyz8    795
# rank  103:喵菌i    788
# rank  104:电竞歌后小莲莲丶    785
# rank  105:人生如戏丶戏如命    781
# rank  106:熊猫LCS直播间    764
# rank  107:战士狂魔天    754
# rank  108:忘忧人云彩    749
# rank  109:熊猫第一姬    734
# rank  110:阿祥Q    724
# rank  111:灵魂纯白    722
# rank  112:林如风1    705
# rank  113:熊猫壹棉被    698
# rank  114:熊猫Tv丶K88    690
# rank  115:俊姑豹女丶丶    678
# rank  116:isme明非    674
# rank  117:熊猫tv胖胖虎    671

# 猜你喜欢
#
# 转载自 www.cnblogs.com/zouke1220/p/8903616.html