"""Scrape app data from the Wandoujia games category page.

spider_method: requests + regex (bs4/selenium were alternatives; unused)
url:    https://www.wandoujia.com/category/6001
data:   app name, detail-page URL, install count, app size
        -> captured as (detail_url, app_name, download_num, app_size)

Pipeline: fetch the page, pull the fields out with a regex, and append
one formatted record per app to ``wandoujia.txt``.
"""
import re


# Step 1: send the request
def get_page(url):
    """Fetch *url* and return the ``requests.Response`` object."""
    # Imported lazily so the parse/save helpers can be imported and
    # tested without the third-party ``requests`` package installed.
    import requests
    response = requests.get(url)
    return response


# Step 2: parse the data
def parse_index(html):
    """Extract every game entry from the category-page *html*.

    Returns a list of 4-tuples in regex capture order:
    (detail_url, app_name, download_num, app_size).
    """
    game_list = re.findall(
        '<h2 class="app-title-h2"><a href="(.*?)" title="(.*?)" class="name">.*?</a>.*?<span class="install-count">(.*?)万人安装</span> <span class="dot">・</span> <span title="(.*?)MB">.*?MB</span>',
        html,
        re.S,  # '.' must also match newlines: each entry spans several source lines
    )
    print(game_list)
    return game_list


# Step 3: save the data
def save_data(game):
    """Append one game record (a tuple from parse_index) to wandoujia.txt."""
    # BUGFIX: the regex captures href (URL) before title (name), so unpack
    # in that order — the original swapped app_name and detail_url, writing
    # the URL under "游戏名称" and the name under "详情页url".
    detail_url, app_name, download_num, app_size = game
    data = f'''
=========欢迎=========
游戏名称:{app_name}
详情页url:{detail_url}
下载人数:{download_num}万人
app大小:{app_size}MB
=========再见=========
\n
'''
    print(data)
    with open('wandoujia.txt', 'a', encoding='utf-8') as f:
        f.write(data)


if __name__ == '__main__':
    # Category 6001 is the games index page.
    url = 'https://www.wandoujia.com/category/6001'
    print(url)
    # 1. request the index page
    index_res = get_page(url)
    # 2. parse the game entries out of it
    game_list = parse_index(index_res.text)
    # 3. persist each record
    for game in game_list:
        save_data(game)
# Day 03 — scraping Wandoujia app data.
# Adapted from www.cnblogs.com/ningshao/p/11129319.html