Python Crawler: Scraping App Information from Anzhi (安智网)

Scrape the Android app listings on the target site, collecting category, update time, system requirements, download count, download link, and other descriptive fields.

http://www.anzhi.com/
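For reference, each record the script below produces has this shape (the keys come from the code; the values here are placeholders, not real data):

{
    "name": "<app name>",
    "cat": "<category>",
    "download_cnt": "<download count>",
    "time": "<update time>",
    "size": "<package size>",
    "sys": "<minimum Android version>",
    "download": "http://www.anzhi.com/dl_app.php?s=<id>&n=5"
}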

The content is visible when inspecting the DOM, but the raw page source contains none of it: the information is rendered client-side with JavaScript.

Inspecting the AJAX requests turns up the HTML fragments for the listings as well as the category details, so we can build the corresponding URLs directly and parse them with BeautifulSoup, as in the probe below.
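A minimal probe of the category widget endpoint (the same widgetcatetag_1.html fragment that getCat uses in the full script further down):

import requests
from bs4 import BeautifulSoup

# Fetch the AJAX HTML fragment that lists every category link
html = requests.get('http://www.anzhi.com/widgetcatetag_1.html').text
soup = BeautifulSoup(html, 'html.parser')

# Category links carry 'tsort' in their href
for a in soup.select('a'):
    if 'tsort' in a.get('href', ''):
        print(a['href'])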

Fetching the app information

The download link is generated by JavaScript.

The site's opendown function shows the pattern: simulating the same request directly yields the download link (a Python sketch follows the snippet).

function opendown(id){
	$.get("/ajaxdl_app.php?s="+id, function(result){
		if(result == 0){
			$("#codedown").zxxbox({title: "安智网"});
			$('#down_from').attr('action', '/checkdown.php?s=' + id + '&n=1');
			$("#checkcod")[0].src="/checkcode/check_seccode.php?rand="+Math.random();
		}else{
			window.location.href="/dl_app.php?s="+id+"&n=5";
		}
	});
}
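Read straight from that function: a GET to /ajaxdl_app.php?s=<id> returns 0 when a captcha check is required; otherwise the file is served from /dl_app.php?s=<id>&n=5. A minimal Python sketch of the same flow (it assumes the captcha branch is not triggered):

import requests

def get_download_url(soft_id):
    # Mirror the opendown() check: a response body of "0" means a captcha is required
    result = requests.get(f'http://www.anzhi.com/ajaxdl_app.php?s={soft_id}').text
    if result.strip() == '0':
        # The site would now route through /checkdown.php with a captcha; not handled here
        return None
    return f'http://www.anzhi.com/dl_app.php?s={soft_id}&n=5'

# Example (soft_id values come from the detail pages):
# print(get_download_url('123456'))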

Source code

import requests
from bs4 import BeautifulSoup
import json
import time


# Fetch the raw HTML of a page (helper; the functions below call requests.get directly)
def getPage(url):
    html = requests.get(url).text
    return html


# Fetch the category widget fragment and build the URL of every category page
def getCat():
    url = 'http://www.anzhi.com/widgetcatetag_1.html'
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    cats = soup.select('a')
    # Category links carry 'tsort' in their href
    cat_list = [i['href'] for i in cats if 'tsort' in i['href']]
    # Slice the category id out of each href and rebuild it as a widget-fragment URL
    newCat = [f"http://www.anzhi.com/widgettsort_{i[i.find('_') + 1:i.find('_h')]}.html" for i in cat_list]
    return newCat


# Return the detail-page URL of every app listed on a category page
def getSoftItems(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.select('a.recommend_name,center')
    # Skip matched elements (e.g. <center>) that carry no href
    items = ['http://www.anzhi.com' + i['href'] for i in links if i.has_attr('href')]
    return items


# Collect the detail-page links of every app across all categories
def getAllLinks():
    cats = getCat()
    all_links = []
    for i in cats:
        all_links += getSoftItems(i)
    return all_links


# Parse an app detail page into a dict of its descriptive fields
def getSoftJson(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.select('div.detail_line > h3')[0].text
    infos = soup.select('#detail_line_ul > li')
    # The app id sits inside onclick="opendown(<id>)"; slice it out
    soft_id = soup.select('div.detail_down > a')[0]['onclick'][9:-2]
    js = {}
    js['name'] = title
    js['cat'] = infos[0].text.split(':')[-1]
    js['download_cnt'] = infos[1].text.split(':')[-1]
    js['time'] = infos[2].text.split(':')[-1]
    js['size'] = infos[3].text.split(':')[-1]
    js['sys'] = infos[4].text.split(':')[-1]
    js['download'] = f"http://www.anzhi.com/dl_app.php?s={soft_id}&n=5"
    print(js)
    time.sleep(0.1)  # be polite: short delay between requests
    return js


# getSoftJson('http://www.anzhi.com/pkg/365b_com.tc168.cpkb.html')
links = getAllLinks()
print(links)
all_json = []
try:
    for i in links:
        all_json.append(getSoftJson(i))
except Exception as e:
    print(e)
with open('all_app.json', encoding='utf8', mode='w+') as f:
    json.dump(all_json, f, ensure_ascii=False)  # keep Chinese text readable in the file

The resulting JSON file
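A quick sanity check on the dumped file:

import json

with open('all_app.json', encoding='utf8') as f:
    apps = json.load(f)

print(len(apps))  # number of app records scraped
print(apps[0])    # first record: name/cat/time/size/sys/download fields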

Reposted from my.oschina.net/ahaoboy/blog/1813775