第一部分:从网络获取APP相关数据
# Import third-party packages
import re
from bs4 import BeautifulSoup
import requests
import pandas

# In [3]:
# Link to the online-shopping app category
url = r'http://www.wandoujia.com/category/5017'
# Request headers
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
# 1 - Send the request, parse the HTML and collect every title heading
rep = requests.get(url, headers=headers).text
soup = BeautifulSoup(rep, 'html.parser')
urls = soup.find_all('h2', {'class': 'app-title-h2'})
# find_all returns a list-like ResultSet, which has no .head(); slice it instead
urls[:5]
# Recorded output (truncated in the export):
# [<h2 class="app-title-h2"> <a class="name" href="http://www.wandoujia.com/apps/com.xunmeng.pinduoduo" title="拼多多">拼多多</a> </h2>,
#  <h2 class="app-title-h2"> <a class="name" href="http://www.wandoujia.com/apps/com.achievo.vipshop" title="唯品会">唯品会</a> </h2>,
#  <h2 class="app-title-h2"> <a class="name" href="http://www.wandoujia.com/apps/com.taobao.taobao" title="淘宝">淘宝</a> </h2>,
#  <h2 class="app-title-h2"> <a class="name" href="http://www.wandoujia.com/apps/com.alibaba.wireless" title="阿里巴巴">阿里巴巴</a> </h2>,
#  <h2 class="app-title-h2"> <a class="name" href="http://www.wandoujia.com/apps/com.tmall.wireless" title="天猫">天猫</a>

# In [38]:
# Name and link of every heading found above
name = [h2.text.strip() for h2 in urls]
url_n = [h2.find_all('a')[0]['href'] for h2 in urls]

# In [ ]:
# 2 - Collect the app links listed on each of the pages found above
# NOTE(review): this cell has no execution count, and the url_new printed in
# cell 52 equals the links extracted in cell 38, so in the recorded session the
# step-1 links were presumably already app pages. A page without a
# 'j-tag-list' element is therefore kept as its own entry - confirm.
url_new = []
for page_url in url_n:  # request the extracted links, not the <h2> tags
    res = requests.get(page_url, headers=headers).text
    page_soup = BeautifulSoup(res, 'html.parser')
    app_list = page_soup.find('ul', {'id': 'j-tag-list'})
    if app_list is None:
        url_new.append(page_url)
        continue
    # Extend with the href strings themselves (not one-element lists)
    url_new.extend(
        h2.find_all('a')[0]['href']
        for h2 in app_list.find_all('h2', {'class': 'app-title-h2'})
    )

# In [51]:
# Scrape the detail page of every app collected above
install_people = []
comments = []
comments_ratio = []
for n, u in zip(name, url_new):
    url_last = requests.get(u, headers=headers).text
    soup_last = BeautifulSoup(url_last, 'html.parser')
    install_people.append(soup_last.find('span', {'class': 'item install'}).find('i').text)
    comments.append(soup_last.find('span', {'class': 'item love'}).find('i').text.strip())
    comments_ratio.append(soup_last.find('div', {'class': 'comment-area'}).find('i').text)

# In [52]:
print(install_people, comments, comments_ratio, url_new, name)
# Recorded output:
# ['8206.6万', '7515.4万', '5亿', '7221.6万', '9546.4万', '2.1亿', '2亿', '2515.2万',
#  '1102.2万', '8699.4万', '8471万', '566.9万', '2252.1万', '89.7万', '115.9万', '752.6万',
#  '3832万', '1790.9万', '24.8万', '20.1万', '1306.5万', '1181.6万', '56.2万', '36.8万']
# ['87.00%', '87.00%', '70.00%', '58.00%', '42.00%', '67.00%', '51.00%', '94.00%',
#  '51.00%', '72.00%', '96.00%', '70.00%', '51.00%', '83.00%', '100.00%', '78.00%',
#  '78.00%', '55.00%', '81.00%', '99.00%', '90.00%', '90.00%', '99.00%', '100.00%']
# ['3466', '2478', '11350', '533', '351', '10755', '264', '2316', '107', '476', '2875',
#  '33', '261', '66', '216', '289', '593', '110', '27', '104', '5182', '751', '104', '869']
# ['http://www.wandoujia.com/apps/com.xunmeng.pinduoduo',
#  'http://www.wandoujia.com/apps/com.achievo.vipshop',
#  'http://www.wandoujia.com/apps/com.taobao.taobao',
#  'http://www.wandoujia.com/apps/com.alibaba.wireless',
#  'http://www.wandoujia.com/apps/com.tmall.wireless',
#  'http://www.wandoujia.com/apps/com.sankuai.meituan',
#  'http://www.wandoujia.com/apps/com.jingdong.app.mall',
#  'http://www.wandoujia.com/apps/com.wuba.zhuanzhuan',
#  'http://www.wandoujia.com/apps/com.kuaibao.kuaidi',
#  'http://www.wandoujia.com/apps/com.suning.mobile.ebuy',
#  'http://www.wandoujia.com/apps/com.dianping.v1',
#  'http://www.wandoujia.com/apps/com.ymt360.app.mass',
#  'http://www.wandoujia.com/apps/com.cainiao.wireless',
#  'http://www.wandoujia.com/apps/com.lechuang.quanbaobei',
#  'http://www.wandoujia.com/apps/com.desire.tonight',
#  'http://www.wandoujia.com/apps/com.xingin.xhs',
#  'http://www.wandoujia.com/apps/com.mogujie',
#  'http://www.wandoujia.com/apps/com.dangdang.buy2',
#  'http://www.wandoujia.com/apps/com.taobao.litetao',
#  'http://www.wandoujia.com/apps/com.fanlishengqianlianmengw',
#  'http://www.wandoujia.com/apps/com.taobao.etao',
#  'http://www.wandoujia.com/apps/com.jym.mall',
#  'http://www.wandoujia.com/apps/com.xiaomi.youpin',
#  'http://www.wandoujia.com/apps/com.chuangnian.shenglala']
# ['拼多多', '唯品会', '淘宝', '阿里巴巴', '天猫', '美团', '京东', '转转', '微快递', '苏宁易购',
#  '大众点评', '一亩田', '菜鸟裹裹', '券宝贝', '夜欲两性情趣商城', '小红书', '蘑菇街', '当当',
#  '淘宝特价版', '返利省钱联盟', '一淘', '交易猫', '米家有品', '省啦啦']

# In [60]:
# 3 - Store and export the data
import pandas as pd
# NOTE(review): per the output recorded above, `comments` holds percentages and
# `comments_ratio` holds counts, so the 'commments' / 'goodcomments' labels look
# swapped (and 'commments' is misspelled). They are left unchanged here so the
# CSV header stays what existing consumers expect - confirm before renaming.
app_data = pd.DataFrame({
    'app_name': name,
    'install': install_people,
    'commments': comments,
    'goodcomments': comments_ratio,
})
app_data.to_csv('apps.csv', index=False, encoding='utf-8')

# In [59]:
app_data.head()
# Out[59]:
#   app_name commments goodcomments install
# 0 拼多多 87.00% 3466 8206.6万
# 1 唯品会 87.00% 2478 7515.4万
# 2 淘宝 70.00% 11350 5亿
# 3 阿里巴巴 58.00% 533 7221.6万
# 4 天猫 42.00% 351 9546.4万
第二部分:对APP数据,进行数据分析
In [2]:
#导入模块
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Read the data file. The statement must stay on one line with the r prefix
# attached to the literal: in a non-raw string the "\U" of the Windows path
# would be parsed as a (malformed) unicode escape.
app_data = pd.read_excel(r'C:\Users\Administrator\Desktop\apps.xlsx')
In [3]:
# 1 - Inspect the data
# Dimensions of the loaded table
app_data.shape
Out[3]:
In [4]:
# Preview the first rows of the table
app_data.head()
Out[4]:
In [5]:
# Number of non-missing values in each column
app_data.count()
Out[5]:
In [6]:
# 2 - Data cleaning: remove duplicated rows and abnormal values
# Check whether any duplicated row exists
any(app_data.duplicated())
Out[6]:
In [7]:
# Summary statistics before cleaning
app_data.describe()
Out[7]:
In [10]:
# Smallest value in the comments column
app_data.comments.min()
Out[10]:
In [11]:
# Remove duplicated rows in place, then re-check the table dimensions
app_data.drop_duplicates(inplace=True)
app_data.shape
Out[11]:
In [16]:
# Keep only the rows whose comments value is not -1
# (presumably -1 marks a missing count - confirm against the data source).
# Take an explicit copy: new columns are assigned to app_data afterwards, and
# assigning into a filtered selection triggers pandas' SettingWithCopyWarning.
app_data = app_data.loc[app_data['comments'] != -1].copy()
In [17]:
# Summary statistics after removing the -1 rows
app_data.describe()
Out[17]:
In [22]:
# Summary restricted to the object-dtype columns
app_data.describe(include = ['object'])
Out[22]:
In [23]:
# Preview the first rows again after cleaning
app_data.head()
Out[23]:
In [37]:
# Data type conversion
def transf(x):
    """Convert an install-count string to a float in units of 10,000 (万).

    Args:
        x: Text such as '5亿', '8206.6万' or a plain number like '5000'.

    Returns:
        The count expressed in 万: a value containing '亿' is multiplied by
        10000, a value containing '万' is returned as is, and any other value
        is divided by 10000.

    Raises:
        ValueError: If the numeric part cannot be parsed as a float.
    """
    text = x.strip()
    if '亿' in text:
        return float(text[:-1]) * 10000
    if '万' in text:
        return float(text[:-1])
    # No recognised unit: drop the last character only when it is not part of
    # the number, so a plain '5000' no longer loses its final digit.
    if text and not text[-1].isdigit():
        text = text[:-1]
    return float(text) / 10000
# Numeric install count, derived from the text column with transf
app_data['install_new'] = app_data['install'].apply(transf)


# Numeric size: the last two characters (the unit) are cut off; a value that
# contains 'MB' is kept as is, anything else is divided by 1024
def y(x):
    if x.find('MB') == -1:
        return float(x[:-2]) / 1024
    return float(x[:-2])


app_data['size_new'] = app_data['size'].apply(y)
In [42]:
# Numeric good-rating share: the placeholder '暂无' becomes NaN, any other
# value loses its last character and is divided by 100
def y(x):
    if x == '暂无':
        return np.nan
    return float(x[:-1]) / 100


app_data['love_new'] = app_data['love'].apply(y)
In [45]:
# Fill the missing good-rating values with the column median.
# The previous replace(app_data.love_new.median, np.nan) passed the uncalled
# method as the value to search for, so it never changed anything and the NaNs
# stayed in the column (the regression fit further down cannot handle NaN).
app_data['love_new'] = app_data['love_new'].fillna(app_data['love_new'].median())
In [56]:
# Parse the 'update' text column into datetime values using the
# year/month/day pattern given in the format string
app_data['update_new'] = pd.to_datetime(app_data['update'], format = '%Y年%m月%d日')
In [57]:
# Summary statistics including the newly derived columns
app_data.describe()
Out[57]:
In [58]:
# Build the analysis table without the four original text columns
raw_columns = ['install', 'size', 'love', 'update']
app = app_data.drop(columns=raw_columns)
In [59]:
# Preview the first rows of the analysis table
app.head()
Out[59]:
In [72]:
# Visual analysis: the five most-installed apps of each category
categories = ['商城', '团购', '优惠', '快递', '全球导购']
top_installs = []  # do not name this `list`: that would shadow the builtin
for cate in categories:
    # Plain substring match; rows with a missing category never match
    in_category = app['appcategory'].str.contains(cate, regex=False, na=False)
    # Keep the sorted top five (the sorted slice used to be thrown away, so
    # every matching app ended up in the result)
    sub = (
        app.loc[in_category, ['appname', 'install_new']]
        .sort_values(by='install_new', ascending=False)
        .head(5)
        .assign(type=cate)
    )
    top_installs.append(sub)
# Combine the per-category tables
app_install = pd.concat(top_installs)
In [66]:
# Plot setup
plt.style.use('ggplot')
# 3x2 grid: one axes per cell in the first two rows, and a fifth axes that
# spans both columns of the last row
grid = (3, 2)
ax1, ax2, ax3, ax4 = [
    plt.subplot2grid(grid, (row, col)) for row in (0, 1) for col in (0, 1)
]
ax5 = plt.subplot2grid(grid, (2, 0), colspan=2)
axes = [ax1, ax2, ax3, ax4, ax5]
types = app_install['type'].unique()
In [81]:
# Draw one bar chart per category: its five most-installed apps
# The size belongs to the whole figure, so set it once instead of per subplot
gcf = plt.gcf()
gcf.set_size_inches(8, 6)
for ax, app_type in zip(axes, types):
    # Rows of this category, limited to the five largest install counts
    data = (
        app_install.loc[app_install['type'] == app_type]
        .sort_values(by='install_new', ascending=False)
        .head(5)
    )
    # One bar per row; Axes.plot(kind='bar') does not draw bars
    positions = np.arange(len(data))
    ax.bar(positions, data['install_new'], align='center',
           color='steelblue', alpha=0.7)
    ax.set_title(app_type + '类APP下载量前5的应用', size=9)
    # Bars are centred on their positions, so the ticks go there as well
    ax.set_xticks(positions)
    ax.set_xticklabels(data['appname'], fontdict={'fontsize': 7}, color='red')
    # Hide the top, bottom and right tick marks; booleans are required because
    # the string 'off' is truthy and would switch the ticks on
    ax.tick_params(top=False, bottom=False, right=False)
# Spacing between the subplots
plt.subplots_adjust(hspace=0.6, wspace=0.3)
plt.show()
In [2]:
# The five lowest-rated apps of each category
categories = ['商城', '团购', '优惠', '快递', '全球导购']
lowest_rated = []  # do not name this `list`: that would shadow the builtin
for cate in categories:
    # Match on the loop variable; the quoted 'cate' searched for the literal
    # text "cate" and therefore selected no rows at all
    in_category = app['appcategory'].str.contains(cate, regex=False, na=False)
    # Keep the sorted bottom five (the sorted slice used to be thrown away)
    sub = (
        app.loc[in_category, ['appname', 'love_new']]
        .sort_values(by='love_new')
        .head(5)
        .assign(type=cate)
    )
    lowest_rated.append(sub)
app_love = pd.concat(lowest_rated)
# 3x2 grid: one axes per cell in the first two rows, and a fifth axes that
# spans both columns of the last row
grid = (3, 2)
ax1, ax2, ax3, ax4 = [
    plt.subplot2grid(grid, (row, col)) for row in (0, 1) for col in (0, 1)
]
ax5 = plt.subplot2grid(grid, (2, 0), colspan=2)
axes = [ax1, ax2, ax3, ax4, ax5]
types = app_love['type'].unique()
# Draw one bar chart per category: its five lowest-rated apps
# The size belongs to the whole figure, so set it once instead of per subplot
gcf = plt.gcf()
gcf.set_size_inches(8, 6)
y_ticks = np.arange(0, 0.6, 0.15)
for ax, app_type in zip(axes, types):
    # Rows of this category, limited to the five lowest good-rating shares
    data = app_love[app_love['type'] == app_type].sort_values(by='love_new').head(5)
    # One bar per row, so a category with fewer than five apps still plots
    positions = np.arange(len(data))
    ax.bar(positions, data['love_new'], align='center',
           color='steelblue', alpha=0.7)
    ax.set_title(app_type + '类APP好评率后5的应用', size=9)
    # Bars are centred on their positions, so the ticks go there as well
    ax.set_xticks(positions)
    ax.set_xticklabels(data['appname'], fontdict={'fontsize': 7}, color='red')
    # y-axis ticks, labelled as whole percentages (formatting the raw floats
    # with str() exposes rounding noise in the labels)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(['{:.0%}'.format(tick) for tick in y_ticks])
    # Hide the top, bottom and right tick marks; booleans are required because
    # the string 'off' is truthy and would switch the ticks on
    ax.tick_params(top=False, bottom=False, right=False)
# Horizontal and vertical spacing between the subplots
plt.subplots_adjust(hspace=0.6, wspace=0.3)
plt.show()
In [100]:
%matplotlib inline
import matplotlib
matplotlib.rcParams['font.family'] = 'Microsoft YaHei'
# 导入第三方模块
from sklearn.linear_model import LinearRegression
# 散点图
plt.scatter(app.comments, # 评价人数
app.love_new, # 好评率
s = 30, # 设置点的大小
c = 'black', # 设置点的颜色
marker = 'o', # 设置点的形状
alpha = 0.9, # 设置点的透明度
linewidths = 0.3, # 设置散点边界的粗细
label = '观测点'
)
# 建模
reg = LinearRegression().fit(app.comments.reshape(-1,1), app.love_new)
# 回归预测值
pred = reg.predict(app.comments.reshape(-1,1))
# 绘制回归线
plt.plot(app.comments, pred, linewidth = 2, label = '回归线')
# 显示图例
plt.legend(loc = 'lower right')
# 添加轴标签和标题
plt.title('评论人数与好评率的关系')
plt.xlabel('评论人数')
plt.ylabel('好评率')
# 去除图边框的顶部刻度和右边刻度
plt.tick_params(top = 'off', right = 'off')
# 显示图形
plt.show()