第一部分：从网络获取APP相关数据

#导入第三方包
import re
from bs4 import BeautifulSoup 
import requests
import pandas
In [3]:
#网上购物类app链接
url = r'http://www.wandoujia.com/category/5017'
#设置请求头
headers = {'Accept': '*/*', 
           'Accept-Encoding': 'gzip, deflate',
           'Accept-Language': 'zh-CN,zh;q=0.9',
           'Connection': 'keep-alive',
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
#1-抓取每一个子类的url，发送请求、解析html
rep = requests.get(url, headers = headers).text
soup = BeautifulSoup(rep, 'html.parser')
urls = soup.find_all('h2', {'class': 'app-title-h2'})
urls.head()
[<h2 class="app-title-h2">
<a class="name" href="http://www.wandoujia.com/apps/com.xunmeng.pinduoduo" title="拼多多">拼多多</a>
</h2>, <h2 class="app-title-h2">
<a class="name" href="http://www.wandoujia.com/apps/com.achievo.vipshop" title="唯品会">唯品会</a>
</h2>, <h2 class="app-title-h2">
<a class="name" href="http://www.wandoujia.com/apps/com.taobao.taobao" title="淘宝">淘宝</a>
</h2>, <h2 class="app-title-h2">
<a class="name" href="http://www.wandoujia.com/apps/com.alibaba.wireless" title="阿里巴巴">阿里巴巴</a>
</h2>, <h2 class="app-title-h2">
<a class="name" href="http://www.wandoujia.com/apps/com.tmall.wireless" title="天猫">天猫</a>

In [38]:
#商品类app的名称及链接
name = [i.text.strip() for i in urls]
#print(name)
#pattern = re.compile('href=(.*?)\s')
#re.findall(pattern, urls)

url_n = [ i.find_all('a')[0]['href'] for i in urls]
#print(url_new)
In [ ]:
#2-获取所有app的url,列出所有页面的url
url_new = []
for url in urls:
    res = requests.get(url, headers = headers).text
    soup = BeautifulSoup(res, 'html_parser')
    app_lists = soup.findAll('ul',{'id':'j-tag-list'})[0]
    url_new.extend([i.findAll('a')[0]['href']] for i in app_lists.findAll('h2', {'class':'app-title-h2'}))
In [51]:
#抓取以上app的url详细信息
install_people = []
comments = []
comments_ratio = []
for n, u in zip(name, url_new):
    #print(u)
    url_last = requests.get(u, headers = headers).text
    soup_last = BeautifulSoup(url_last, 'html.parser')
    #contents = soup_last.find_all('div', {'class' :'num-list'})
    #print(contents)
    install_people.append(soup_last.find('span', {'class': 'item install'}).find('i').text)
    #print(install_people)
    comments.append(soup_last.find('span', {'class' : 'item love'}).find('i').text.strip())
    comments_ratio.append(soup_last.find('div', {'class': 'comment-area'}).find('i').text)    
      
In [52]:
print(install_people, comments, comments_ratio, url_new, name)
['8206.6万', '7515.4万', '5亿', '7221.6万', '9546.4万', '2.1亿', '2亿', '2515.2万', '1102.2万', '8699.4万', '8471万', '566.9万', '2252.1万', '89.7万', '115.9万', '752.6万', '3832万', '1790.9万', '24.8万', '20.1万', '1306.5万', '1181.6万', '56.2万', '36.8万'] ['87.00%', '87.00%', '70.00%', '58.00%', '42.00%', '67.00%', '51.00%', '94.00%', '51.00%', '72.00%', '96.00%', '70.00%', '51.00%', '83.00%', '100.00%', '78.00%', '78.00%', '55.00%', '81.00%', '99.00%', '90.00%', '90.00%', '99.00%', '100.00%'] ['3466', '2478', '11350', '533', '351', '10755', '264', '2316', '107', '476', '2875', '33', '261', '66', '216', '289', '593', '110', '27', '104', '5182', '751', '104', '869'] ['http://www.wandoujia.com/apps/com.xunmeng.pinduoduo', 'http://www.wandoujia.com/apps/com.achievo.vipshop', 'http://www.wandoujia.com/apps/com.taobao.taobao', 'http://www.wandoujia.com/apps/com.alibaba.wireless', 'http://www.wandoujia.com/apps/com.tmall.wireless', 'http://www.wandoujia.com/apps/com.sankuai.meituan', 'http://www.wandoujia.com/apps/com.jingdong.app.mall', 'http://www.wandoujia.com/apps/com.wuba.zhuanzhuan', 'http://www.wandoujia.com/apps/com.kuaibao.kuaidi', 'http://www.wandoujia.com/apps/com.suning.mobile.ebuy', 'http://www.wandoujia.com/apps/com.dianping.v1', 'http://www.wandoujia.com/apps/com.ymt360.app.mass', 'http://www.wandoujia.com/apps/com.cainiao.wireless', 'http://www.wandoujia.com/apps/com.lechuang.quanbaobei', 'http://www.wandoujia.com/apps/com.desire.tonight', 'http://www.wandoujia.com/apps/com.xingin.xhs', 'http://www.wandoujia.com/apps/com.mogujie', 'http://www.wandoujia.com/apps/com.dangdang.buy2', 'http://www.wandoujia.com/apps/com.taobao.litetao', 'http://www.wandoujia.com/apps/com.fanlishengqianlianmengw', 'http://www.wandoujia.com/apps/com.taobao.etao', 'http://www.wandoujia.com/apps/com.jym.mall', 'http://www.wandoujia.com/apps/com.xiaomi.youpin', 'http://www.wandoujia.com/apps/com.chuangnian.shenglala'] ['拼多多', '唯品会', '淘宝', '阿里巴巴', '天猫', '美团', '京东', '转转', '微快递', '苏宁易购', '大众点评', '一亩田', '菜鸟裹裹', '券宝贝', '夜欲两性情趣商城', '小红书', '蘑菇街', '当当', '淘宝特价版', '返利省钱联盟', '一淘', '交易猫', '米家有品', '省啦啦']
In [60]:
#3-存储数据、导出数据
import pandas as pd
app_data = pd.DataFrame({'app_name':name,
                        'install':install_people,
                        'commments':comments,
                        'goodcomments':comments_ratio})
app_data.to_csv('apps.csv', index = False, encoding = 'utf-8')
In [59]:
app_data.head()
Out[59]:
app_name	commments	goodcomments	install
0	拼多多	87.00%	3466	8206.6万
1	唯品会	87.00%	2478	7515.4万
2	淘宝	70.00%	11350	5亿
3	阿里巴巴	58.00%	533	7221.6万
4	天猫	42.00%	351	9546.4万

第二部分：对APP数据，进行数据分析

     In [2]: 
   

#导入模块
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#读取文件 app_data = pd . read_excel ( r 'C:\Users\Administrator\Desktop\apps.xlsx' )

     In [3]: 
   

#1-数据查看
#数据描述
app_data.shape

       Out[3]: 
     

(1204, 10)

     In [4]: 
   

app_data.head()

       Out[4]: 
     

	appcategory	appname	comments	company	install	love	platform	size	update	version
0	网上购物-商城	拼多多	1921	上海寻梦信息技术有限公司	3841.9万	95.00%	Android 4.0.3 以上	13.35MB	2017年10月11日	3.41.0
1	网上购物-商城-优惠	寺库奢侈品	1964	北京寺库商贸有限公司	175.4万	100.00%	Android 4.1.x 以上	17.21MB	2017年09月30日	5.8.7
2	网上购物-商城	淘宝	14244	淘宝（中国）软件有限公司	4.6亿	68.00%	Android 4.0.2 以上	73.78MB	2017年10月13日	7.1.0
3	网上购物-商城	国美	271	国美在线电子商务有限公司	1123.1万	61.00%	Android 4.0.2 以上	59.44MB	2017年10月11日	5.0.5
4	网上购物-商城	阿里巴巴(alibaba)1688批发采购带直播的购物软件	419	阿里巴巴（中国）网络技术有限公司	4560.8万	50.00%	Android 4.0.2 以上	31.45MB	2017年09月20日	6.22.1.0

     In [5]: 
   

app_data.count()

       Out[5]: 
     

appcategory    1204
appname        1204
comments       1204
company        1037
install        1204
love           1204
platform       1204
size           1204
update         1204
version        1204
dtype: int64

     In [6]: 
   

#2-数据清洗，重复值和异常值剔除
#查找是否存在重复
any(app_data.duplicated())

       Out[6]: 
     

True

     In [7]: 
   

app_data.describe()

       Out[7]: 
     

	comments
count	1204.000000
mean	223.665282
std	1015.513950
min	-1.000000
25%	0.000000
50%	1.500000
75%	43.250000
max	14244.000000

     In [10]: 
   

app_data.comments.min()

       Out[10]: 
     

-1

     In [11]: 
   

#去除重复值
app_data.drop_duplicates(inplace=True)
app_data.shape

       Out[11]: 
     

(1112, 10)

     In [16]: 
   

#app_data = app_data[app_data.comments != -1]
app_data = app_data.loc[app_data.comments !=-1,]

     In [17]: 
   

app_data.describe()

       Out[17]: 
     

	comments
count	1110.000000
mean	163.337838
std	834.943282
min	0.000000
25%	0.000000
50%	1.000000
75%	25.750000
max	14244.000000

     In [22]: 
   

app_data.describe(include = ['object'])

       Out[22]: 
     

	appcategory	appname	company	install	love	platform	size	update	version
count	1110	1110	946	1110	1110	1110	1110	1110	1110
unique	28	1088	861	726	68	17	927	405	572
top	网上购物-快递	快递查询	来自互联网	1.1万	暂无	Android 4.0.2 以上	2.17MB	2014年09月02日	1.0
freq	210	6	8	20	642	261	5	63	69

     In [23]: 
   

app_data.head()

       Out[23]: 
     

	appcategory	appname	comments	company	install	love	platform	size	update	version
0	网上购物-商城	拼多多	1921	上海寻梦信息技术有限公司	3841.9万	95.00%	Android 4.0.3 以上	13.35MB	2017年10月11日	3.41.0
1	网上购物-商城-优惠	寺库奢侈品	1964	北京寺库商贸有限公司	175.4万	100.00%	Android 4.1.x 以上	17.21MB	2017年09月30日	5.8.7
2	网上购物-商城	淘宝	14244	淘宝（中国）软件有限公司	4.6亿	68.00%	Android 4.0.2 以上	73.78MB	2017年10月13日	7.1.0
3	网上购物-商城	国美	271	国美在线电子商务有限公司	1123.1万	61.00%	Android 4.0.2 以上	59.44MB	2017年10月11日	5.0.5
4	网上购物-商城	阿里巴巴(alibaba)1688批发采购带直播的购物软件	419	阿里巴巴（中国）网络技术有限公司	4560.8万	50.00%	Android 4.0.2 以上	31.45MB	2017年09月20日	6.22.1.0

     In [37]: 
   

#数据类型变化
#app_data.install.str.find('亿')
def transf(x):
    if x.find('亿') != -1:
        y = float(x[:-1])*10000
    elif x.find('万') != -1:
        y = float(x[:-1])
    else:
        y = float(x[:-1])/10000
    return y
app_data['install_new'] = app_data.install.apply(transf)

y = lambda x: float(x[:-2]) if x.find('MB') != -1 else float(x[:-2])/1024 
app_data['size_new'] = app_data['size'].apply(y)

     In [42]: 
   

y = lambda x: np.nan if x == '暂无' else float(x[:-1])/100
app_data['love_new'] = app_data['love'].apply(y)

     In [45]: 
   

#app_data['love_new'] = app_data['love_new'].fillna(app_data.love_new.median())
app_data['love_new'] = app_data['love_new'].replace(app_data.love_new.median, np.nan)

     In [56]: 
   

app_data['update_new'] = pd.to_datetime(app_data['update'], format = '%Y年%m月%d日')

     In [57]: 
   

app_data.describe()

       Out[57]: 
     

	comments	install_new	size_new	love_new
count	1110.000000	1110.000000	1110.000000	1110.000000
mean	163.337838	198.525721	12.919243	0.839369
std	834.943282	1713.697161	13.429087	0.169127
min	0.000000	0.000400	0.093457	0.020000
25%	0.000000	0.029200	4.607500	0.890000
50%	1.000000	3.100000	9.650000	0.890000
75%	25.750000	26.325000	17.415000	0.890000
max	14244.000000	46000.000000	247.060000	1.000000

     In [58]: 
   

#去除不重要的因素
app = app_data.drop(['install', 'size', 'love', 'update'], axis = 1)

     In [59]: 
   

app.head()

       Out[59]: 
     

	appcategory	appname	comments	company	platform	version	install_new	size_new	love_new	update_new
0	网上购物-商城	拼多多	1921	上海寻梦信息技术有限公司	Android 4.0.3 以上	3.41.0	3841.9	13.35	0.95	2017-10-11
1	网上购物-商城-优惠	寺库奢侈品	1964	北京寺库商贸有限公司	Android 4.1.x 以上	5.8.7	175.4	17.21	1.00	2017-09-30
2	网上购物-商城	淘宝	14244	淘宝（中国）软件有限公司	Android 4.0.2 以上	7.1.0	46000.0	73.78	0.68	2017-10-13
3	网上购物-商城	国美	271	国美在线电子商务有限公司	Android 4.0.2 以上	5.0.5	1123.1	59.44	0.61	2017-10-11
4	网上购物-商城	阿里巴巴(alibaba)1688批发采购带直播的购物软件	419	阿里巴巴（中国）网络技术有限公司	Android 4.0.2 以上	6.22.1.0	4560.8	31.45	0.50	2017-09-20

     In [72]: 
   

#数据的可视化分析
list = []
categories = ['商城', '团购', '优惠', '快递', '全球导购']
for cate in categories:
    sub = app.loc[app.appcategory.apply(lambda x: x.find(cate) != -1), [ 'appname', 'install_new']]
    sub.sort_values(by = ['install_new'], ascending = False)[:5]
    sub['type'] = cate
    list.append(sub)
#合并数据集
app_install = pd.concat(list)

     In [66]: 
   

#设置绘图
plt.style.use('ggplot')
ax1 = plt.subplot2grid((3,2),(0,0))
ax2 = plt.subplot2grid((3,2),(0,1))
ax3 = plt.subplot2grid((3,2),(1,0))
ax4 = plt.subplot2grid((3,2),(1,1))
ax5 = plt.subplot2grid((3,2),(2,0), colspan=2)
axes = [ax1, ax2,ax3,ax4,ax5]
types = app_install.type.unique()

     In [81]: 
   

#绘制5张图，各类APP下载量前五的应用
for i in range(5):    
    # 准备绘图数据
    data = app_install.loc[app_install.type == types[i]]    
    # 绘制条形图
    #axes[i].bar(, data.install_new, color = 'steelblue', alpha = 0.7)    
    axes[i].plot(kind = 'bar')
    # 设置图框大小
    gcf = plt.gcf()
    gcf.set_size_inches(8, 6)    
    # 添加标题
    axes[i].set_title(types[i]+'类APP下载量前5的应用', size = 9)    
    # 设置刻度位置
    axes[i].set_xticks(np.arange(5) + 0.4)    
    # 为刻度添加标签值
    axes[i].set_xticklabels(data.appname, fontdict={'fontsize':7},  color = 'red')    
    # 删除各子图上、右和下的边界刻度标记
    axes[i].tick_params(top = 'off', bottom = 'off', right = 'off')
    
plt.subplots_adjust(hspace = 0.6, wspace = 0.3)
plt.show()

     In [2]: 
   

#各类APP中好评率最低的五个
list=[]
categories = ['商城', '团购', '优惠', '快递', '全球导购']
for cate in categories:
    sub = app.loc[app.appcategory.apply(lambda x: x.find('cate') != -1), ['appname', 'love_new']]
    sub.sort_values(by = ['love_new'])[:5]
    sub['type'] = cate
    list.append(sub)
app_love = pd.concat(list)

ax1 = plt.subplot2grid((3,2),(0,0))
ax2 = plt.subplot2grid((3,2),(0,1))
ax3 = plt.subplot2grid((3,2),(1,0))
ax4 = plt.subplot2grid((3,2),(1,1))
ax5 = plt.subplot2grid((3,2),(2,0),colspan = 2)

axes = [ax1, ax2, ax3, ax4, ax5]
types = app_love.type.unique()
for i in range(5):
    data = app_love[app_love.type == types[i]]
    axes[i].bar(range(5), data.love_new, color = 'steelblue', alpha = 0.7)
    gcf = plt.gcf()
    gcf.set_size_inches(8,6)
    axes[i].set_title(types[i] + '类APP好评率后5的应用', size = 9)
    axes[i].set_xticks(np.arange(5) + 0.4)
    axes[i].set_xticklabels(data.appname, fontdict={'fontsize':7},  color = 'red')    
    # 设置y轴刻度位置
    axes[i].set_yticks(np.arange(0,0.6,0.15))    
    # 为y轴刻度添加标签值
    axes[i].set_yticklabels([str(i*100) + '%' for i in np.arange(0,0.6,0.15)])    
    # 删除各子图上、右和下的边界刻度标记
    axes[i].tick_params(top = 'off', bottom = 'off', right = 'off')
    
# 调整子图之间的水平间距和高度间距
plt.subplots_adjust(hspace=0.6, wspace=0.3)
plt.show()

     In [100]: 
   

%matplotlib inline
import matplotlib
matplotlib.rcParams['font.family'] = 'Microsoft YaHei'
# 导入第三方模块
from sklearn.linear_model import LinearRegression


# 散点图
plt.scatter(app.comments, # 评价人数
            app.love_new, # 好评率
            s = 30, # 设置点的大小 
            c = 'black', # 设置点的颜色
            marker = 'o', # 设置点的形状
            alpha = 0.9, # 设置点的透明度
            linewidths = 0.3, # 设置散点边界的粗细
            label = '观测点'
            )

# 建模
reg = LinearRegression().fit(app.comments.reshape(-1,1), app.love_new)

# 回归预测值
pred = reg.predict(app.comments.reshape(-1,1))

# 绘制回归线
plt.plot(app.comments, pred, linewidth = 2, label = '回归线')

# 显示图例
plt.legend(loc = 'lower right')
# 添加轴标签和标题
plt.title('评论人数与好评率的关系')
plt.xlabel('评论人数')
plt.ylabel('好评率')

# 去除图边框的顶部刻度和右边刻度
plt.tick_params(top = 'off', right = 'off')

# 显示图形
plt.show()

APP应用市场数据分析

猜你喜欢