Hands-on data analysis project: Eggshell Apartment complaint analysis

Abstract: This year's epidemic has accelerated the collapse of long-term rental apartment operators. Eggshell Apartment, which I was renting from, kept appearing in negative news, so when my contract expired in early October I did not dare to renew the lease and checked out instead. The deposit refund is first credited to the app and then has to be withdrawn, and the app says the withdrawal will arrive within 14 working days. As of 2020-11-10 (lease cancelled and withdrawal requested on 2020-11-07), a month has passed and the money has still not arrived, and customer service could not be reached about the payment. As a last resort I filed complaints on Black Cat (Heimao, Sina's complaint platform) and on 12315. While filing them I found that there were already more than 25,000 complaints, so I crawled the complaints about Eggshell Apartment posted on Black Cat and analyzed them. The result is this complete hands-on data analysis project, from data acquisition to a simple analysis of the data.

If you have the same problem, you can file a complaint as well:
Black Cat complaints
12315 complaints

1. Data acquisition

import requests,time
import pandas as pd
import numpy as np
requests.packages.urllib3.disable_warnings()  # 屏蔽https请求证书验证警告
from fake_useragent import UserAgent  # 生成随机请求头


# uid请求数据,数据格式较为规范,方便处理
def request_data_uid(req_s, couid, page, total_page, page_size=10, timeout=30):
    """Fetch one page of complaints received by a merchant, as a DataFrame.

    Queries the ``received_complaints`` endpoint of tousu.sina.com.cn by
    merchant ID and flattens the ``main`` record of every complaint.

    Args:
        req_s: Session object; only ``req_s.get(...)`` is called on it.
        couid: Merchant ID, sent as the ``couid`` query parameter.
        page: Page number to request.
        total_page: Total number of pages; used only in the progress message.
        page_size: Number of records requested per page. The previous code
            sent ``page * 10``, so the page size grew together with the page
            number and consecutive requests no longer lined up on 10-record
            boundaries; a constant size keeps the pagination contiguous.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang a multi-thousand-page crawl.

    Returns:
        pandas.DataFrame with the columns 投诉日期 (date, ``YYYY-MM-DD``),
        投诉编号 (``sn``), 投诉问题 (``title``), 投诉诉求 (``appeal``) and
        详细说明 (``summary``); one row per complaint on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON payload lacks the expected keys.
    """
    params = {
        'couid': couid,          # merchant ID
        'type': '1',
        'page_size': page_size,  # records per page (constant)
        'page': page,            # page number
    }
    print(f"正在爬取第{page}页,共计{total_page}页,剩余{total_page-page}页")
    url = 'https://tousu.sina.com.cn/api/company/received_complaints'

    # Send a randomly chosen User-Agent with every request.
    header = {'user-agent': UserAgent().random}
    # NOTE(review): verify=False disables TLS certificate verification; kept
    # because the original script relies on it.
    res = req_s.get(url, headers=header, params=params, verify=False,
                    timeout=timeout)
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError later.
    res.raise_for_status()
    info_list = res.json()['result']['data']['complaints']

    result = []
    for info in info_list:
        _data = info['main']

        # Complaint date. NOTE(review): localtime() uses the timezone of the
        # machine running the crawl - confirm this matches the site's dates.
        timestamp = float(_data['timestamp'])
        date = time.strftime("%Y-%m-%d", time.localtime(timestamp))

        # sn: complaint number, title: complaint issue,
        # appeal: complaint demand, summary: detailed description
        result.append([date, _data['sn'], _data['title'],
                       _data['appeal'], _data['summary']])

    return pd.DataFrame(
        result,
        columns=["投诉日期", "投诉编号", "投诉问题", "投诉诉求", "详细说明"],
    )


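# Request data by keywords; the returned data is comparatively messy.
# Companies without a registered merchant ID (such as 紫梧桐) can only be searched by keywords.
# Merchants that do have a uid (such as 蛋壳公寓) can also be requested by keywords.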

def request_data_keywords(req_s, keyword, page, total_page, page_size=10,
                          timeout=30):
    """Fetch one page of complaint search results for a keyword, as a DataFrame.

    Queries the keyword search endpoint of tousu.sina.com.cn and flattens the
    ``main`` record of every hit.

    Args:
        req_s: Session object; only ``req_s.get(...)`` is called on it.
        keyword: Search term, sent as the ``keywords`` query parameter.
        page: Page number to request.
        total_page: Total number of pages; used only in the progress message.
        page_size: Number of records requested per page. The previous code
            sent ``page * 10``, so the page size grew together with the page
            number and consecutive requests no longer lined up on 10-record
            boundaries; a constant size keeps the pagination contiguous.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl.

    Returns:
        pandas.DataFrame with the columns 投诉日期 (date, ``YYYY-MM-DD``),
        投诉编号 (``sn``), 投诉问题 (``title``), 投诉诉求 (``appeal``) and
        详细说明 (``summary``); one row per search hit on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON payload lacks the expected keys.
    """
    params = {
        'keywords': keyword,     # search keyword
        'type': '1',
        'page_size': page_size,  # records per page (constant)
        'page': page,            # page number
    }
    print(f"正在爬取第{page}页,共计{total_page}页,剩余{total_page-page}页")
    url = 'https://tousu.sina.com.cn/api/index/s?'

    # Send a randomly chosen User-Agent with every request.
    header = {'user-agent': UserAgent().random}
    # NOTE(review): verify=False disables TLS certificate verification; kept
    # because the original script relies on it.
    res = req_s.get(url, headers=header, params=params, verify=False,
                    timeout=timeout)
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError later.
    res.raise_for_status()
    info_list = res.json()['result']['data']['lists']

    result = []
    for info in info_list:
        _data = info['main']

        # Complaint date. NOTE(review): localtime() uses the timezone of the
        # machine running the crawl - confirm this matches the site's dates.
        timestamp = float(_data['timestamp'])
        date = time.strftime("%Y-%m-%d", time.localtime(timestamp))

        # sn: complaint number, title: complaint issue,
        # appeal: complaint demand, summary: detailed description
        result.append([date, _data['sn'], _data['title'],
                       _data['appeal'], _data['summary']])

    return pd.DataFrame(
        result,
        columns=["投诉日期", "投诉编号", "投诉问题", "投诉诉求", "详细说明"],
    )


# Create one HTTP session and reuse it for every request.
req_s = requests.Session()

# Eggshell Apartment (蛋壳公寓), requested by merchant ID.
# Pages are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every iteration.
total_page = 2507
pages = [
    request_data_uid(req_s, '5350527288', page, total_page)
    for page in range(1, total_page + 1)
]
result = pd.concat(pages)
result['投诉对象'] = "蛋壳公寓"
result.to_csv("蛋壳公寓投诉数据.csv", index=False)

# 紫梧桐, requested by keyword search.
# 蛋壳公寓 is the brand name; the registered company name is
# 紫梧桐资产管理有限公司.
total_page = 56
pages = [
    request_data_keywords(req_s, '紫梧桐', page, total_page)
    for page in range(1, total_page + 1)
]
result = pd.concat(pages)
result['投诉对象'] = "紫梧桐"
result.to_csv("紫梧桐投诉数据.csv", index=False)

Insert picture description here
Insert picture description here

2. Data cleaning and plotting

import os,re
import pandas as pd
import numpy as np


# Data cleaning: the keyword crawl leaves stray characters in the complaint
# titles, so keep only CJK characters (U+4E00-U+9FA5) and digits.
data_path = os.path.join('data','紫梧桐投诉数据.csv')
data = pd.read_csv(data_path)
pattern = re.compile(r'[^\u4e00-\u9fa5\d]')
# read_csv returns missing titles as NaN (a float); re.sub would raise
# TypeError on those, so only string cells are rewritten.
data['投诉问题'] = data['投诉问题'].apply(
    lambda x: pattern.sub('', x) if isinstance(x, str) else x
)
# utf_8_sig writes a BOM at the start of the file.
data.to_csv(data_path, index=False, encoding="utf_8_sig")


# Merge every crawled CSV in the data directory into one file.
merged_path = "data/合并后蛋壳投诉数据.csv"
merged_name = os.path.basename(merged_path)
frames = []
# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for wj in sorted(os.listdir('data')):
    # Skip the merged output itself (otherwise a re-run would read it back in
    # and duplicate every row) and anything that is not a CSV file.
    if wj == merged_name or not wj.lower().endswith('.csv'):
        continue
    frames.append(pd.read_csv(os.path.join('data', wj)))
# One concat instead of DataFrame.append, which was removed in pandas 2.0.
result = pd.concat(frames)
result.to_csv(merged_path, index=False, encoding="utf_8_sig")

# Read the merged data back.
data = pd.read_csv(merged_path)

# Keep only data up to "yesterday" so that every day in the set is complete.
# Dates are ISO-formatted strings, so string comparison orders them correctly.
data = data[data.投诉日期<='2020-11-09']
print(f"截至2020-11-09之前,黑猫投诉累计收到蛋壳公寓相关投诉共计 {len(data)} 条")

Insert picture description here

# Time distribution: number of complaints per day.
_data = (
    data.groupby('投诉日期')['投诉编号']
    .count()
    .reset_index(name='投诉数量')
)
dates = _data.投诉日期  # ISO date strings, so string comparison orders them

# The buckets below are contiguous and non-overlapping. Previously 2020-02-21
# was counted both as a daily bar and in the aggregated bucket, and 2020-01-31
# fell into no bucket at all.

# Up to and including 2020-01-30: a single aggregated bar.
num1 = _data[dates <= '2020-01-30'].投诉数量.sum()
data0 = pd.DataFrame([['2020-01-30之前', num1]], columns=['投诉日期', '投诉数量'])

# 2020-01-31 ~ 2020-02-21: one bar per day.
data1 = _data[(dates > '2020-01-30') & (dates <= '2020-02-21')]

# 2020-02-22 ~ 2020-11-05: a single aggregated bar.
num2 = _data[(dates > '2020-02-21') & (dates <= '2020-11-05')].投诉数量.sum()
data3 = pd.DataFrame([['2020-02-22 ~ 2020-11-05', num2]], columns=['投诉日期', '投诉数量'])

# 2020-11-06 is reported on its own and left out of the chart data.
# .sum() yields 0 when the day has no rows, where .iloc[0, 1] raised IndexError.
outlier_count = _data.loc[dates == '2020-11-06', '投诉数量'].sum()
print(f"2020-11-06当天投诉量{outlier_count}条")

# 2020-11-07 ~ 2020-11-09 (the data set ends on 2020-11-09): one bar per day.
data2 = _data[(dates > '2020-11-06') & (dates <= '2020-11-09')]

new_data = pd.concat([data0, data1, data3, data2])

Insert picture description here

'''配置绘图参数'''
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['font.size']=18
plt.rcParams['figure.figsize']=(12,8)
plt.style.use("ggplot")
new_data.set_index('投诉日期').plot(kind='bar') # 剔除了2020-11-06的数据,24093条

Before 2020-01-30 the number of complaints was normal, with only an occasional one or two. In February the number of complaints rose noticeably because of the epidemic. Likely causes are that cleaning could not be carried out during the epidemic, the disputes over epidemic rent subsidies, and negative news such as the collapse of long-term rental apartment operators and rumors of Eggshell's bankruptcy, all of which made tenants nervous.

The number of complaints from 2020-02-21 to 2020-11-05 is normal, slightly more than before 2020-01-30, and it is still within the acceptable range of normal operations

Insert picture description here
On 2020-11-06 there were suddenly more than 24,000 complaints. Because this outlier distorts the chart, it was removed from the plot and is reported separately. I checked the news to see whether anything major had happened, and indeed it had: according to a 36氪 (36Kr) report on 2020-11-06, a company affiliated with Eggshell Apartment had been listed as a person subject to enforcement, with an enforcement target of more than 5.19 million yuan.

Insert picture description here
Since then, on the 7th, 8th and 9th, the number of complaints about Eggshell on Black Cat has kept growing by 200-300 per day. It seems the official denial of the Eggshell bankruptcy rumors was nonsense; perhaps it was never a rumor. Queuing up to collect debts, as happened with Ofo, is not a groundless prospect.

And the above is only the complaint data obtained from Black Cat. How many more users never filed a complaint and simply accepted their bad luck?

Next, let’s take a look. What are the main complaints from users? What is the main appeal?

3. Word cloud generation

import jieba# 分词模块
import re
import collections
import PIL.Image as img# pip install PIL
from wordcloud import WordCloud
import PIL.Image as img# pip install PIL
from wordcloud import WordCloud

def _save_wordcloud(texts, out_file, font_path='C:\\Windows\\Fonts\\msyh.ttc'):
    """Segment *texts* with jieba and save the resulting word cloud image.

    Args:
        texts: Iterable of cell values from one column. Non-string values
            (read_csv returns missing cells as NaN floats) are skipped; the
            previous string concatenation raised TypeError on them.
        out_file: Path of the image file to write.
        font_path: Font file used for rendering. The default is a Windows
            font path taken from the original script; pass another font that
            covers Chinese characters when running elsewhere.

    Returns:
        The generated WordCloud object.
    """
    # One join instead of repeated `+`, which is quadratic over many rows.
    all_word = ''.join(text for text in texts if isinstance(text, str))
    # jieba splits the text into words; WordCloud expects them space-separated.
    words = jieba.cut(all_word)
    cloud = WordCloud(
        width=800, height=600, background_color='white',
        font_path=font_path,
        max_font_size=500, min_font_size=20,
    ).generate(' '.join(words))
    cloud.to_file(out_file)
    return cloud


# One word cloud per text column, in the original order: complaint details,
# complaint titles, complaint demands. Columns are addressed by name instead
# of by position (formerly line[4], line[2], line[3]).
for column, out_file in (
    ('详细说明', '蛋壳公寓投诉详情.png'),
    ('投诉问题', '蛋壳公寓投诉问题.png'),
    ('投诉诉求', '蛋壳公寓投诉诉求.png'),
):
    wordcloud = _save_wordcloud(data[column], out_file)

Eggshell apartment complaint details word cloud

The complaint details show that the main complaints are: cash withdrawal (probably the same problem as mine, withdrawing the deposit); activity cashback (a certain amount was to be returned each month, but apart from the first two months it was never paid on time afterwards, and since customer service could not be reached I stopped paying much attention to it); and, beyond that, unreachable customer service, cleaning issues, and so on. If the company had faced these problems head-on, there might not have been so many complaints. The most unbearable thing is that, apart from the official 400 hotline being easy to reach when you are first looking for a room, it is basically impossible to contact customer service afterwards.

Insert picture description here
Eggshell apartment complaint appeal word cloud diagram

The complainants' main demands are that Eggshell Apartment be given the corresponding penalties,
and that refunds and compensation be paid.

Insert picture description here
Eggshell apartment complaint problem word cloud illustration

The complaint issue, i.e. the title of the complaint,
shows the same picture. The main issues are cash withdrawal and activity cashback, along with some cleaning problems.

Insert picture description here

Guess you like

Origin blog.csdn.net/qq_35866846/article/details/109601322