Big-data analysis in Python: movie ratings, durations, and related statistics

Prepare relevant data

Link: https://pan.baidu.com/s/1EvuEnVhSAUghEkF5rckMoA?pwd=2222 
Extraction code: 2222

1. Using K-means to analyze the relationship between duration and rating

 Import related libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from datetime import datetime
from sklearn.model_selection import train_test_split  # split data into train and test sets
from sklearn.linear_model import LinearRegression as LR  # linear regression model
from sklearn.metrics import mean_squared_error  # MSE
from sklearn.metrics import mean_absolute_error  # MAE
from sklearn.metrics import r2_score  # R2

# Select the SimHei font; the plot titles and axis labels below are Chinese.
plt.rcParams['font.sans-serif'] = ['SimHei']

# NOTE(review): absolute, machine-specific path; adjust to where data1.csv lives.
data = pd.read_csv('C:\\Users\\wt\\Desktop\\data1.csv')

# Second CSV column; the counting loop further down reads it as genre text.
mold = data.iloc[:, 1]
# Column-wise mean of the third CSV column.
avg_rating_num = np.mean(data.iloc[:, [2]], axis=0)
# First and third CSV columns as a float32 matrix (later plotted as
# duration on the x axis and rating on the y axis).
X = data.iloc[:, [0, 2]].values.astype('float32')

Count how many movies belong to each genre

# Running tally: key -> number of times fetch() has been called with it.
label = {}


def fetch(s):
    """Increment the counter stored in ``label`` for key ``s``.

    A key seen for the first time starts at 1. Returns ``None``; the
    result is the mutation of the module-level ``label`` dict.
    """
    label[s] = label.get(s, 0) + 1
# Total record length -> (start, stop) slices of the names it contains.
# The layout is fixed-width: two-character names joined by one separator
# character, plus the special case of a single three-character name.
# Records of any other length contribute nothing to the tally.
genre_slices = {
    2: ((0, 2),),
    3: ((0, 3),),
    5: ((0, 2), (3, 5)),
    8: ((0, 2), (3, 5), (6, 8)),
    11: ((0, 2), (3, 5), (6, 8), (9, 11)),
    14: ((0, 2), (3, 5), (6, 8), (9, 11), (12, 14)),
}
for record in mold:
    print(record)
    for start, stop in genre_slices.get(len(record), ()):
        fetch(record[start:stop])

 Print the correlation coefficient between the two columns, cluster with K-means, and draw the genre pie chart

# 2x2 correlation-coefficient matrix of the two columns of X.
first_col, second_col = X[:, 0], X[:, 1]
print(np.corrcoef(first_col, second_col))

# Fit a four-cluster K-means model on X and keep each row's cluster index.
# No random_state is given, so cluster numbering may differ between runs.
clf = KMeans(n_clusters=4)
clf.fit(X)
y_pred = clf.labels_

# Pie chart of the tallies in `label`, one wedge per key.
patches, text = plt.pie(label.values(), labels=label.keys(), radius=1)

# Blank the captions of selected wedges (presumably the small ones whose
# labels would overlap). The positions are hard-coded for this dataset.
for position in (-1, -2, -3, -5, -6, -7, -8, -9, 13):
    text[position].set_text('')

# Same font size for every caption.
for caption in text:
    caption.set_size(10)

plt.title("高质量电影类型成分分析")

The pie chart above shows the proportion of each movie genre; the scatter plot below shows the K-means clusters of duration versus rating

# Split the two columns of X into separate coordinate lists.
x = list(X[:, 0])
y = list(X[:, 1])

# One 'x' marker per row, coloured by its K-means cluster index.
plt.scatter(x, y, c=y_pred, marker='x')
plt.title("Kmeans分析时长与评分")
plt.xlabel("时长")
plt.ylabel("评分")
plt.show()

 K-means clustering of duration versus rating (scatter plot above)

  2. Multiple linear regression analysis of the number of views

Collect the distinct movie genres

# Genres collected so far, seeded with two entries.
item = ['剧情', '犯罪']


def finds(iss):
    """Return the parts of ``iss`` not already accounted for in ``item``.

    ``iss`` is split on "/". For every entry of the module-level ``item``
    list, in order, the first matching part (if any) is removed. The
    remaining parts are returned in their original order.
    """
    remaining = iss.split("/")
    for known in item:
        if known in remaining:
            # list.remove drops only the first occurrence.
            remaining.remove(known)
    return remaining
# Walk every row's genre string and append the parts that finds() reports
# as not yet present in `item`.
for row_label in range(len(data.mold)):
    new_genres = finds(data.mold[row_label])
    item = item + list(new_genres)

 Custom One Hot Encoding

def my_get_dummies(ser, categories=None, sep=','):
    """One-hot encode a Series of delimiter-separated category strings.

    Parameters
    ----------
    ser : pandas.Series of str
        Each value is split on ``sep`` into category names.
    categories : sequence of str, optional
        Column names of the result. Defaults to the module-level ``item``
        list.
    sep : str, default ','
        Delimiter between category names inside one value.
        NOTE(review): ``finds`` builds ``item`` by splitting on "/"; if
        the data really uses "/", call this with ``sep="/"`` -- confirm
        against the CSV.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 frame indexed like ``ser`` with one column per
        category. A cell is 1 when that row's value contains the category.
        Names not found in ``categories`` are printed and otherwise
        ignored.
    """
    if categories is None:
        categories = item
    categories = list(categories)

    # Column position(s) of every category name; a list is kept so that a
    # name occurring more than once marks each of its columns.
    positions = {}
    for pos, name in enumerate(categories):
        positions.setdefault(name, []).append(pos)

    # Size the matrix from the inputs rather than a fixed (250, 27) shape.
    # The builtin int replaces np.int, which NumPy 1.24 removed.
    matrix = np.zeros((len(ser), len(categories)), dtype=int)
    for row, raw in enumerate(ser):
        for dirt in raw.split(sep):
            cols = positions.get(dirt)
            if cols is None:
                print(dirt)
            else:
                # Write into the array directly; chained df[col][row] = 1
                # assignment is unreliable under pandas copy-on-write.
                matrix[row, cols] = 1
    return pd.DataFrame(matrix, columns=categories, index=ser.index)
# Append the one-hot genre columns to the main frame (aligned on index).
genre_dummies = my_get_dummies(data.mold)
data = data.join(genre_dummies)
# Bare expression: displays the frame when run as a notebook cell.
data

 Modeling

# Hold-out split: 80% of the samples form the training set and the remaining 20% form the test set used to check predictive ability
 
from sklearn.model_selection import train_test_split #划分测试集与训练集
from sklearn.linear_model import LinearRegression as LR #回归模块
## Render figures inline in IPython/Jupyter
%matplotlib inline
# Set a font that can display Chinese text in plots (configured earlier via plt.rcParams['font.sans-serif'])

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Feature extraction: the one-hot genre columns.
film_type = data[item]
film_type

# Independent variables: genre columns followed by the numeric columns.
numeric_columns = [
    'duration',
    'Wtsee_people',
    'Rating_people',
    'Comments_people',
    'year',
    'rating_num',
]
X = pd.concat([film_type, data[numeric_columns]], axis=1)
# Dependent variable.
Y = data['Watching_people']
print(type(X))
# Missing feature values are replaced by 0.
X = X.fillna(0)

# 80/20 train/test split with a fixed seed.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.2, random_state=420
)

# Fit ordinary least squares on the training part.
reg = LR()
reg.fit(Xtrain, Ytrain)

# Predictions for the held-out rows.
Yhat = reg.predict(Xtest)

# Regression coefficient of every feature, then the intercept.
print(list(zip(X.columns, reg.coef_)))
print(reg.intercept_)

from sklearn.metrics import mean_squared_error  # MSE
from sklearn.metrics import mean_absolute_error  # MAE
from sklearn.metrics import r2_score  # R2

# Error metrics of the test-set predictions.
mse = mean_squared_error(Ytest, Yhat)
mae = mean_absolute_error(Ytest, Yhat)
r2 = r2_score(Ytest, Yhat)

# Adjusted R2, with n test rows and k features.
n, k = Xtest.shape
adj_r2 = 1 - (1 - r2) * ((n - 1) / (n - k - 1))

for caption, value in (
    ('MSE:', mse),
    ('MAE:', mae),
    ('R2:', r2),
    ('调整后R2:', adj_r2),
):
    print(caption, value, sep='')

Evaluate the model

 Drawing charts and evaluation results

# Plot at most the first 50 records of each series.
n = 50
shown_pred = Yhat[:n]
shown_actual = Ytrain[:n]

# Model predictions.
plt.plot(range(len(shown_pred)), shown_pred)
# Values labelled "actual" in the legend.
# NOTE(review): Yhat holds predictions for Xtest, while this curve is
# taken from Ytrain, so the two lines are not paired sample by sample --
# confirm whether that is intended.
plt.plot(range(len(shown_actual)), shown_actual)

# Figure decoration.
plt.xlabel('个例')
plt.ylabel('播放量')
plt.title('线性回归预测结果')
plt.legend(["预估", "实际"])

 Show the actual value of the test set and the predicted value of the model in the form of a line chart

# Plot at most the first 50 records of each series.
n = 50
curves = (
    Yhat[:n],   # model predictions for the test set
    Ytest[:n],  # true test-set values
)
for series in curves:
    plt.plot(range(len(series)), series)

# Figure decoration.
plt.xlabel('个例')
plt.ylabel('播放量')
plt.title('线性回归预测结果')
plt.legend(["预估", "实际"])

3. Predicting ratings with a decision tree

Compute the Pearson correlation coefficient to judge how strongly each feature correlates with the rating

from sklearn import tree  # decision-tree models
from sklearn.model_selection import train_test_split  # split data into train and test sets
from sklearn.model_selection import GridSearchCV  # search for the best model
from scipy.stats import pearsonr
# Rule of thumb for reading the strength of a correlation coefficient:
#   0.8-1.0   very strong
#   0.6-0.8   strong
#   0.4-0.6   moderate
#   0.2-0.4   weak
#   0.0-0.2   very weak or none
# Minimal usage example:
#   x = np.array([1, 3, 5])
#   y = np.array([1, 3, 4])
#   pc = pearsonr(x, y)
#   print("correlation coefficient:", pc[0])
#   print("significance level:", pc[1])

# (printed heading, feature column) pairs, in report order. The heading
# for Watching_people used to be '评分' ("rating"), which mislabelled the
# output; like every other heading, it now names the feature.
correlation_features = (
    ('时长', 'duration'),
    ('看过人数', 'Watching_people'),
    ('年份', 'year'),
    ('评价人数', 'Rating_people'),
    ('短评人数', 'Comments_people'),
    ('想看人数', 'Wtsee_people'),
)
for heading, column in correlation_features:
    # pearsonr returns (correlation coefficient, p-value).
    pccs = pearsonr(data[column], data['rating_num'])
    print(heading)
    print("相关系数:", pccs[0])
    print("显著性水平:", pccs[1])

Build the tree model

# Feature matrix. 'Watching_people' used to be listed twice; the duplicate
# column is dropped.
# NOTE(review): the duplicate may have been meant to be 'duration' -- confirm.
feature_columns = [
    'Watching_people',
    'Wtsee_people',
    'Rating_people',
    'Comments_people',
    'year',
]
X = data[feature_columns]
Y = data['rating_num']

# 90/10 train/test split with a fixed seed.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.1, random_state=420
)

# Hyper-parameter grid.
# - max_depth starts at 1: a depth of 0 is invalid and made those fits fail.
# - 'squared_error' / 'absolute_error' are the current names of the
#   criteria formerly spelled 'mse' / 'mae' (renamed in scikit-learn 1.0,
#   old spellings removed in 1.2).
tree_param = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    'max_depth': list(range(1, 10)),
}

# Grid search fits the estimator for every parameter combination and keeps
# the one with the best cross-validated score. With cv=3 the training data
# is split into 3 folds, each used once for validation while the other two
# train the model, and the 3 scores are averaged.
grid = GridSearchCV(tree.DecisionTreeRegressor(), param_grid=tree_param, cv=3)
grid.fit(Xtrain, Ytrain)

# Best parameters and best cross-validated score.
print(grid.best_params_)
print(grid.best_score_)

# Final tree (Friedman-improved MSE criterion, depth 4).
# NOTE(review): these values are hard-coded, not read from
# grid.best_params_ -- confirm they match the search result.
dtr = tree.DecisionTreeRegressor(criterion='friedman_mse', max_depth=4)
dtr.fit(Xtrain, Ytrain)

# Predicted ratings for the held-out rows.
pred = dtr.predict(Xtest)

 Plot the actual and predicted scores for the 25 test-set movies

# Single-axes figure, 15.6 x 7.2 inches.
fig, ax = plt.subplots(figsize=(15.6, 7.2))

# Predicted (red) and actual (blue) scores against test-row position.
s1 = ax.scatter(range(len(pred)), pred, facecolors="red", label='预测')
s2 = ax.scatter(range(len(Ytest)), Ytest, facecolors="blue", label='实际')
ax.legend()

 It can be observed that about 15 predicted scores are close to the true value

The error is around 0.3

Guess you like

Origin blog.csdn.net/m0_59054762/article/details/130416630