【数据分析】探索性数据分析 -3

%matplotlib inline
%config InlineBackend.figure_format='retina'

from __future__ import absolute_import, division, print_function
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.pyplot import GridSpec
import seaborn as sns
import numpy as np
import pandas as pd
import os, sys
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
sns.set_context("poster", font_scale=1.3)

import missingno as msno
import pandas_profiling

from sklearn.datasets import make_blobs
import time
def save_subgroup(dataframe, g_index, subgroup_name, prefix='raw_'):
    save_subgroup_filename = "".join([prefix, subgroup_name, ".csv.gz"])
    dataframe.to_csv(save_subgroup_filename, compression='gzip', encoding='UTF-8')
    test_df = pd.read_csv(save_subgroup_filename, compression='gzip', index_col=g_index, encoding='UTF-8')
    # Test that we recover what we send in
    if dataframe.equals(test_df):
        print("Test-passed: we recover the equivalent subgroup dataframe.")
    else:
        print("Warning -- equivalence test!!! Double-check.")
def load_subgroup(filename, index_col=[0]):
    return pd.read_csv(filename, compression='gzip', index_col=index_col)

Tidy Dyads and Starting Joins

clean_players = load_subgroup("cleaned_players.csv.gz")
players = load_subgroup("raw_players.csv.gz", )
countries = load_subgroup("raw_countries.csv.gz")
referees = load_subgroup("raw_referees.csv.gz")
agg_dyads = pd.read_csv("raw_dyads.csv.gz", compression='gzip', index_col=[0, 1])

在这里插入图片描述

all(agg_dyads['games'] == agg_dyads.victories + agg_dyads.ties + agg_dyads.defeats)

True

# Sanity check passes
len(agg_dyads.reset_index().set_index('playerShort'))

146028
agg_dyads['totalRedCards'] = agg_dyads['yellowReds'] + agg_dyads['redCards']
agg_dyads.rename(columns={'redCards': 'strictRedCards'}, inplace=True)

Remove records that come from players who don’t have a skintone rating

There are a couple of ways to do this – set operations and joins are two ways demonstrated below:

clean_players.head()

agg_dyads.head()

agg_dyads.reset_index().head()

agg_dyads.reset_index().set_index('playerShort').head()
player_dyad = (clean_players.merge(agg_dyads.reset_index().set_index('playerShort'),
                                   left_index=True,
                                   right_index=True))
clean_dyads = (agg_dyads.reset_index()[agg_dyads.reset_index()
                                   .playerShort
                                   .isin(set(clean_players.index))
                                  ]).set_index(['refNum', 'playerShort'])
clean_dyads.shape, agg_dyads.shape, player_dyad.shape

Disaggregate

The dyads are currently an aggregated metric summarizing all times a particular referee-player pair play were matched. To properly handle the data, we have to disaggregate the data into a tidy/long format. This means that each game is a row.

# inspired by https://github.com/mathewzilla/redcard/blob/master/Crowdstorming_visualisation.ipynb
colnames = ['games', 'totalRedCards']
j = 0
out = [0 for _ in range(sum(clean_dyads['games']))]

for index, row in clean_dyads.reset_index().iterrows():
    n = row['games']
    d = row['totalRedCards']
    ref = row['refNum']
    player = row['playerShort']
    for _ in range(n):
        row['totalRedCards'] = 1 if (d-_) > 0 else 0
        rowlist=list([ref, player, row['totalRedCards']])
        out[j] = rowlist
        j += 1

tidy_dyads = pd.DataFrame(out, columns=['refNum', 'playerShort', 'redcard'],).set_index(['refNum', 'playerShort'])
# Ok, this is a bit crazy... tear it apart and figure out what each piece is doing if it's not clear
clean_referees = (referees.reset_index()[referees.reset_index()
                                                 .refNum.isin(tidy_dyads.reset_index().refNum
                                                                                       .unique())
                                        ]).set_index('refNum')
clean_countries = (countries.reset_index()[countries.reset_index()
                                           .refCountry
                                           .isin(clean_referees.refCountry
                                                 .unique())
                                          ].set_index('refCountry'))
发布了116 篇原创文章 · 获赞 10 · 访问量 1341

猜你喜欢

转载自blog.csdn.net/weixin_44727383/article/details/105028945