【数据分析】探索性数据分析(EDA)-1

任务:

探索性数据分析(EDA). 挑战目标: 这些裁判在给红牌的时候咋想的呢,会不会被跟球员的肤色有关?

%matplotlib inline
%config InlineBackend.figure_format='retina'

from __future__ import absolute_import, division, print_function
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.pyplot import GridSpec
import seaborn as sns
import numpy as np
import pandas as pd
import os, sys
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
sns.set_context("poster", font_scale=1.3)

import missingno as msno
import pandas_profiling

from sklearn.datasets import make_blobs
import time

数据简介:

数据包含球员和裁判的信息,2012-2013年的比赛数据,总共设计球员2053名,裁判3147名,特征列表如下:

https://docs.google.com/document/d/1uCF5wmbcL90qvrk_J27fWAvDcDNrO9o_APkicwRkOKc/edit

Variable Name: Variable Description:
playerShort short player ID
player player name
club player club
leagueCountry country of player club (England, Germany, France, and Spain)
height player height (in cm)
weight player weight (in kg)
position player position
games number of games in the player-referee dyad
goals number of goals in the player-referee dyad
yellowCards number of yellow cards player received from the referee
yellowReds number of yellow-red cards player received from the referee
redCards number of red cards player received from the referee
photoID ID of player photo (if available)
rater1 skin rating of photo by rater 1
rater2 skin rating of photo by rater 2
refNum unique referee ID number (referee name removed for anonymizing purposes)
refCountry unique referee country ID number
meanIAT mean implicit bias score (using the race IAT) for referee country
nIAT sample size for race IAT in that particular country
seIAT standard error for mean estimate of race IAT
meanExp mean explicit bias score (using a racial thermometer task) for referee country
nExp sample size for explicit bias in that particular country
seExp standard error for mean estimate of explicit bias measure
df = pd.read_csv("redcard.csv.gz", compression='gzip')
df.describe().T

在这里插入图片描述

all_columns = df.columns.tolist()

Create Tidy Players Table

player_index = 'playerShort'
player_cols = [#'player', # drop player name, we have unique identifier
               'birthday',
               'height',
               'weight',
               'position',
               'photoID',
               'rater1',
               'rater2',
              ]
all_cols_unique_players = df.groupby('playerShort').agg({col:'nunique' for col in player_cols})
all_cols_unique_players.head()

在这里插入图片描述

all_cols_unique_players[all_cols_unique_players > 1].dropna().shape[0] == 0
def get_subgroup(dataframe, g_index, g_columns):
    
    """Helper function that creates a sub-table from the columns and runs a quick uniqueness test."""
    g = dataframe.groupby(g_index).agg({col:'nunique' for col in g_columns})
    if g[g > 1].dropna().shape[0] != 0:
        print("Warning: you probably assumed this had all unique values but it doesn't.")
    return dataframe.groupby(g_index).agg({col:'max' for col in g_columns})
players = get_subgroup(df, player_index, player_cols)
players.head()

在这里插入图片描述

def save_subgroup(dataframe, g_index, subgroup_name, prefix='raw_'):
    save_subgroup_filename = "".join([prefix, subgroup_name, ".csv.gz"])
    dataframe.to_csv(save_subgroup_filename, compression='gzip', encoding='UTF-8')
    test_df = pd.read_csv(save_subgroup_filename, compression='gzip', index_col=g_index, encoding='UTF-8')
    # Test that we recover what we send in
    if dataframe.equals(test_df):
        print("Test-passed: we recover the equivalent subgroup dataframe.")
    else:
        print("Warning -- equivalence test!!! Double-check.")
players = get_subgroup(df, player_index, player_cols)
players.head()

在这里插入图片描述

save_subgroup(players, player_index, "players")

Create Tidy Clubs Table

club_index = 'club'
club_cols = ['leagueCountry']
clubs = get_subgroup(df, club_index, club_cols)
clubs.head()

在这里插入图片描述

clubs['leagueCountry'].value_counts()

England 48
Spain 27
France 22
Germany 21
Name: leagueCountry, dtype: int64

save_subgroup(clubs, club_index, "clubs", )

Create Tidy Referees Table

referee_index = 'refNum'
referee_cols = ['refCountry']
referees = get_subgroup(df, referee_index, referee_cols)
referees.head()

在这里插入图片描述

save_subgroup(referees, referee_index, "referees")

Create Tidy Countries Table

country_index = 'refCountry'
country_cols = ['Alpha_3', # rename this name of country
                'meanIAT',
                'nIAT',
                'seIAT',
                'meanExp',
                'nExp',
                'seExp',
               ]
countries = get_subgroup(df, country_index, country_cols)
countries.head()

在这里插入图片描述

rename_columns = {'Alpha_3':'countryName', }
countries = countries.rename(columns=rename_columns)
countries.head()

在这里插入图片描述

save_subgroup(countries, country_index, "countries")

Create separate (not yet Tidy) Dyads Table

dyad_index = ['refNum', 'playerShort']
dyad_cols = ['games',
             'victories',
             'ties',
             'defeats',
             'goals',
             'yellowCards',
             'yellowReds',
             'redCards',
            ]
dyads = get_subgroup(df, g_index=dyad_index, g_columns=dyad_cols)

在这里插入图片描述

save_subgroup(dyads, dyad_index, "dyads")
发布了116 篇原创文章 · 获赞 10 · 访问量 1339

猜你喜欢

转载自blog.csdn.net/weixin_44727383/article/details/105028415