pandas库学习基础和业务经验（一）--重点推荐

近期用到的函数总结：
#分组排序功能
import pandas as pd
def test_f(df,column,istopn = False,n=1):
    """
    df:数据框
    column:为需要对之聚合的列
    istopn:返回每一组的第n行数据
    """
    count = len(df)
    distinct_count = df[column].nunique()
    sum = df[column].sum()
    avg = df[column].mean()
    if istopn == True:
        # 降序生序，自己注意
        # df.sort_values(by=column, ascending=False)[:n] # 返回前n个
        temp_data = df.sort_values(by=column, ascending=False)
        temp_data['row'] = range(len(df))
        return temp_data
    else:
        return pd.DataFrame({'count':[count],'distinct_count':[distinct_count],'sum':[sum],'avg':[avg]})
#应用函数 https://blog.csdn.net/wendaomudong_l2d4/article/details/84818133
df_sort = df.groupby(['user']).apply(test_f,column = 'amount',istopn = True)

===============================2019-5-21-总结===============================================
df_sort
df_sort.index = range(len(df_sort))
df_sort

#分组排序求值
#https://mp.weixin.qq.com/s?src=11&timestamp=1558517199&ver=1622&signature=ewVTWFGVfhJknoV9tNKp6LU3gTg8qy7whwHziCriX1k0IH5sjcmlkuMutaFbQKH*xWX7*jpJQ7UGt8roQAY7PsLobAojY1rAltgvWSG04y-J55iXkU634aehmUqOtmCn&new=1
best_rating_per_price = reviews.groupby('price')['points'].max().sort_index()
best_rating_per_price.head()
#https://blog.csdn.net/Li_qf/article/details/84852633
price_extremes = reviews.groupby('variety')['price'].agg(['max', 'min'])
price_extremes.head()

# Key pattern - filter, group and sort in one chain: http://www.cnblogs.com/fatcici2017/p/6634910.html
# Keep the rows whose 'from' column contains 'iphone 6s plus', group them by
# ('from', 'to'), count 'uid' per group as 'uv', and sort by that count in
# descending order.
# Roughly the SQL pattern: WHERE ... GROUP BY ... ORDER BY ... DESC.

# Named aggregation (uv='count') replaces the dict-renaming form
# agg({'uv': 'count'}), which pandas no longer accepts on a single column.
df[df['from'].str.contains('iphone 6s plus')].groupby(['from','to'])['uid'].agg(uv='count').sort_values(by='uv',ascending=False)

#字段拆分重命名
demo=pd.merge(df_6b, pd.DataFrame(df_6b["index"].str.split('-',expand=True)), how='left', left_index=True, right_index=True)
demo.head()
#多个字段合并成一个新字段，如果加一个常数值，则应该另做一个字段，然后再关联
df_5["index"]=df_5["端口"].str.cat([df_5["类型"],df_5["店铺"],df_5["month"]],sep = '-')
# Drop a column (remove it from the frame)
# An Index cannot be indexed by label, so df.columns["index"] raises;
# pass the column name to drop() directly.
df.drop("index", axis=1, inplace=True)
# 可以通过subset参数来删除在age和sex中含有空数据的全部行
df4 = df4.dropna(subset=["age", "sex"])
#修改列名
a.rename(columns={'A':'a', 'B':'b', 'C':'c'}, inplace = True)
#字符串替换
df_1['店铺'] = df_1['店铺'].apply(lambda x : x.replace("较前一月",""))
#关联 http://www.cnblogs.com/keye/p/10791705.html
# 基于共同列（端口、类型、店铺、month）的内连接
df7 = pd.merge(df_6a,df_6b,on=["端口","类型","店铺","month"],how='inner')
# 创建DataFrame对象
df = pd.DataFrame([1, 2, 3, 4, 5], columns=['cols'], index=['a','b','c','d','e'])
# 对DataFrame对象进行列扩充
df2['col4'] = ['cnn','rnn']

================================2019-5-23-总结====================================================
#查看数据框字段的类型
#1.少量字段
df.dtypes
#2.大量字段
cols = df.columns
for col in cols:
    print(col+' : '+ str(df[col].dtype))
#3.指定字段
cols = df.columns
for col in cols:
    if str(df[col].dtype) == 'object':
        print(col)
#4.查看每一列类型
df.info()

#5.字符串转日期并做索引
df['date'] = pd.to_datetime(df['date'])
df.set_index("date", inplace=True)
# Alternative: set the index first, then convert its type
df2.set_index("date", inplace=True)
# Convert df2's own index; using df.index here would replace df2's labels
# with those of a different frame.
df2.index = pd.DatetimeIndex(df2.index)
# Without the set_index call, the equivalent is: df2.index = pd.DatetimeIndex(df2["date"])
# 想要真正的改变数据框，通常需要通过赋值来进行，比如
df["字段名"] = df["Customer Number"].astype("int")

##通过自定义函数清理数据
def convert_currency(var):
    """
    Parse a currency-formatted string into a float.

    Thousands separators (",") and dollar signs ("$") are removed first,
    then the remainder is handed to ``float``.
    """
    cleaned = var
    for symbol in (",", "$"):
        cleaned = cleaned.replace(symbol, "")
    return float(cleaned)

# 通过replace函数将$以及逗号去掉，然后字符串转化为浮点数，让pandas选择pandas认为合适的特定类型，float或者int，该例子中将数据转化为了float64
# 通过pandas中的apply函数将2016列中的数据全部转化
df["2016"].apply(convert_currency)

# 当然可以通过lambda 函数将这个比较简单的函数一行带过
df["2016"].apply(lambda x: x.replace(",","").replace("$","")).astype("float64")
#同样可以利用lambda表达式将PercentGrowth进行数据清理
df["Percent Growth"].apply(lambda x: x.replace("%","")).astype("float")/100

# A custom function would give the same result.
# Turn the Active column into booleans: True where the value is "Y".
# The comparison already yields a boolean Series, so wrapping it in
# np.where(..., True, False) (and importing numpy for it) is unnecessary.
df["Active"] = df["Active"] == "Y"
## Reading an Excel file (the main pd.read_excel parameters)
# `io` is the path or file object to read. The keywords are spelled
# sheet_name and skipfooter; the older sheetname / skip_footer spellings
# were removed from pandas.
pd.read_excel(io, sheet_name=0, header=0, skiprows=None, skipfooter=0, index_col=None, names=None)

######字段的拆分与合并
#1.合并（字符串合并）
df_5["index"]=df_5["端口"].str.cat([df_5["类型"],df_5["店铺"],df_5["month"]],sep = '-')
df_5["index"]=df_5["端口"].str.cat(df_5["类型"],sep = '-')
#2. Concatenation when the columns hold non-string values
# .str.cat only works on strings, so cast the columns being joined. Casting
# df_5["index"] beforehand had no effect because it is overwritten here.
df_5["index"]=df_5["端口"].astype(str).str.cat(df_5["类型"].astype(str),sep = '-')
#3.字段拆分重命名
demo=pd.merge(df_6b, pd.DataFrame(df_6b["index"].str.split('-',expand=True)), how='left', left_index=True, right_index=True)
#分段函数:pd.cut 的 n 个分段边界对应 n-1 个区间，所以标签个数要比边界个数少 1（例如 6 个边界配 5 个标签）
#区间默认左开右闭；如果需要左闭右开，只需要加一个参数：right = False就可以。
df["字段"] = pd.cut(df["字段"],["阈值","阈值","阈值","阈值"],labels = ["阈值","阈值","阈值"])
#删除表中全部为NaN的行
df.dropna(axis=0,how='all')
#删除表中含有任何NaN的行
df.dropna(axis=0,how='any') #drop all rows that have any NaN values

==============================================================================================
#总结一下astype()函数有效的情形：
#数据列中的每一个单位都能简单的解释为数字(2, 2.12等）
#数据列中的每一个单位都是数值类型且向字符串object类型转换
#如果数据中含有缺失值、特殊字符astype()函数可能失效。
#使用自定义函数进行数据类型转换
#方案一
def convert_currency(value):
    """
    Convert a currency string such as '￥1,234.50' to a float.

    The '￥' sign and the thousands separators (',') are removed before parsing.

    Raises ValueError if the remaining text is not a valid number, and
    AttributeError if ``value`` has no ``replace`` method (e.g. a NaN float).
    """
    new_value = value.replace(',', '').replace('￥', '')
    # np.float was only an alias for the builtin float and has been removed
    # from NumPy, so use the builtin directly.
    return float(new_value)
data['2016'].apply(convert_currency)
#方案二
data['2016'].apply(lambda x: x.replace('￥', '').replace(',', '')).astype('float')

#import pandas as pd  日期函数的应用
dates = ['2017-01-05', 'Jan 5, 2017', '01/05/2017', '2017.01.05', '2017/01/05','20170105']
pd.to_datetime(dates)

===================参考博客http://www.cnblogs.com/onemorepoint/p/9404753.html====================================================================
#混合字符串类型替换成数值函数
def convert_currency(var):
    """
    Turn text such as "$1,234.50" into the float 1234.5.

    Steps: drop every ",", drop every "$", then convert what is left
    with ``float``.
    """
    without_commas = var.replace(",", "")
    number_text = without_commas.replace("$", "")
    return float(number_text)

#将百分数型号的字符串转化为数值类型函数
def convert_percent(value):
    """
    Convert a percentage string such as "12.5%" into its decimal fraction (0.125).

    Every "%" is removed, the rest is parsed as a float and divided by 100.
    """
    return float(value.replace('%', '')) / 100

# Clean every column while loading: each converter receives the raw cell text
# of its column and returns the cleaned value.
data2 = pd.read_csv("data.csv",
                   converters={
                               '客户编号': str,
                               '2016': convert_currency,
                               '2017': convert_currency,
                               '增长率': convert_percent,
                               '所属组': lambda x: pd.to_numeric(x, errors='coerce'),
                               # A converter is called once per cell, so a plain
                               # comparison gives a real bool; np.where on a scalar
                               # would return a 0-d array instead (and needs numpy).
                               '状态': lambda x: x == "Y"
                              },
                   encoding='gbk')


#可视图表的展示，要统计的值，横坐标，纵坐标，用的函数
mean_demo = temp_demo.pivot_table("2019年消费",index = "年",columns = "活跃度",aggfunc = "mean")
mean_demo
#tips.pivot_table(values=['tip_pct', 'size'], index=['sex', 'day'], columns='smoker')


#取列，求对应列的函数
a = temp_demo.columns
columns = ["2018年消费","2019年消费","百分占比","单元"]
temp_demo[columns].max()

#博客：blog.csdn.net/liuhehe123/article/details/85921930
#//blog.csdn.net/Li_qf/article/details/84852633  葡萄酒问题
#排序
sorted_varieties = price_extremes.sort_values(by=['min', 'max'], ascending=False)
sorted_varieties
#分组
price_extremes = reviews.groupby("variety").price.agg(['min','max'])
price_extremes 


#1.读取数据
import pandas as pd
reviews = pd.read_csv('./winemag-data-130k-v2.csv', index_col=0)
#2.谁是品酒最多的人？创建一个由taster_twitter_handle 为索引，值为每个人品酒次数的Series
reviews_written = reviews.groupby('taster_twitter_handle').size()
reviews_written
#3.创建一个以价格(price)为索引，最高评分(points)为值的Series，排序以价格递增。
best_rating_per_price = reviews.groupby('price')['points'].max().sort_index()
best_rating_per_price.head()
#4.创建一个DataFrame，以种类(variety)为索引，值为每个种类的最大值和最小值
price_extremes = reviews.groupby('variety')['price'].agg(['max', 'min'])
price_extremes.head()
#5.对上题中price_extremes进行处理，先按min降序排列，再按max降序排列。
sorted_varieties = price_extremes.sort_values(['min', 'max'], ascending=False)
sorted_varieties.head()
#6.创建一个Series，索引为品酒师(taster_name)，值为该人所有评分(points)的平均分。
reviewer_mean_ratings = reviews.groupby('taster_name').points.mean()
reviewer_mean_ratings
#7.创建一个Series，索引为多索引{country, variety}，值为个数，按值递减。
country_variety_counts = reviews.groupby(['country', 'variety']).size().sort_values(ascending=False)
country_variety_counts.head()

#8.删除空行  blog.csdn.net/houyanhua1/article/details/87855228
df4 = df4.dropna(subset=['age', 'body','home.dest'])
df.dropna(axis=0, inplace=True)   # inplace=True表示原地修改，修改后的结果直接作用于原df。 默认False
#9. Handling NaN, option 2: fill the gaps
df2 = df.fillna(100)  # replace every NaN with 100
# Fill a single column only
# df["YY"].mean() is a single number; the result here is a Series, so assign
# it back to df["YY"] if df itself should be updated.
df4 = df["YY"].fillna(df["YY"].mean())
# Fill with the column means
# df.mean() is a Series holding one mean per column (df.median() gives the
# medians); numeric_only=True skips text columns, which have no mean.
df3 = df.fillna(df.mean(numeric_only=True))
#9.基础学习 blog.csdn.net/claroja/article/details/65661826
#10. Create a new Title column
# Extract a substring from each cell: the run of letters right before a "."
# The pattern is a raw string so that "\." reaches the regex engine without
# triggering Python's invalid-escape-sequence warning.
df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=True)
pandas库学习基础和业务经验（一）--重点推荐

猜你喜欢