# SQL statements implemented with pandas

import pandas as pd


# Sample tipping records for the examples in this file: one list per column,
# five rows, default integer index 0..4.
df = pd.DataFrame(
    dict(
        total_bill=[16.99, 10.34, 23.68, 23.68, 24.59],
        tip=[1.01, 1.66, 3.50, 3.31, 3.61],
        sex=['Female', 'Male', 'Male', 'Male', 'Female'],
    )
)
# print(df)
# data type of columns
# print(df.dtypes)
# indexes
# print(df.index)
# return pandas.Index
# print(df.columns)
# each row, return array[array]
# print(df.values)
# a tuple representing the dimensionality of df
# print(df.shape)

'''
select
'''
# loc,基于列label,可选取特定行(根据行index);
# iloc,基于行/列的position;

# print(df.loc[1:3, ['total_bill', 'tip']])
# label slices follow the frame's column order (total_bill, tip, sex), so slice from left to right
# print(df.loc[1:3, 'total_bill': 'tip'])
# print(df.iloc[1:3, [1, 2]])
# print(df.iloc[1:3, 1:3])

# at,根据指定行index及列label,快速定位DataFrame的元素;
# iat,与at类似,不同的是根据position来定位的;

# print(df.at[3, 'tip'])
# print(df.iat[3, 1])

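# ix was a hybrid of loc and iloc that accepted both labels and positions;
# it was deprecated in pandas 0.20 and removed in pandas 1.0 -- use loc / iloc instead.
# print(df.loc[1:3, df.columns[[1, 2]]])
# print(df.loc[1:3, ['total_bill', 'tip']])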

# print(df[1: 3])
# print(df[['total_bill', 'tip']])
# 返回Series
# print(df['total_bill'])

'''
where
'''
# print(df[df['sex'] == 'Female'])
# print(df[df['total_bill'] > 20])
# 另一个写法
# print(df.query('total_bill > 20'))
#
# and
# print(df[(df['sex'] == 'Female') & (df['total_bill'] > 20)])
# # or
# print(df[(df['sex'] == 'Female') | (df['total_bill'] > 20)])
# # in
# print(df[df['total_bill'].isin([21.01, 23.68, 24.59])])
# # not (invert a boolean mask with ~)
# print(df[~(df['sex'] == 'Male')])
# print(df[~df['total_bill'].isin([21.01, 23.68, 24.59])])
# string function (illustrative only: this df has no 'app' column and sys_app is not defined here)
# print(df[~df['app'].isin(sys_app) & ~df['app'].str.contains(r'^微信\d+$')])

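# Reading one column's value from a frame that a where-condition reduced to a single row;
# three equivalent ways:
# total = df.loc[df['tip'] == 1.66, 'total_bill'].values[0]
# total2 = df[df['tip'] == 1.66]['total_bill'].values[0]
# (DataFrame.get_value was removed in pandas 1.0; .at is the scalar accessor that replaces it)
# total3 = df.at[df.loc[df['tip'] == 1.66].index.values[0], 'total_bill']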

'''
group
'''

# print(df.groupby('sex')['tip'].sum())
# print(df.groupby('sex').size())
# print(df.groupby('sex').count())
# print(df.groupby('sex')['tip'].count())
# string aggregator names avoid needing numpy, which this file does not import
# print(df.groupby('sex').agg({'tip': 'min', 'total_bill': 'sum'}))

'''
as
'''
# first implementation
# df.columns = ['total', 'pit', 'xes']
# second implementation
# df.rename(columns={'total_bill': 'total', 'tip': 'pit', 'sex': 'xes'}, inplace=True)

'''
join
'''

# 1.
# df1 = pd.DataFrame({'total_bill': [16.99, 10.34, 23.68, 23.68, 24.59, 12.32],
#                     'tip': [1.01, 1.66, 3.50, 3.31, 3.61, 3.9],
#                     'sex': ['Female', 'Male', 'Male', 'Male', 'Female','Male']})
# df2 = pd.DataFrame({'total_b': [16.99, 10.34, 23.68, 23.68, 24.59],
#                     'ti': [1.01, 1.66, 3.50, 3.31, 3.61],
#                     'sex': ['Female', 'Male', 'Male', 'Male', 'Female']})
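# DataFrame.join aligns on the index; both frames have a 'sex' column, so suffixes
# are required (without them join raises ValueError: columns overlap)
# print(df1.join(df2, how='left', lsuffix='_left', rsuffix='_right'))
# print(df1.join(df2, how='right', lsuffix='_left', rsuffix='_right'))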
# def left_join(left, right, on, right_col, default_value):
#     df = pd.merge(left, right, how='left', on=on)
#     df[right_col] = df[right_col].map(lambda x: default_value if pd.isnull(x) else x)
#     return df
# print(left_join(df1,df2,'sex','total_b',0))

# 2.
# df3 = pd.DataFrame({'total_bill': [16.99, 10.34, 23.68, 23.68, 24.59, 12.32],
#                     'tip': [1.01, 1.66, 3.50, 3.31, 3.61, 3.9],
#                     'sex': ['Female', 'Male', 'Male', 'Male', 'Female','Male']})
# df4 = pd.DataFrame({'total_b': [16.99, 10.34, 23.68, 23.68, 24.59],
#                     'ti': [1.01, 1.66, 3.50, 3.31, 3.61],
#                     'sex': ['Female', 'Male', 'Male', 'Male', 'Female']})
#
# print(pd.merge(df3, df4, how='left', on=['sex']))
# print(df3.merge(df4, how='right', on=['sex']))

'''
order
'''

# print(df.sort_values(['total_bill', 'tip'], ascending=[False, True]))

# "You may also like" (leftover footer heading from the source web page)
#
# Reposted from blog.csdn.net/a35155/article/details/81325226