1、where
sql:
-- filter rows: date range, membership, negated membership, prefix match,
-- null test, and null-defaulting via case.
-- (the original also had "position_dt >= 20170704", which contradicted the
-- between range and the pandas translation below; dropped, and the stray
-- trailing ")" removed.)
select * from temp1
where position_dt between 20170701 and 20170704
and sk_f_prod in ('pd001', 'pd002')
and secu_type_code not in ('期货', '远期')
and account_project_name like '证券投资%'
and (mkt_suspension_info is null or secu_mkt_code = '001')
and case when prod_name is null then '无名' else prod_name end = '无名'
pandas:
# pandas translation of the WHERE clause above.
df = df[(df['position_dt'] >= 20170701)
        & (df['position_dt'] <= 20170704)
        & (df['sk_f_prod'].isin(['pd001', 'pd002']))
        & (~df['secu_type_code'].isin(['期货', '远期']))
        # SQL "like '证券投资%'" is a PREFIX match — use startswith;
        # str.contains would also match mid-string occurrences.
        & (df['account_project_name'].str.startswith('证券投资'))
        & (df['mkt_suspension_info'].isnull() | (df['secu_mkt_code'] == '001'))
        # "case when prod_name is null then '无名'": NaN is truthy in Python,
        # so "x if x else ..." would never replace NaN — test with pd.isnull.
        & (df['prod_name'].map(lambda x: '无名' if pd.isnull(x) else x) == '无名')]
2、group by:求最大值、求和、求数量
sql:
-- group by with max / sum and a conditional count.
-- NOTE(review): count() counts every non-null value, and the case below
-- always yields 0, 1 or 2 (never null), so quantity equals the group's row
-- count; if the 0/1/2 were meant as weights, sum(case ...) is the form to
-- use — confirm intent.
select position_dt,
sk_f_prod,
max(locc_position_cost),
sum(orgc_posi_mkt_val),
count(case
when substr(subj_code, 1, 4) = '1102' then 1
when secu_var = '股票品种' and secu_inner_code like '%600038%' then 0
else 2
end) quantity
from sql_where
group by position_dt, sk_f_prod
pandas:
# pandas translation of the conditional count above.
# Inside apply(axis=1), x.secu_inner_code is a plain Python string, so use
# the "in" operator; .str.contains exists only on a Series and raises
# AttributeError on a scalar.
df['amount'] = df.apply(
    lambda x: 1 if x.subj_code[:4] == '1102'
    else (0 if x.secu_var == '股票品种' and '600038' in x.secu_inner_code else 2),
    axis=1)
df = df.groupby(['sk_f_prod', 'position_dt']).agg(
    {'locc_position_cost': 'max', 'orgc_posi_mkt_val': 'sum', 'amount': 'count'})
3、order by, top
sql:
-- top-3 rows by sk_primary_key descending, via a row_number() window
-- (the rn column is kept in the output by design of the example)
select * from
(select row_number() over(order by sk_primary_key desc) rn,a.* from sql_where a) b
where rn <= 3
pandas:
# SQL "order by ... desc, top 3": sort descending, then keep the first 3 rows
ordered = df.sort_values(by='sk_primary_key', ascending=False)
df = ordered.iloc[:3]
4、distinct
sql:
-- distinct values of sk_f_prod; group by with no aggregates is the
-- classic equivalent of select distinct
select sk_f_prod from sql_where group by sk_f_prod
pandas:
# SQL distinct: the unique values of the column, as an ndarray
print(pd.unique(df['sk_f_prod']))
5、replace
sql:
-- value replacement: map the exact value '证券投资' to '123',
-- leave everything else unchanged
select case
           when subj_class = '证券投资' then '123'
           else subj_class
       end subj_class
from sql_where a
pandas:
# SQL case-replace: swap the exact value '证券投资' for '123'
print(df['subj_class'].replace({'证券投资': '123'}))
6、累加和累乘
sql:
-- running total and running product in sk_primary_key order.
-- the product uses the exp(sum(ln(x))) identity since SQL has no product
-- aggregate; NOTE(review): ln() requires position_quantity/10000 > 0 —
-- a zero or negative quantity would error. TODO confirm the data
-- guarantees this.
select sk_f_prod, position_dt, locc_position_cost,
sum(position_quantity) over(order by sk_primary_key) posi_sum
,exp(sum(ln(position_quantity/10000)) over(order by sk_primary_key)) posi_prod
from sql_where
pandas:
# pandas running total / running product.
# The original rebound df to the cumsum Series, so the second line's
# sort_values('sk_primary_key', ...) was called on a Series and raised;
# sort once and keep each result in its own column instead.
df = df.sort_values('sk_primary_key', ascending=True)
df['posi_sum'] = df['position_quantity'].cumsum()
df['posi_prod'] = (df['position_quantity'] / 10000).cumprod()
7、union和union all
sql:
-- union all: keeps duplicate rows from the two inputs
select * from sql_where
where sk_primary_key = '99148311'
union all
select * from sql_where
where sk_primary_key in ('99148311', '99153271');
-- union: implies distinct — duplicates across the two inputs are removed
-- (the original was missing the closing ")" above and ran the two example
-- statements together with no separator)
select * from sql_where
where sk_primary_key = '99148311'
union
select * from sql_where
where sk_primary_key in ('99148311', '99153271')
pandas:
# union all vs union in pandas: concat keeps duplicates; adding
# drop_duplicates() gives union's implicit distinct
exact_match = df[df['sk_primary_key'] == '99148311']
in_list = df[df['sk_primary_key'].isin(['99148311', '99153271'])]
df = pd.concat([exact_match, in_list])                    # union all
df = pd.concat([exact_match, in_list]).drop_duplicates()  # union
8、inner join、left join、right join、full join、rename
sql:
--inner join
-- join examples over two overlapping key sets:
-- a = {99148311, 99153271, 99157749}, b = {99148311, 99158720, 99183591}
-- NOTE(review): the four statements below have no separator between them;
-- add ";" after each if running as a script.
select a.*, b.* from
(select sk_primary_key from sql_where
where sk_primary_key in ('99148311', '99153271', '99157749')) a
inner join
(select sk_primary_key from sql_where
where sk_primary_key in ('99148311', '99158720', '99183591')) b
on a.sk_primary_key = b.sk_primary_key
--left join
select a.*, b.* from
(select sk_primary_key from sql_where
where sk_primary_key in ('99148311', '99153271', '99157749')) a
left join
(select sk_primary_key from sql_where
where sk_primary_key in ('99148311', '99158720', '99183591')) b
on a.sk_primary_key = b.sk_primary_key
--right join
-- NOTE(review): a right join reads backwards — prefer a left join with
-- the two sides swapped.
select a.*, b.* from
(select sk_primary_key from sql_where
where sk_primary_key in ('99148311', '99153271', '99157749')) a
right join
(select sk_primary_key from sql_where
where sk_primary_key in ('99148311', '99158720', '99183591')) b
on a.sk_primary_key = b.sk_primary_key
--full join
-- columns are aliased (sk_primary_key1/2, _left/_right) so the two sides
-- stay distinguishable in the full outer result
select a.*, b.* from
(select sk_primary_key sk_primary_key1, sk_f_prod sk_f_prod_left from sql_where
where sk_primary_key in ('99148311', '99153271', '99157749')) a
full join
(select sk_primary_key sk_primary_key2, sk_f_prod sk_f_prod_right from sql_where
where sk_primary_key in ('99148311', '99158720', '99183591')) b
on a.sk_primary_key1 = b.sk_primary_key2
pandas:
df1 = df[df['sk_primary_key'].isin(['99148311', '99153271', '99157749'])][['sk_primary_key', 'sk_f_prod']]
df1 = df1.rename(columns={'sk_primary_key': 'sk_primary_key1'})
df2 = df[df['sk_primary_key'].isin(['99148311', '99158720', '99183591'])][['sk_primary_key', 'sk_f_prod']]
df2 = df2.rename(columns={'sk_primary_key': 'sk_primary_key2'})
# inner join
# df = pd.merge(left=df1, right=df2, left_on='sk_primary_key1', right_on='sk_primary_key2', how='inner', suffixes={'_left', '_right'})
# left join
# df = pd.merge(left=df1, right=df2, left_on='sk_primary_key1', right_on='sk_primary_key2', how='left', suffixes={'_left', '_right'})
# right join
# df = pd.merge(left=df1, right=df2, left_on='sk_primary_key1', right_on='sk_primary_key2', how='right', suffixes={'_left', '_right'})
# outer join
df = pd.merge(left=df1, right=df2, left_on='sk_primary_key1', right_on='sk_primary_key2', how='outer', suffixes={'_left', '_right'})
9、lead、lag
sql:
-- next / previous row value via lead() and lag().
-- (original selected date_dt but ordered by trade_dt, and aliased the lag
-- column "nav_nav"; both aligned here — confirm trade_dt is the intended
-- column name in temp1.)
select id,
trade_dt,
nav,
lead(nav, 1) over(order by trade_dt) lead_nav,
lag(nav, 1) over(order by trade_dt) lag_nav
from temp1
pandas:
# pandas lead/lag: sort by the window's order-by column first.
# lead looks at the NEXT row, so it is shift(-1); lag is shift(1) —
# the original showed shift(1) for both.
# NOTE(review): column names aligned to the SQL above (trade_dt, nav);
# the original used 'TRADE_DT' / 'RIGHT_UNIT_NAV' — confirm actual casing.
df = df.sort_values(by='trade_dt').reset_index(drop=True)
df['lead_nav'] = df['nav'].shift(-1)  # lead(nav, 1)
df['lag_nav'] = df['nav'].shift(1)    # lag(nav, 1)