Pandas与SQL语句对照

建立学生-课程数据库

表结构

student

|字段名|类型|备注|
|-|||
|sno|char|学号|
|sname|char|姓名|
|ssex|bool|性别|
|sage|int|年龄|
|sdept|char|所在系|

course

|字段名|类型|备注|
|-|||
|cno|char|课程号|
|cname|char|课程名称|
|cpno|char|先行课程|
|ccredit|int|学分|

|字段名|类型|备注|
|-|||
|sno|char|学号|
|cno|char|课程号|
|grade|int|成绩|

import pandas as pd

读取表数据内容

student=pd.read_csv('d:\\abc\\student.csv')
sc=pd.read_csv('d:\\abc\\sc.csv')
course=pd.read_csv('d:\\abc\\course.csv')

student

	sno	sname	ssex	sage	sdept
0	95001	李勇	True	20	CS
1	95002	刘晨	False	19	IS
2	95003	王敏	False	18	MA
3	95004	张立	True	19	IS
4	95005	刘云	False	18	CS
5	95006	张密码	False	20	is
6	95010	提供给	False	21	NaN

sc

	sno	cno	grade
0	95001	1	92.0
1	95001	2	95.0
2	95001	3	88.0
3	95001	4	60.0
4	95001	5	70.0
5	95001	6	80.0
6	95001	7	90.0
7	95002	1	95.0
8	95002	2	92.0
9	95003	2	85.0
10	95004	1	58.0
11	95004	2	85.0
12	95004	4	NaN
13	95111	1	90.0

功能对照表

行列操作

列选择

# select sno,sname,sage from student 
temp=student[['sno','sage','sname']]
temp.head(3)

	sno	sage	sname
0	95001	20	李勇
1	95002	19	刘晨
2	95003	18	王敏

增加一列

student['label']=student['sno'].astype("str") + '_'+ student['sname']
student.head()

	sno	sname	ssex	sage	sdept	label
0	95001	李勇	True	20	CS	95001_李勇
1	95002	刘晨	False	19	IS	95002_刘晨
2	95003	王敏	False	18	MA	95003_王敏
3	95004	张立	True	19	IS	95004_张立
4	95005	刘云	False	18	CS	95005_刘云

列删除

del student['label'] #第一种方法
student.drop(columns=['ssex', 'sage'],inplace=True)  # 第二种方法
student

	sno	sname	sdept
0	95001	李勇	CS
1	95002	刘晨	IS
2	95003	王敏	MA
3	95004	张立	IS
4	95005	刘云	CS
5	95006	张密码	is
6	95010	提供给	NaN

行删除

# delete from student where sage>18

# inplace=True 表示在现有表上删除
# student.drop(student[student.sage>18].index,inplace=True)

# inplace=False ,不删除原表内容
# temp=student.drop(student[student.sage>18].index,inplace=False)

# delete from student where sage>18 and sdept='IS'
temp= student.drop(student[(student.sage>18) & (student.sdept=='IS')].index)
temp

	sno	sname	ssex	sage	sdept
0	95001	李勇	True	20	CS
2	95003	王敏	False	18	MA
4	95005	刘云	False	18	CS
5	95006	张密码	False	20	is
6	95010	提供给	False	21	NaN

数据修改

# update student set sage=19 where sdept='IS'

student.loc[student.sdept=='IS','sage']=19
student

	sno	sname	ssex	sage	sdept
0	95001	李勇	True	20	CS
1	95002	刘晨	False	19	IS
2	95003	王敏	False	18	MA
3	95004	张立	True	19	IS
4	95005	刘云	False	18	CS
5	95006	张密码	False	20	is
6	95010	提供给	False	21	NaN

条件筛选

# select sno,sage,sname from stuent where sage=20

temp=student[['sno','sage','sname']].query('sage==20')   #第一种方法
temp=student[['sno','sage','sname']][student.sage==20]   #第二种方法
temp=student[['sno','sage','sname']].loc[student.sage==20]   #第三种方法
temp.head()

	sno	sage	sname
0	95001	20	李勇
5	95006	20	张密码

多条件选择

# select sno,sage,sname from student where sage=20 and sname='李勇'

temp=student[['sno','sage','sname']].query("sage==20 and sname=='李勇'")   #第一种方法
temp.head()

	sno	sage	sname
0	95001	20	李勇

查找空值的内容

# select * from student where sdept is null

temp=student[student.sdept.isnull()]
temp.head()

	sno	sname	ssex	sage	sdept
6	95010	提供给	False	21	NaN

多值筛选

# select * from student where sdept in ('CS','MA')
temp=student[student.sdept.isin(['CS','MA'])]
temp.head()

	sno	sname	ssex	sage	sdept
0	95001	李勇	True	20	CS
2	95003	王敏	False	18	MA
4	95005	刘云	False	18	CS

区间查找

# select * from student where sage between 18 and 20
temp=student.query('19<=sage<=20')
temp.head()

	sno	sname	ssex	sage	sdept
0	95001	李勇	True	20	CS
1	95002	刘晨	False	19	IS
3	95004	张立	True	19	IS
5	95006	张密码	False	20	is

重复值消除

# select distinct sno from student

temp=sc['sno'].drop_duplicates()  # 默认写法
temp.head()

0     95001
7     95002
9     95003
10    95004
13    95111
Name: sno, dtype: int64

# 查找 sage、sdept 重复的，如果发现，保留第一行
# 参数 keep：{‘first’, ‘last’, False}, 默认值 ‘first’；first： 保留第一次出现的重复行，删除后面的重复行。；last： 删除重复项，除了最后一次出现；False： 删除所有重复项。
# 参数 inplace：布尔值，默认为False，是否直接在原数据上删除重复项或删除重复项后返回副本。
temp=student.drop_duplicates(subset=['sage','sdept'],keep='first')  # 带参数写法
temp.head()

	sno	sname	ssex	sage	sdept
0	95001	李勇	True	20	CS
1	95002	刘晨	False	19	IS
2	95003	王敏	False	18	MA
4	95005	刘云	False	18	CS
5	95006	张密码	False	20	is

数据返回量控制

返回三条记录

# select * from student limit3

student[0:3]
student.iloc[0:3]

# pandas以类似字典的方式来获取某一列的值，比如df[‘A’]，这会得到df的A列。如果我们对某一行感兴趣呢？这个时候有两种方法，一种是iloc方法，另一种方法是loc方法。loc是指location的意思，iloc中的i是指integer。

# 也就是说loc是根据index来索引，比如下边的df定义了一个index，那么loc就根据这个index来索引对应的行。iloc并不是根据index来索引，而是根据行号来索引，行号从0开始，逐次加1。

	sno	sname	ssex	sage	sdept
0	95001	李勇	True	20	CS
1	95002	刘晨	False	19	IS
2	95003	王敏	False	18	MA

带有函数的例子

#select * from student where sage>=(select max(sage) from student)

temp=student[student.sage>=student.sage.max()]
temp.head()

	sno	sname	ssex	sage	sdept
6	95010	提供给	False	21	NaN

联合查询

#select cno,grade from student,sc where student.sno=sc.sno

tempa=student.query("sname=='李勇'")['sno'].values[0]
temp=sc.query("sno==%s" % tempa)[['cno','grade']]
temp.head()

	cno	grade
0	1	92.0
1	2	95.0
2	3	88.0
3	4	60.0
4	5	70.0

# 等值连接
# select * from student,sc where student.sno=sc.sno
temp=student.merge(sc,on='sno')
temp.tail()

	sno	sname	sdept	cno	grade
8	95002	刘晨	IS	2	92.0
9	95003	王敏	MA	2	85.0
10	95004	张立	IS	1	58.0
11	95004	张立	IS	2	85.0
12	95004	张立	IS	4	NaN

pd.merge函数更多的内容，参考：https://blog.csdn.net/brucewong0516/article/details/82707492

# 外连接
# select * from student left join sc on student.sno=sc.sno


temp=pd.merge(student,sc,on='sno',how='outer') #第一种写法
temp=student.merge(sc,on='sno',how='outer')    #第二种写法
temp.head()

	sno	sname	ssex	sage	sdept	cno	grade
0	95001	李勇	True	20.0	CS	1.0	92.0
1	95001	李勇	True	20.0	CS	2.0	95.0
2	95001	李勇	True	20.0	CS	3.0	88.0
3	95001	李勇	True	20.0	CS	4.0	60.0
4	95001	李勇	True	20.0	CS	5.0	70.0

聚合Aggregate

统计极值

# student.sage.mean() #均值
# student.sage.sum()  #求和
# student.sage.count()#计数
# student.sage.max()  #最大值

# 两种写法均可
# student.sage.min()  #最小值
student['sage'].min()

常规统计

student.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
sno      7 non-null int64
sname    7 non-null object
ssex     7 non-null bool
sage     7 non-null int64
sdept    6 non-null object
dtypes: bool(1), int64(2), object(2)
memory usage: 359.0+ bytes

student.describe()

	sno	sage
count	7.000000	7.000000
mean	95004.428571	19.285714
std	2.992053	1.112697
min	95001.000000	18.000000
25%	95002.500000	18.500000
50%	95004.000000	19.000000
75%	95005.500000	20.000000
max	95010.000000	21.000000

#类似于Counter功能
# select sage,count(sage) from student group by sage
student['sage'].value_counts()

20    2
19    2
18    2
21    1
Name: sage, dtype: int64

groupby

# select avg(sage) from student group by sdept
student.groupby('sdept')['sage'].mean()

sdept
CS    19
IS    19
MA    18
is    20
Name: sage, dtype: int64

排序

# select * from student order by sage  desc
temp=student.sort_values(by=['sage'],ascending=False)
temp

	sno	sname	ssex	sage	sdept
6	95010	提供给	False	21	NaN
0	95001	李勇	True	20	CS
5	95006	张密码	False	20	is
1	95002	刘晨	False	19	IS
3	95004	张立	True	19	IS
2	95003	王敏	False	18	MA
4	95005	刘云	False	18	CS

参考 Pandas基本操作以及SQL对照：https://www.jianshu.com/p/14af48479078

Pandas与SQL语句对照

Pandas与SQL语句对照

建立学生-课程数据库

表结构

读取表数据内容

功能对照表

行列操作

列选择

增加一列

列删除

行删除

数据修改

条件筛选

多条件选择

查找空值的内容

多值筛选

区间查找

重复值消除

数据返回量控制

带有函数的例子

联合查询

聚合Aggregate

统计极值

常规统计

groupby

排序

猜你喜欢