read_excel参数详解
ExcelWriter
merge
query
filter
pandas语句与sql语句对比
# --- Setup: pandas import and workbook paths (relative to the current directory). ---
import pandas as pd
file_path = r'.\工作统计表202209.xls'  # monthly work-statistics workbook (.xls)
file_path2 = r'.\合并.xlsx'  # merged workbook (.xlsx)
# NOTE: the bare triple-quoted string below is a no-op expression used as an
# inline (Chinese) reference sheet for pd.read_excel's parameters; it is a
# runtime string literal, so it is kept verbatim.
"""
pandas其他参数:
io, # 文件路径
sheet_name=0,# 读取的sheet名
skiprows=None, # 跳过指定的行,skiprows=1 跳过第1行,skiprows=3 跳过前3行,skiprows=[1,3,5] 跳过第1,3,5行,skiprows=lambda x: x % 2 == 0 跳过偶数行
header=0,# 指定第几行作为表头,header以上的行将会被忽略
names=None, # 指定表头名称,需要传递一个列表并且长度与DataFrame列数一致
index_col=None, # 指定列为索引列,None:从0开始自动生成。整数:指定第几列为索引,从0开始
usecols=None, # 指定解析的列数,默认为None,解析所有列。如果为str,则表示Excel列字母和列范围的逗号分隔列表(例如“ A:E”或“ A,C,E:F”)
# 如果为int列表,则表示解析那几列
squeeze=False, # 默认为False。如果设置squeeze=True则表示如果解析的数据只包含一列,则返回一个Series.此参数后面版本将作废
dtype=None, # 指定数据类型,默认为None,不改变数据类型。
engine=None, # 可以接受的参数有'xlrd','openpyxl'或'odf'
converters=None, # 对指定列的数据进行指定函数的处理,传入参数为列名与函数组成的字典。key 可以是列名或者列的序号,values是函数,可以def函数或者直接lambda都行
true_values=None,# 将指定的文本转换为True,默认为None
false_values=None,# 将指定的文本转换为False,默认为None
nrows=None, # 指定需要读取前多少行,通常用于较大的数据文件中
na_values=None, # 指定某些列的某些值为NaN
keep_default_na=True, # 表示导入数据时是否导入空值。默认为True,即自动识别空值并导入
na_filter=True, # 当数据为空时是否设置为NaN
verbose=False, # 是否显示程序处理过程中的一些额外信息,类似于输出log
parse_dates=False, # parse_dates将Date列设置为时间类型
date_parser=None,# 利用lambda函数,将某个字符串列,解析为日期格式;一般是配合parse_dates参数,一起使用
thousands=None,# 指定千分位分隔符
comment=None,
skipfooter=0, # skipfooter = 0不忽略,skipfooter = 1 忽略最后一行,skipfooter = 2 忽略最后两行
convert_float=True,
mangle_dupe_cols=True,
storage_options: StorageOptions = None,
"""
# Basic read: the first row becomes the header by default.
df = pd.read_excel(file_path, sheet_name='9月')
df.head(2)
# skiprows=3: drop the first three physical rows before parsing.
df = pd.read_excel(file_path, sheet_name='9月', skiprows=3)
df.head(2)
# names: supply our own header labels (length must match the column count).
names = ['序号', '分中心', '姓名', '主调次数', '陪调次数', '上会数量']
df = pd.read_excel(file_path, sheet_name='9月', skiprows=3, names=names)
df.head(2)
# header=6: row 6 (after skipping) becomes the header; rows above it are dropped.
df = pd.read_excel(file_path, sheet_name='9月', skiprows=3, header=6)
df.head(2)
# index_col=2: use the third parsed column as the row index.
df = pd.read_excel(file_path, sheet_name='9月', skiprows=3, index_col=2)
df.head(2)
# usecols: parse only the listed (0-based) columns.
df = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[1, 2])
df.head(2)
# Fix: the squeeze= keyword is deprecated (see the FutureWarning captured below);
# calling .squeeze("columns") on the result is the recommended equivalent —
# a single-column frame collapses to a Series, no warning emitted.
df = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[2]).squeeze("columns")
df.head(2)
C:\Users\AppData\Local\Temp\ipykernel_4420\1678458970.py:2: FutureWarning: The squeeze argument has been deprecated and will be removed in a future version. Append .squeeze("columns") to the call to squeeze.
df = pd.read_excel(file_path,sheet_name='9月',skiprows=3,usecols=[2],squeeze=True) # squeeze=True: return a Series when only one column is parsed
# dtype demo: without dtype the numeric column parses as float64;
# dtype=str forces every parsed column to object (string) dtype.
df1 = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[0, 2])
print('df1-->', df1.info())
df2 = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[0, 2], dtype=str)
print('df2-->', df2.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 7 non-null float64
1 Unnamed: 2 7 non-null object
dtypes: float64(1), object(1)
memory usage: 272.0+ bytes
df1--> None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 7 non-null object
1 Unnamed: 2 7 non-null object
dtypes: object(2)
memory usage: 272.0+ bytes
df2--> None
# usecols can also select by a positional index list.
df = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[2, 3])
df.head(2)
# converters: post-process a column via a {column: callable} mapping.
df = pd.read_excel(file_path2, sheet_name='Sheet2', usecols='E,P',
                   converters={'授信金额': lambda x: x / 10000})
df.head()
df = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[1, 2])
print(df)
# true_values / false_values: map the listed cell texts to booleans.
df = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[1, 2],
                   true_values=['城阳'], false_values=['平均', '合计'])
df
# nrows: read only the first N data rows (useful for large files).
df = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[1, 2], nrows=3)
df
# na_values: additionally treat the given value(s) as NaN.
df = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[1, 2], na_values='王腾')
df
# na_filter toggles blank-cell -> NaN conversion.
df1 = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[1, 2], na_filter=True)
df2 = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[1, 2], na_filter=False)
print(df1)
print(df2)
print(df1.iloc[8, 1], type(df1.iloc[8, 1]))  # NaN (float) when na_filter=True
print(df2.iloc[8, 1], type(df2.iloc[8, 1]))  # empty str when na_filter=False
nan <class 'float'>
<class 'str'>
# verbose=True: emit extra parsing information (log-like output).
df1 = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[1, 2], verbose=True)
df1
# parse_dates + date_parser: parse column 1 as datetime with an explicit format
# (dtypes output below shows 合同首签日 as datetime64[ns]).
df1 = pd.read_excel(file_path2, sheet_name='Sheet4', usecols='E,S', parse_dates=[1],
                    date_parser=lambda x: pd.to_datetime(x, format="%Y-%m-%d"))
df1.head()
df1.dtypes
客户号 object
合同首签日 datetime64[ns]
dtype: object
ExcelWriter
多个df写入同一个文件
# Writing multiple DataFrames into one workbook.
df1 = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[1, 2])
df2 = pd.read_excel(file_path, sheet_name='9月', skiprows=3, usecols=[2, 3])

# Fix 1: calling df.to_excel(path) twice on the same path rewrites the whole
# file each time, so only the last sheet survives — a shared ExcelWriter is
# required to keep both sheets.
# Fix 2: ExcelWriter.save() is deprecated (removed in pandas 2.0); using the
# writer as a context manager closes/saves it correctly.
with pd.ExcelWriter(r'C:\Users\Desktop\测试\out.xlsx') as writer:
    df1.to_excel(writer, sheet_name='out1')
    df2.to_excel(writer, sheet_name='out2')

# Append mode ('a') requires the target file to already exist (see the
# FileNotFoundError captured below). if_sheet_exists controls name clashes:
# 'error' (default), 'replace', or 'new'.
with pd.ExcelWriter(r'C:\Users\Desktop\测试\out1.xlsx', mode='a',
                    if_sheet_exists='replace') as writer:
    df1.to_excel(writer, sheet_name='out3')
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_14984\1386687888.py in <module>
1 # df输出到已存在的文件中
2 # a模式必须文件已存在
----> 3 writer = pd.ExcelWriter(r'C:\Users\Desktop\测试\out1.xlsx',mode='a',if_sheet_exists='replace') # error,replace,new
4 df1.to_excel(writer,sheet_name='out3')
5 writer.save()
D:\RpaStudy\lib\site-packages\pandas\io\excel\_openpyxl.py in __init__(self, path, engine, mode, storage_options, **engine_kwargs)
29
30 super().__init__(
---> 31 path, mode=mode, storage_options=storage_options, **engine_kwargs
32 )
33
D:\RpaStudy\lib\site-packages\pandas\io\excel\_base.py in __init__(self, path, engine, date_format, datetime_format, mode, storage_options, **engine_kwargs)
798 if not isinstance(path, ExcelWriter):
799 self.handles = get_handle(
--> 800 path, mode, storage_options=storage_options, is_text=False
801 )
802 self.sheets: Dict[str, Any] = {}
D:\RpaStudy\lib\site-packages\pandas\io\common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
649 else:
650 # Binary mode
--> 651 handle = open(handle, ioargs.mode)
652 handles.append(handle)
653
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Desktop\\测试\\out1.xlsx'
merge
数据合并
# Left table for the merge demos: ids 1..5 with letter names.
df1 = pd.DataFrame({'id': list(range(1, 6)),
                    'name': list('abcde')},
                   index=list(range(5)))
df1
id
name
0
1
a
1
2
b
2
3
c
3
4
d
4
5
e
# Right table for the merge demos: ids 2..6 with scores (partial overlap with df1).
df2 = pd.DataFrame({'id': list(range(2, 7)),
                    'score': [88, 77, 67, 100, 93]},
                   index=list(range(5)))
df2
id
score
0
2
88
1
3
77
2
4
67
3
5
100
4
6
93
df1. merge( df2, how= 'outer' , on= 'id' )
id
name
score
0
1
a
NaN
1
2
b
88.0
2
3
c
77.0
3
4
d
67.0
4
5
e
100.0
5
6
NaN
93.0
df1. merge( df2, how= 'inner' , on= 'id' )
id
name
score
0
2
b
88
1
3
c
77
2
4
d
67
3
5
e
100
pd. merge( df1, df2, how= 'left' , on= [ 'id' ] )
id
name
score
0
1
a
NaN
1
2
b
88.0
2
3
c
77.0
3
4
d
67.0
4
5
e
100.0
pd. merge( df1, df2, how= 'right' , on= [ 'id' ] )
id
name
score
0
2
b
88
1
3
c
77
2
4
d
67
3
5
e
100
4
6
NaN
93
query
df2
id
score
0
2
88
1
3
77
2
4
67
3
5
100
4
6
93
df2. query( 'score > 80' )
id
score
0
2
88
3
5
100
4
6
93
df2. loc[ df2[ 'score' ] > 80 ]
id
score
0
2
88
3
5
100
4
6
93
name = 'b'
# @name interpolates the Python local variable into the query expression.
df1.query('name == @name')
# Multi-condition filtering: query accepts and/or; boolean indexing needs
# & / | with parenthesized comparisons.
df2.query('score <88 and id <5')
# Fix: the original referenced undefined `df` instead of `df2`, raising the
# KeyError shown in the traceback below.
df2.loc[(df2['score'] < 88) & (df2['id'] < 5)]
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
D:\RpaStudy\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3079 try:
-> 3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'score'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_14984\3182303803.py in <module>
1 # 多条件筛选
2 df2.query('score <88 and id <5') # 与:and或&,或:or或|
----> 3 df2.loc[(df['score'] <88) &(df['id'] <5)]
D:\RpaStudy\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
3022 if self.columns.nlevels > 1:
3023 return self._getitem_multilevel(key)
-> 3024 indexer = self.columns.get_loc(key)
3025 if is_integer(indexer):
3026 indexer = [indexer]
D:\RpaStudy\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
-> 3082 raise KeyError(key) from err
3083
3084 if tolerance is not None:
KeyError: 'score'
# A frame with a column name containing a space ('a b').
df3 = pd. DataFrame( {
'a' : [ 1 , 2 , 3 ] , 'a b' : [ 4 , 5 , 6 ] } )
df3
# Deliberate failure demo: a spaced name breaks query's expression parser —
# raises the SyntaxError captured below; backtick-quoting (see further down) fixes it.
df3. query( 'a b >= 5 ' )
Traceback (most recent call last):
File "D:\RpaStudy\lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "C:\Users\AppData\Local\Temp\ipykernel_14984\960150518.py", line 1, in <module>
df3.query('a b >= 5 ')
File "D:\RpaStudy\lib\site-packages\pandas\core\frame.py", line 3466, in query
res = self.eval(expr, **kwargs)
File "D:\RpaStudy\lib\site-packages\pandas\core\frame.py", line 3596, in eval
return _eval(expr, inplace=inplace, **kwargs)
File "D:\RpaStudy\lib\site-packages\pandas\core\computation\eval.py", line 342, in eval
parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)
File "D:\RpaStudy\lib\site-packages\pandas\core\computation\expr.py", line 798, in __init__
self.terms = self.parse()
File "D:\RpaStudy\lib\site-packages\pandas\core\computation\expr.py", line 817, in parse
return self._visitor.visit(self.expr)
File "D:\RpaStudy\lib\site-packages\pandas\core\computation\expr.py", line 397, in visit
raise e
File "D:\RpaStudy\lib\site-packages\pandas\core\computation\expr.py", line 393, in visit
node = ast.fix_missing_locations(ast.parse(clean))
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.7_3.7.2544.0_x64__qbz5n2kfra8p0\lib\ast.py", line 35, in parse
return compile(source, filename, mode, PyCF_ONLY_AST)
File "<unknown>", line 1
a b >=5
^
SyntaxError: invalid syntax
df3. query( '`a b` >= 5 ' )
filter
过滤标签
可以调用 filter() 的对象还有以下对象有:
pandas.DataFrame.filter
pandas.Series.filter
pandas.core.groupby.DataFrameGroupBy.filter
参数:
items:list-like,对应轴的标签名列表
like:str,支持对应标签名的模糊名查询
regex:str (正则表达式),按正则表达式查询标签名
axis:{0 or 'index', 1 or 'columns', None}, default None,要筛选的轴,表示为索引(int)或轴名称(str)。默认情况下为信息轴(info axis),Series为'index', DataFrame为'columns'
返回:
用法:
需要注意的是,此方法不会对数据帧的数据内容进行过滤,仅应用于按标签筛选。
其中的参数 items, like, 和 regex parameters 被强制执行为相互排斥,即只能有一个存在。
# Demo frame for filter(): two labelled rows, three labelled columns.
df = pd.DataFrame({'one': [1, 4], 'two': [2, 5], 'three': [3, 6]},
                  index=['mouse', 'rabbit'])
df
one
two
three
mouse
1
2
3
rabbit
4
5
6
df. filter ( items= [ 'one' , 'three' ] )
one
three
mouse
1
3
rabbit
4
6
df. filter ( regex= 'e$' , axis= 1 )
one
three
mouse
1
3
rabbit
4
6
df. filter ( like= 'bbi' , axis= 0 )
one
two
three
rabbit
4
5
6
df. filter ( items= [ 'one' , 'three' ] , regex= 'e$' , axis= 1 )
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_14984\1446589175.py in <module>
1 # items\ like \ regex参数是互斥的,只能传其中一个
----> 2 df.filter(items=['one','three'],regex='e$', axis=1)
D:\RpaStudy\lib\site-packages\pandas\core\generic.py in filter(self, items, like, regex, axis)
4961 if nkw > 1:
4962 raise TypeError(
-> 4963 "Keyword arguments `items`, `like`, or `regex` "
4964 "are mutually exclusive"
4965 )
TypeError: Keyword arguments `items`, `like`, or `regex` are mutually exclusive
df3. query( '`a b` >= 5 ' ) . filter ( items= 'a' )
DataFrameGroupBy的filter方法
类似 SQL 中 groupby 后的 having 操作
用法:
DataFrameGroupBy.filter(func, dropna=True,*args, **kwargs)
参数:
func:函数,函数应用于每个分组的子帧,应该返回 True 或 False。
dropna:删除未通过筛选器的组。默认为 True,如果为 False,则评估为 False 的组将填充 NaN
*args, **kwargs:func 的参数
返回:
通过筛选操作后的 DataFrame
# Demo frame for the group-level filter (SQL HAVING) examples:
# two alternating groups in A with numeric columns B and C.
df = pd.DataFrame({
    'A': ['foo', 'bar'] * 3,
    'B': list(range(1, 7)),
    'C': [2.0, 5.0, 8.0, 1.0, 2.0, 9.0],
})
df
A
B
C
0
foo
1
2.0
1
bar
2
5.0
2
foo
3
8.0
3
bar
4
1.0
4
foo
5
2.0
5
bar
6
9.0
# groupby returns a lazy DataFrameGroupBy object; nothing is computed yet
# (hence the bare repr printed below).
grouped = df. groupby( 'A' )
grouped
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000025F36381608>
grouped. filter ( lambda x: x[ 'B' ] . mean( ) > 3 )
A
B
C
1
bar
2
5.0
3
bar
4
1.0
5
bar
6
9.0
grouped. filter ( lambda x: x[ 'B' ] . mean( ) > 3 , dropna= False )
A
B
C
0
NaN
NaN
NaN
1
bar
2.0
5.0
2
NaN
NaN
NaN
3
bar
4.0
1.0
4
NaN
NaN
NaN
5
bar
6.0
9.0
pandas语句与sql语句对比
# Load three columns (Excel letters E, V, AO) from Sheet2 using the openpyxl engine.
df = pd.read_excel(file_path2, sheet_name='Sheet2', engine='openpyxl', usecols='E,V,AO')
print(df.shape)
df.head()
(30643, 3)
客户号
发放金额
账户状态
0
P020000370
500000.0
结清
1
P02000008696
150000.0
结清
2
P03000005191
100000.0
结清
3
P01916008458
200000.0
结清
4
P02902202526
300000.0
结清
df. query( '账户状态 != "结清"' ) . filter ( items= [ '客户号' , '发放金额' ] ) . groupby( '客户号' ) . filter ( lambda x: x[ '客户号' ] . size == 1 ) . sort_values( '发放金额' , ascending= True ) [ 2 : 5 ]
客户号
发放金额
18448
P030000197
20000.0
10729
P01000033252
20000.0
7104
P03000117634
20000.0
# Customers with more than one non-settled loan, then total amount per customer.
# SQL: SELECT 客户号, SUM(发放金额) FROM t WHERE 账户状态 <> '结清'
#      GROUP BY 客户号 HAVING COUNT(*) > 1
df1 = df. query( '账户状态 != "结清"' ) . groupby( '客户号' , as_index= False ) . filter ( lambda x: x[ '客户号' ] . size > 1 )
df1. groupby( '客户号' , as_index= False ) . agg( {
'发放金额' : 'sum' } )
客户号
发放金额
0
C10000008477
2300000.0
1
C10000015283
1700000.0
2
C100000341
2350000.0
3
C10000048097
1950000.0
4
C10000059901
3500000.0
...
...
...
1748
P039300119
5620000.0
1749
P03906157244
2000000.0
1750
P03906003686
1890000.0
1751
P03906003879
5000000.0
1752
P03910001267
8000000.0
1753 rows × 2 columns