Getting started with pandas

Once you know pandas' main data structures, you can start working with them in practice. Learning pandas is mostly a matter of learning how to use the attributes and methods of Series and DataFrame: use them often and keep the official documentation close at hand. Let's get started:

Creating a Series

Create a Series from a Python list:

import numpy as np
import pandas as pd

# create from a list
s1 = pd.Series([1,2,3,4]) # note that the index defaults to automatic integer labels starting at 0
s1
0    1
1    2
2    3
3    4
dtype: int64

# the values attribute is a convenient way to inspect the Series' values
s1.values
array([1, 2, 3, 4], dtype=int64)

# the index attribute returns the index; for the default index this is a RangeIndex with start, stop, and step
s1.index 
RangeIndex(start=0, stop=4, step=1)


# create from a NumPy array
s2 = pd.Series(np.arange(10))
s2
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32



# create from a dict
s3 = pd.Series({'1':1, '2':2, '3':3})
s3
1    1
2    2
3    3
dtype: int64

s3.values
array([1, 2, 3], dtype=int64)

s3.index
Index(['1', '2', '3'], dtype='object')

# assign the index labels manually
s4 = pd.Series([1,2,3,4], index=['A','B','C','D'])
s4
A    1
B    2
C    3
D    4
dtype: int64


s4.values
array([1, 2, 3, 4], dtype=int64)
s4.index
Index(['A', 'B', 'C', 'D'], dtype='object')
s4['A']  # select a value by its index label
1
s4[s4>1] # boolean filtering: select the values greater than 1
B    2
C    3
D    4
dtype: int64

s4.to_dict() # convert the Series to a dict; so a Series can be built from a dict and converted back to one
{'A': 1, 'B': 2, 'C': 3, 'D': 4}

s5 = pd.Series(s4.to_dict()) # round-trip: convert the dict back into a Series
s5
A    1
B    2
C    3
D    4
dtype: int64

index_1 = ['A','B','C','D','E'] # the index can be defined separately and passed to the Series, here with one extra label
s6 = pd.Series(s5, index=index_1)
s6  # the extra label has no corresponding value, so it gets NaN
A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
dtype: float64
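
The same result can also be obtained with reindex, which conforms a Series to a new index; a minimal sketch using the s5 and index_1 defined above:

s5.reindex(index_1) # labels missing from s5 are filled with NaN
A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
dtype: float64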

pd.isnull(s6) # pd.isnull() checks the Series for missing values, returning True where a value is missing and False otherwise
A    False
B    False
C    False
D    False
E     True
dtype: bool

pd.notnull(s6) # the complementary check: True where a value is present
A     True
B     True
C     True
D     True
E    False
dtype: bool
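
These boolean masks can be used to filter out the missing values; Series also provides dropna for the same purpose. A minimal sketch using s6 from above:

s6[pd.notnull(s6)] # keep only the non-missing values
A    1.0
B    2.0
C    3.0
D    4.0
dtype: float64

s6.dropna() # equivalent shortcut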

s6.name = 'demo' # give the Series a name
s6
A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
Name: demo, dtype: float64

s6.index.name = 'demo_index' # give the index a name as well
s6
demo_index
A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
Name: demo, dtype: float64

s6.index 
Index(['A', 'B', 'C', 'D', 'E'], dtype='object', name='demo_index')

Learning the pandas DataFrame

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import webbrowser
link = 'https://www.tiobe.com//tiobe-index/'
webbrowser.open(link) # open the TIOBE page in a browser so its table can be copied to the clipboard
True

df = pd.read_clipboard() # read the contents of the clipboard (here, the TIOBE index table copied from the page opened above)
df
Sep	2018	Sep.1	2017	Change	Programming	Language	Ratings	Change.1
0	0	1	1	NaN	Java	17.436%	+4.75%	NaN	NaN
1	1	2	2	NaN	C	15.447%	+8.06%	NaN	NaN
2	2	3	5	change	Python	7.653%	+4.67%	NaN	NaN
3	3	4	3	change	C++	7.394%	+1.83%	NaN	NaN
4	4	5	8	change	Visual	Basic	.NET	5.308%	+3.33%
5	5	6	4	change	C#	3.295%	-1.48%	NaN	NaN
6	6	7	6	change	PHP	2.775%	+0.57%	NaN	NaN
7	7	8	7	change	JavaScript	2.131%	+0.11%	NaN	NaN
8	8	9	-	change	SQL	2.062%	+2.06%	NaN	NaN
9	9	10	18	change	Objective-C	1.509%	+0.00%	NaN	NaN

type(df)
pandas.core.frame.DataFrame

df.columns # list the column labels; this is very useful, since columns can then be selected by these labels (note that read_clipboard split the multi-word headers here, so the columns are slightly shifted)
Index(['Sep', '2018', 'Sep.1', '2017', 'Change', 'Programming', 'Language',
       'Ratings', 'Change.1'],
      dtype='object')

df.Ratings # select a column by attribute access (df['Ratings'] is equivalent)
0    17.436%
1    15.447%
2     7.653%
3     7.394%
4     5.308%
5     3.295%
6     2.775%
7     2.131%
8     2.062%
9     1.509%
Name: Ratings, dtype: object
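
Note that Ratings comes back with dtype: object because the values are strings containing '%'. A minimal sketch of converting them to numbers, assuming the column parsed as shown above:

df['Ratings'].str.rstrip('%').astype(float).head(3) # strip the '%' and cast to float
0    17.436
1    15.447
2     7.653
Name: Ratings, dtype: float64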

# extract just the columns you want from the original DataFrame into a new DataFrame; a very useful operation
df_new = DataFrame(df, columns=['Sep 2019','Sep 2018', 'Change', 'Programming Language'])
# any requested column that does not exist in the original is added and filled with NaN
df_new
Sep 2019	Sep 2018	Change	Programming Language
0	NaN	1	NaN	Java
1	NaN	2	NaN	C
2	NaN	3	change	Python
3	NaN	4	change	C++
4	NaN	5	change	Visual Basic .NET
5	NaN	6	change	C#
6	NaN	7	change	PHP
7	NaN	8	change	JavaScript
8	NaN	9	change	SQL
9	NaN	10	change	Objective-C

df_new['Sep 2019'] = range(10) # assign values to the new column
type(df_new['Sep 2018']) # each column of a DataFrame is itself a Series
pandas.core.series.Series
df_new
Sep 2019	Sep 2018	Change	Programming Language
0	0	1	NaN	Java
1	1	2	NaN	C
2	2	3	change	Python
3	3	4	change	C++
4	4	5	change	Visual Basic .NET
5	5	6	change	C#
6	6	7	change	PHP
7	7	8	change	JavaScript
8	8	9	change	SQL
9	9	10	change	Objective-C
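
A column can also be assigned from a Series rather than a plain range; values are then aligned by index, and rows whose label is missing from the Series get NaN. A minimal sketch (the Series values here are made up for illustration):

df_new['Change'] = pd.Series(['up', 'down'], index=[2, 4]) # aligned by index; all other rows become NaN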

Common pandas DataFrame operations for reading and writing data in various formats

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import webbrowser
link = 'https://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link) # open the pandas IO docs page so its table of readers and writers can be copied
True

df1 = pd.read_clipboard() # read the clipboard contents (here, the IO tools table copied from the pandas docs page)

df1
Format Type	Data Description	Reader	Writer
0	text	CSV	read_csv	to_csv
1	text	JSON	read_json	to_json
2	text	HTML	read_html	to_html
3	text	Local clipboard	read_clipboard	to_clipboard
4	binary	MS Excel	read_excel	to_excel
5	binary	HDF5 Format	read_hdf	to_hdf
6	binary	Feather Format	read_feather	to_feather
7	binary	Msgpack	read_msgpack	to_msgpack
8	binary	Stata	read_stata	to_stata
9	binary	SAS	read_sas	None
10	binary	Python Pickle Format	read_pickle	to_pickle
11	SQL	SQL	read_sql	to_sql
12	SQL	Google Big Query	read_gbq	to_gbq

df1.to_clipboard() # write the DataFrame back to the clipboard
df1.to_csv('df1.csv',index=False) # write the contents to a file named df1.csv, dropping the row index

!type df1.csv # on a Windows Jupyter install, !type shows the file contents (use !cat on Linux/macOS)
Format Type,Data Description,Reader,Writer
text,CSV,read_csv,to_csv
text,JSON,read_json,to_json
text,HTML,read_html,to_html
text,Local clipboard,read_clipboard,to_clipboard
binary,MS Excel,read_excel,to_excel
binary,HDF5 Format,read_hdf,to_hdf
binary,Feather Format,read_feather,to_feather
binary,Msgpack,read_msgpack,to_msgpack
binary,Stata,read_stata,to_stata
binary,SAS,read_sas,
binary,Python Pickle Format,read_pickle,to_pickle
SQL,SQL,read_sql,to_sql
SQL,Google Big Query,read_gbq,to_gbq

df2 = pd.read_csv('df1.csv') # then read it back in with pd.read_csv
df2
Format Type	Data Description	Reader	Writer
0	text	CSV	read_csv	to_csv
1	text	JSON	read_json	to_json
2	text	HTML	read_html	to_html
3	text	Local clipboard	read_clipboard	to_clipboard
4	binary	MS Excel	read_excel	to_excel
5	binary	HDF5 Format	read_hdf	to_hdf
6	binary	Feather Format	read_feather	to_feather
7	binary	Msgpack	read_msgpack	to_msgpack
8	binary	Stata	read_stata	to_stata
9	binary	SAS	read_sas	NaN
10	binary	Python Pickle Format	read_pickle	to_pickle
11	SQL	SQL	read_sql	to_sql
12	SQL	Google Big Query	read_gbq	to_gbq
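
pd.read_csv accepts many optional parameters controlling how the file is parsed; a minimal sketch of a few common ones (the argument values below are just for illustration):

pd.read_csv('df1.csv', usecols=['Reader', 'Writer'])  # read only selected columns
pd.read_csv('df1.csv', nrows=3)                       # read only the first 3 data rows
pd.read_csv('df1.csv', index_col='Format Type')       # use a column as the row index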

df1.to_json() # serialize the DataFrame to a JSON string
'{"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},"Data Description":{"0":"CSV","1":"JSON","2":"HTML","3":"Local clipboard","4":"MS Excel","5":"HDF5 Format","6":"Feather Format","7":"Msgpack","8":"Stata","9":"SAS","10":"Python Pickle Format","11":"SQL","12":"Google Big Query"},"Reader":{"0":"read_csv","1":"read_json","2":"read_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"read_pickle","11":"read_sql","12":"read_gbq"},"Writer":{"0":"to_csv","1":"to_json","2":"to_html","3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":"to_msgpack","8":"to_stata","9":null,"10":"to_pickle","11":"to_sql","12":"to_gbq"}}'

pd.read_json(df1.to_json()) # parse the JSON string back into a DataFrame; note the rows come back ordered by the JSON's string keys ('0', '1', '10', '11', ...)
Format Type	Data Description	Reader	Writer
0	text	CSV	read_csv	to_csv
1	text	JSON	read_json	to_json
10	binary	Python Pickle Format	read_pickle	to_pickle
11	SQL	SQL	read_sql	to_sql
12	SQL	Google Big Query	read_gbq	to_gbq
2	text	HTML	read_html	to_html
3	text	Local clipboard	read_clipboard	to_clipboard
4	binary	MS Excel	read_excel	to_excel
5	binary	HDF5 Format	read_hdf	to_hdf
6	binary	Feather Format	read_feather	to_feather
7	binary	Msgpack	read_msgpack	to_msgpack
8	binary	Stata	read_stata	to_stata
9	binary	SAS	read_sas	None
df1.to_html('df1.html') # write the DataFrame out as an HTML table
!dir # list the working directory (Windows); use !ls on Linux/macOS
 Volume in drive C is OS
 Volume Serial Number is E28F-E81C

 Directory of C:\Users\Admin\Documents\jupyter notebook\数据挖掘学习

2018/10/01  16:58    <DIR>          .
2018/10/01  16:58    <DIR>          ..
2018/10/01  16:34    <DIR>          .ipynb_checkpoints
2018/10/01  16:47               508 df1.csv
2018/10/01  16:58             2,061 df1.html
2018/09/30  16:22             8,896 numpy_array.ipynb
2018/09/30  17:03             7,648 numpy_Array的input和output.ipynb
2018/09/30  21:33           154,056 numpy_example.ipynb
2018/10/01  13:11            20,844 numpy_example1.ipynb
2018/09/30  16:58               570 numpy_two_file.npz
2018/09/30  16:52               168 numpy_wb_array.npy
2018/09/30  16:31            13,759 numpy_数组与矩阵的运算.ipynb
2018/10/01  16:01            16,335 pandas_Dataframe.ipynb
2018/10/01  16:57            19,694 pandas_DataFrame_IO.ipynb
2018/10/01  15:42             9,681 pandas_series.ipynb
2018/09/30  16:45               195 x.pkl
2018/10/01  16:34            10,325 深入理解Series和DataFrame.ipynb
              14 File(s)        264,740 bytes
               3 Dir(s) 37,539,487,744 bytes free
df1.to_excel('df1.xlsx',index=False) # write out to an Excel file, again dropping the index
There are many other formats with read and write support; they are not all listed here.
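
As one more example, a minimal sketch of reading the Excel file back in and round-tripping through pickle (pd.read_excel needs an Excel engine such as xlrd or openpyxl installed; the file names follow the ones used above):

df3 = pd.read_excel('df1.xlsx')  # read the Excel file written above
df1.to_pickle('df1.pkl')         # serialize the DataFrame to a pickle file
df4 = pd.read_pickle('df1.pkl')  # and load it back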

Sorting Series and DataFrames

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# sorting a Series
s1 = Series(np.random.randn(10))
s1
0   -0.424930
1   -0.279580
2   -0.763344
3   -0.018883
4   -0.217693
5   -1.367801
6   -1.402714
7   -1.125427
8   -0.560325
9    0.037897
dtype: float64

s1.values
array([-0.42493031, -0.27958011, -0.76334433, -0.01888319, -0.21769267,
       -1.36780083, -1.40271395, -1.1254272 , -0.56032516,  0.03789725])

s1.index
RangeIndex(start=0, stop=10, step=1)

s2 = s1.sort_values(ascending=False) # sort by value; the default is ascending, ascending=False sorts in descending order
s2
9    0.037897
3   -0.018883
4   -0.217693
1   -0.279580
0   -0.424930
8   -0.560325
2   -0.763344
7   -1.125427
5   -1.367801
6   -1.402714
dtype: float64

s2.sort_index(ascending=False) # sort by index; again the default is ascending, here descending
9    0.037897
8   -0.560325
7   -1.125427
6   -1.402714
5   -1.367801
4   -0.217693
3   -0.018883
2   -0.763344
1   -0.279580
0   -0.424930
dtype: float64

# sorting a DataFrame

df1 = DataFrame(np.random.randn(40).reshape(8,5),columns=['A','B','C','D','E'])
df1
A	B	C	D	E
0	-0.971041	-1.006900	1.824719	0.362461	0.010793
1	-1.202866	-0.520920	0.603463	-2.222645	0.085995
2	0.803201	1.664502	0.866629	-3.014892	-0.026222
3	-2.737011	-0.844403	-0.565923	0.956517	2.366085
4	0.183618	-0.731790	1.449652	0.440694	0.749678
5	-0.852958	-2.110449	-0.872135	2.816473	0.402317
6	-0.064767	-0.461569	-1.090588	-1.441483	0.029359
7	-0.620219	-0.350775	-0.190072	-1.475141	-1.147963
 # a DataFrame can be sorted either by a column's values or by its index

df1['A'].sort_values()  # this sorts only column 'A' as a standalone Series; the DataFrame itself is not reordered
3   -2.737011
1   -1.202866
0   -0.971041
5   -0.852958
7   -0.620219
6   -0.064767
4    0.183618
2    0.803201
Name: A, dtype: float64

df2 = df1.sort_values('A') # this time the whole DataFrame is sorted by the values in column 'A'
df2
A	B	C	D	E
3	-2.737011	-0.844403	-0.565923	0.956517	2.366085
1	-1.202866	-0.520920	0.603463	-2.222645	0.085995
0	-0.971041	-1.006900	1.824719	0.362461	0.010793
5	-0.852958	-2.110449	-0.872135	2.816473	0.402317
7	-0.620219	-0.350775	-0.190072	-1.475141	-1.147963
6	-0.064767	-0.461569	-1.090588	-1.441483	0.029359
4	0.183618	-0.731790	1.449652	0.440694	0.749678
2	0.803201	1.664502	0.866629	-3.014892	-0.026222

df2.sort_index() # sorting by index again should give back the same order as df1
A	B	C	D	E
0	-0.971041	-1.006900	1.824719	0.362461	0.010793
1	-1.202866	-0.520920	0.603463	-2.222645	0.085995
2	0.803201	1.664502	0.866629	-3.014892	-0.026222
3	-2.737011	-0.844403	-0.565923	0.956517	2.366085
4	0.183618	-0.731790	1.449652	0.440694	0.749678
5	-0.852958	-2.110449	-0.872135	2.816473	0.402317
6	-0.064767	-0.461569	-1.090588	-1.441483	0.029359
7	-0.620219	-0.350775	-0.190072	-1.475141	-1.147963
df1
A	B	C	D	E
0	-0.971041	-1.006900	1.824719	0.362461	0.010793
1	-1.202866	-0.520920	0.603463	-2.222645	0.085995
2	0.803201	1.664502	0.866629	-3.014892	-0.026222
3	-2.737011	-0.844403	-0.565923	0.956517	2.366085
4	0.183618	-0.731790	1.449652	0.440694	0.749678
5	-0.852958	-2.110449	-0.872135	2.816473	0.402317
6	-0.064767	-0.461569	-1.090588	-1.441483	0.029359
7	-0.620219	-0.350775	-0.190072	-1.475141	-1.147963
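
sort_values can also sort by several columns at once and in descending order, and sort_index can sort the column labels as well; a minimal sketch using df1 from above:

df1.sort_values(by=['A', 'B'], ascending=False) # sort by 'A' descending, breaking ties with 'B'
df1.sort_index(axis=1, ascending=False)         # sort the columns by their labels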


Reposted from blog.csdn.net/weixin_42398658/article/details/82925410