python数据分析-03pandas库

#Series
import numpy as np
import pandas as pd

# s1 = pd.Series([1,2,3,4])
# print(s1)
# # 0    1
# # 1    2
# # 2    3
# # 3    4
# # dtype: int64
# print(s1.values) #[1 2 3 4]
# print(s1.index) #RangeIndex(start=0, stop=4, step=1)

#传入数组
# s2 = pd.Series(np.arange(10))
# print(s2)
# 0    0
# 1    1
# 2    2
# 3    3
# 4    4
# 5    5
# 6    6
# 7    7
# 8    8
# 9    9
# dtype: int32

#传入字典
# s3 = pd.Series({"1":1,"2":2,"3":3})
# print(s3)
# # 1    1
# # 2    2
# # 3    3
# # dtype: int64
# print(s3.values) #[1 2 3]
# print(s3.index) #Index(['1', '2', '3'], dtype='object')

# s4 = pd.Series([1,2,3,4],index=['A','B','C','D'])
# print(s4)
# # A    1
# # B    2
# # C    3
# # D    4
# # dtype: int64
# print(s4["A"]) #1
# print(s4[s4>2])
# # C    3
# # D    4
# # dtype: int64
# print(s4.to_dict()) #{'A': 1, 'B': 2, 'C': 3, 'D': 4}

# s4 = pd.Series([1,2,3,4],index=['A','B','C','D'])
# print(s4)
# A    1
# B    2
# C    3
# D    4
# dtype: int64
# index_1 = ['A','B','C','D','E']
# s5 = pd.Series(s4,index=index_1)
#print(s5)
# A    1.0
# B    2.0
# C    3.0
# D    4.0
# E    NaN
# dtype: float64
#print(s5.isnull())
# A    False
# B    False
# C    False
# D    False
# E     True
# dtype: bool
# print(s5.notnull())
# A     True
# B     True
# C     True
# D     True
# E    False
# dtype: bool

# s5.name = 'demo'
# print(s5)
# A    1.0
# B    2.0
# C    3.0
# D    4.0
# E    NaN
# Name: demo, dtype: float64
# s5.index.name = "demo index"
# print(s5)
# demo index
# A    1.0
# B    2.0
# C    3.0
# D    4.0
# E    NaN
# Name: demo, dtype: float64


from pandas import Series,DataFrame

# import webbrowser
# link = "https://www.tiobe.com/tiobe-index/"
# webbrowser.open(link) #打开这个网页，然后使用鼠标进行复制操作
# df = pd.read_clipboard()#这里是读取鼠标复制的数据
#print(df)
#     Apr 2019  Apr 2018  Change  Programming Language  Ratings Change.1
# 0          1         1     NaN                  Java  15.035%   -0.74%
# 1          2         2     NaN                     C  14.076%   +0.49%
# 2          3         3     NaN                   C++   8.838%   +1.62%
# 3          4         4     NaN                Python   8.166%   +2.36%
# 4          5         6  change     Visual Basic .NET   5.795%   +0.85%
# 5          6         5  change                    C#   3.515%   -1.75%
# 6          7         8  change            JavaScript   2.507%   -0.99%
# 7          8         9  change                   SQL   2.272%   -0.38%
# 8          9         7  change                   PHP   2.239%   -1.98%
# 9         10        14  change     Assembly language   1.710%   +0.05%
# 10        11        18  change           Objective-C   1.505%   +0.25%
# 11        12        17  change                MATLAB   1.285%   -0.17%
# 12        13        10  change                  Ruby   1.277%   -0.74%
# 13        14        16  change                  Perl   1.269%   -0.26%
# 14        15        11  change  Delphi/Object Pascal   1.264%   -0.70%
# 15        16        12  change                     R   1.181%   -0.63%
# 16        17        13  change          Visual Basic   1.060%   -0.74%
# 17        18        19  change                    Go   1.009%   -0.17%
# 18        19        15  change                 Swift   0.978%   -0.56%
# 19        20        68  change                Groovy      NaN      NaN
#print(type(df)) #<class 'pandas.core.frame.DataFrame'>
#print(df.columns)
# Index(['Apr 2019', 'Apr 2018', 'Change', 'Programming Language', 'Ratings',
#        'Change.1'],
#       dtype='object')
#print(df.Ratings) #直接获取Ratings列 或者print(df["Ratings"])
# 0    15.035%
# 1    14.076%
# 2     8.838%
# 3     8.166%
# 4     5.795%
# 5     3.515%
# 6     2.507%
# 7     2.272%
# 8     2.239%
# Name: Ratings, dtype: object
#print(DataFrame(df,columns=["Programming Language","Ratings"]))#获取多列
#   Programming Language  Ratings
# 0                 Java  15.035%
# 1                    C  14.076%
# 2                  C++   8.838%
# 3               Python   8.166%
# 4    Visual Basic .NET   5.795%
# 5                   C#   3.515%
# 6           JavaScript   2.507%
# 7                  SQL   2.272%
# 8                  PHP   2.239%

#新增一列：Apr 2020（原数据中没有该列，所以用NaN填充）
#df_new = DataFrame(df,columns=["Programming Language","Ratings","Apr 2020"])
# print(df_new)
#   Programming Language  Ratings  Apr 2020
# 0                 Java  15.035%       NaN
# 1                    C  14.076%       NaN
# 2                  C++   8.838%       NaN
# 3               Python   8.166%       NaN
# 4    Visual Basic .NET   5.795%       NaN
# 5                   C#   3.515%       NaN
# 6           JavaScript   2.507%       NaN
# 7                  SQL   2.272%       NaN
# 8                  PHP   2.239%       NaN

# df_new["Apr 2020"] = range(9)
# print(df_new)#给新的一列赋值
#   Programming Language  Ratings  Apr 2020
# 0                 Java  15.035%         0
# 1                    C  14.076%         1
# 2                  C++   8.838%         2
# 3               Python   8.166%         3
# 4    Visual Basic .NET   5.795%         4
# 5                   C#   3.515%         5
# 6           JavaScript   2.507%         6
# 7                  SQL   2.272%         7
# 8                  PHP   2.239%         8

#把数组赋值过来
# df_new["Apr 2020"] = np.arange(9)
# print(df_new)
#   Programming Language  Ratings  Apr 2020
# 0                 Java  15.035%         0
# 1                    C  14.076%         1
# 2                  C++   8.838%         2
# 3               Python   8.166%         3
# 4    Visual Basic .NET   5.795%         4
# 5                   C#   3.515%         5
# 6           JavaScript   2.507%         6
# 7                  SQL   2.272%         7
# 8                  PHP   2.239%         8

#由于DataFrame的每一列本身都是Series，所以可以直接用Series给列赋值
# df_new["Apr 2020"] = pd.Series(np.arange(9))
# print(df_new)
#   Programming Language  Ratings  Apr 2020
# 0                 Java  15.035%         0
# 1                    C  14.076%         1
# 2                  C++   8.838%         2
# 3               Python   8.166%         3
# 4    Visual Basic .NET   5.795%         4
# 5                   C#   3.515%         5
# 6           JavaScript   2.507%         6
# 7                  SQL   2.272%         7
# 8                  PHP   2.239%         8

#只给部分行赋值：Series按index对齐，未出现的index对应的行为NaN
# df_new["Apr 2020"] = pd.Series([100,200],index=[1,2])
# print(df_new)
#   Programming Language  Ratings  Apr 2020
# 0                 Java  15.035%       NaN
# 1                    C  14.076%     100.0
# 2                  C++   8.838%     200.0
# 3               Python   8.166%       NaN
# 4    Visual Basic .NET   5.795%       NaN
# 5                   C#   3.515%       NaN
# 6           JavaScript   2.507%       NaN
# 7                  SQL   2.272%       NaN
# 8                  PHP   2.239%       NaN



# Sample records used by the Series/DataFrame examples: each key holds one
# column of three values, one entry per country.
data = dict(
    country=['belgium', 'India', 'Brazil'],
    Capital=['Brussels', 'New Delhi', 'Brasilia'],
    Population=[11190846, 1303171035, 207847528],
)
# s1 = pd.Series(data['country'])
# print(s1)
# 0    belgium
# 1      India
# 2     Brazil
# dtype: object
# print(s1.values)#['belgium' 'India' 'Brazil']
# print(s1.index)#RangeIndex(start=0, stop=3, step=1)
# s1 = pd.Series(data['country'],index=['A','B','C'])
# print(s1)
# A    belgium
# B      India
# C     Brazil
# dtype: object

#DataFrame
# df1 = DataFrame(data)
# print(df1)
#    country    Capital  Population
# 0  belgium   Brussels    11190846
# 1    India  New Delhi  1303171035
# 2   Brazil   Brasilia   207847528
# cou = df1["country"]
# print(type(cou)) #<class 'pandas.core.series.Series'>
# print(df1.iterrows()) #<generator object DataFrame.iterrows at 0x000000000B6F4DE0>
# for row in df1.iterrows():
#     print(row)
#     # (0, country        belgium
#     # Capital       Brussels
#     # Population    11190846
#     # Name: 0, dtype: object)
#     print(row[0])#0
#     print(row[1])
#     # country belgium
#     # Capital Brussels
#     # Population  11190846
#     # Name: 0, dtype: object
#     print(type(row[1])) #<class 'pandas.core.series.Series'>
#     break


# s1 = pd.Series(data["country"])
# s2 = pd.Series(data["Capital"])
# s3 = pd.Series(data["Population"])
#print(s1)
# 0    belgium
# 1      India
# 2     Brazil
# dtype: object
# df_new = DataFrame([s1,s2,s3],index=['country','Capital','Population'])
# df_new = df_new.T
# print(df_new)
#    country    Capital  Population
# 0  belgium   Brussels    11190846
# 1    India  New Delhi  1303171035
# 2   Brazil   Brasilia   207847528


#DataFrame的IO操作
# import webbrowser
# link = "http://pandas.pydata.org/pandas-docs/version/0.20/io.html"
# webbrowser.open(link)
#
# df1 = pd.read_clipboard()
#print(df1)
#   Format Type      Data Description          Reader        Writer
# 0         text                   CSV        read_csv        to_csv
# 1         text                  JSON       read_json       to_json
# 2         text                  HTML       read_html       to_html
# 3         text       Local clipboard  read_clipboard  to_clipboard
# 4       binary              MS Excel      read_excel      to_excel
# 5       binary           HDF5 Format        read_hdf        to_hdf
# 6       binary        Feather Format    read_feather    to_feather
# 7       binary               Msgpack    read_msgpack    to_msgpack
# 8       binary                 Stata      read_stata      to_stata
# 9       binary                   SAS        read_sas
# 10      binary  Python Pickle Format     read_pickle     to_pickle
# 11         SQL                   SQL        read_sql        to_sql
# 12         SQL      Google Big Query        read_gbq        to_gbq

# df1.to_csv("df1.csv",index=False)#去掉前面的index
# df2 = pd.read_csv("df1.csv")
# print(df2)
#   Format Type      Data Description          Reader        Writer
# 0         text                   CSV        read_csv        to_csv
# 1         text                  JSON       read_json       to_json
# 2         text                  HTML       read_html       to_html
# 3         text       Local clipboard  read_clipboard  to_clipboard
# 4       binary              MS Excel      read_excel      to_excel
# 5       binary           HDF5 Format        read_hdf        to_hdf
# 6       binary        Feather Format    read_feather    to_feather
# 7       binary               Msgpack    read_msgpack    to_msgpack
# 8       binary                 Stata      read_stata      to_stata
# 9       binary                   SAS        read_sas
# 10      binary  Python Pickle Format     read_pickle     to_pickle
# 11         SQL                   SQL        read_sql        to_sql
# 12         SQL      Google Big Query        read_gbq        to_gbq

# print(df1.to_json())
# {"Format":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},"Type":{"0":"CSV","1":"JSON","2":"HTML","3":"Local","4":"MS","5":"HDF5","6":"Feather","7":"Msgpack","8":"Stata","9":"SAS","10":"Python","11":"SQL","12":"Google"},"Data":{"0":"read_csv","1":"read_json","2":"read_html","3":"clipboard","4":"Excel","5":"Format","6":"Format","7":"read_msgpack","8":"read_stata","9":"read_sas","10":"Pickle","11":"read_sql","12":"Big"},"Description":{"0":"to_csv","1":"to_json","2":"to_html","3":"read_clipboard","4":"read_excel","5":"read_hdf","6":"read_feather","7":"to_msgpack","8":"to_stata","9":null,"10":"Format","11":"to_sql","12":"Query"},"Reader":{"0":null,"1":null,"2":null,"3":"to_clipboard","4":"to_excel","5":"to_hdf","6":"to_feather","7":null,"8":null,"9":null,"10":"read_pickle","11":null,"12":"read_gbq"},"Writer":{"0":null,"1":null,"2":null,"3":null,"4":null,"5":null,"6":null,"7":null,"8":null,"9":null,"10":"to_pickle","11":null,"12":"to_gbq"}}

# print(pd.read_json(df1.to_json()))
#    Format Type      Data Description          Reader        Writer
# 0         text                   CSV        read_csv        to_csv
# 1         text                  JSON       read_json       to_json
# 10      binary  Python Pickle Format     read_pickle     to_pickle
# 11         SQL                   SQL        read_sql        to_sql
# 12         SQL      Google Big Query        read_gbq        to_gbq
# 2         text                  HTML       read_html       to_html
# 3         text       Local clipboard  read_clipboard  to_clipboard
# 4       binary              MS Excel      read_excel      to_excel
# 5       binary           HDF5 Format        read_hdf        to_hdf
# 6       binary        Feather Format    read_feather    to_feather
# 7       binary               Msgpack    read_msgpack    to_msgpack
# 8       binary                 Stata      read_stata      to_stata
# 9       binary                   SAS        read_sas


#假设有movie_metadata.csv 文件
# imbd = pd.read_csv("movie_metadata.csv")
# print(imbd.shape)#(5043,28)
# print(imbd.head())
# print(imbd[["color","director_name"]])#取出两列数据

# sub_df = imbd[["director_name","movie_title","imdb_score"]]
# print(sub_df.head(5))
# print(sub_df.iloc[10:20,:])
# print(sub_df.iloc[10:20,0:2])
# print(sub_df.loc[10:20,:])#和iloc类似，但loc按标签切片且包含末端，所以多了第20行
# print(sub_df.loc[10:20,:"director_name"]) #列也可以使用列名（标签）进行切片


#Series Reindex
#s1 = Series([1,2,3,4],index=['A','B','C','D'])
# print(s1)
# A    1
# B    2
# C    3
# D    4
# dtype: int64

#print(s1.reindex(index=['A','B','C','D','E']))
# A    1.0
# B    2.0
# C    3.0
# D    4.0
# E    NaN
# dtype: float64

#print(s1.reindex(index=['A','B','C','D','E'],fill_value=10))
# A     1
# B     2
# C     3
# D     4
# E    10
# dtype: int64

#s2 = Series(['A','B','C'],index=[1,5,10])
#print(s2)
# 1     A
# 5     B
# 10    C
# dtype: object

#print(s2.reindex(index=range(15)))
# 0     NaN
# 1       A
# 2     NaN
# 3     NaN
# 4     NaN
# 5       B
# 6     NaN
# 7     NaN
# 8     NaN
# 9     NaN
# 10      C
# 11    NaN
# 12    NaN
# 13    NaN
# 14    NaN
# dtype: object

#print(s2.reindex(index=range(15),method="ffill"))
# 0     NaN
# 1       A
# 2       A
# 3       A
# 4       A
# 5       B
# 6       B
# 7       B
# 8       B
# 9       B
# 10      C
# 11      C
# 12      C
# 13      C
# 14      C
# dtype: object


#Reindex dataframe
# df1 = DataFrame(np.random.rand(25).reshape(5,5))
# print(df1)
#           0         1         2         3         4
# 0  0.150685  0.741189  0.642348  0.625132  0.318640
# 1  0.781998  0.793684  0.434840  0.053550  0.076352
# 2  0.657116  0.261819  0.089875  0.298170  0.035670
# 3  0.408057  0.550972  0.298262  0.734598  0.920229
# 4  0.707607  0.163687  0.861138  0.553325  0.439473

# df2 = DataFrame(np.random.rand(25).reshape(5,5),index=['A','B','D','E','F'],columns=['c1','c2','c3','c4','c5'])
# print(df2)
#         c1        c2        c3        c4        c5
# A  0.096956  0.687012  0.242486  0.106347  0.951611
# B  0.534206  0.555345  0.743860  0.156659  0.228296
# D  0.963385  0.648523  0.603671  0.904279  0.161911
# E  0.549797  0.987869  0.048364  0.706606  0.820717
# F  0.003817  0.923006  0.611485  0.986054  0.160444

# print(df2.reindex(index=['A','B','D','C','E','F']))
#          c1        c2        c3        c4        c5
# A  0.745011  0.621461  0.288680  0.177793  0.013119
# B  0.431538  0.170305  0.780363  0.007156  0.139781
# D  0.663396  0.807862  0.732135  0.347896  0.959864
# C       NaN       NaN       NaN       NaN       NaN
# E  0.145247  0.191087  0.811372  0.648703  0.697846
# F  0.742532  0.439197  0.612185  0.114661  0.221951

# print(df2.reindex(columns=['c1','c2','c3','c4','c5','c6']))
#          c1        c2        c3        c4        c5  c6
# A  0.287383  0.910655  0.418470  0.613704  0.200391 NaN
# B  0.942793  0.389105  0.619344  0.076861  0.474860 NaN
# D  0.945629  0.308200  0.165710  0.152989  0.552817 NaN
# E  0.876477  0.138687  0.838985  0.656992  0.773661 NaN
# F  0.866165  0.539998  0.500313  0.540542  0.002450 NaN

# print(df2.reindex(index=['A','B','D','C','E','F'],columns=['c1','c2','c3','c4','c5','c6']))
#          c1        c2        c3        c4        c5  c6
# A  0.978832  0.807321  0.366297  0.148317  0.308838 NaN
# B  0.905668  0.114278  0.368676  0.428269  0.162910 NaN
# D  0.930796  0.963658  0.902773  0.584296  0.295554 NaN
# C       NaN       NaN       NaN       NaN       NaN NaN
# E  0.101119  0.000268  0.301075  0.697321  0.121599 NaN
# F  0.402271  0.660168  0.477529  0.590062  0.459596 NaN

# print(df2.reindex(index=['A','B']))
#          c1        c2        c3        c4        c5
# A  0.855483  0.462398  0.282791  0.454249  0.027320
# B  0.223694  0.827418  0.368981  0.867265  0.471167

# print(df2.drop("A"))
#          c1        c2        c3        c4        c5
# B  0.047756  0.880659  0.744061  0.012340  0.216161
# D  0.603093  0.769085  0.526477  0.187897  0.991472
# E  0.159034  0.909088  0.765743  0.428868  0.972190
# F  0.239292  0.982104  0.802697  0.848463  0.503050

# print(df2.drop("A",axis=0))
#         c1        c2        c3        c4        c5
# B  0.474883  0.859859  0.594369  0.077369  0.616871
# D  0.562033  0.190256  0.882217  0.810458  0.855765
# E  0.545617  0.872125  0.406509  0.544556  0.718795
# F  0.944125  0.268808  0.070181  0.351121  0.040010

# print(df2.drop("c1",axis=1))
#          c2        c3        c4        c5
# A  0.404537  0.646484  0.319498  0.818558
# B  0.231232  0.132706  0.851948  0.061789
# D  0.067037  0.789874  0.368729  0.761373
# E  0.176873  0.294302  0.818214  0.284220
# F  0.378809  0.835109  0.124004  0.857353
python数据分析-03pandas库

猜你喜欢