python pandas笔记1

1. pandas的Series使用介绍

1.1 Series数据结构

import pandas as pd
#pd.Series?
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)
#output: 
"""
0    Tiger
1     Bear
2    Moose
dtype: object
"""
numbers = [1, 2, None]
pd.Series(numbers)
#output:
"""
0    1.0
1    2.0
2    NaN
dtype: float64
"""

1.2 numpy中的NaN（NaN与任何值都不相等，包括它自己；判断缺失值要用np.isnan）

import numpy as np
np.nan == None
#output: False
np.nan == np.nan
#output: False
np.isnan(np.nan)
#output: True

1.3 字典和Series使用

sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s
#output:
"""
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object
"""
#索引
s.index
"""
Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')
"""
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s
#output:
"""
India      Tiger
America     Bear
Canada     Moose
dtype: object
"""

sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports, index=['Golf', 'Sumo', 'Hockey'])
s
#output:
"""
Golf      Scotland
Sumo         Japan
Hockey         NaN
dtype: object
"""

2. 对Series的索引操作

sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s
# output:
"""
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object
"""

2.1 iloc 和 loc 索引器（它们是属性而不是方法，使用方括号[]而不是圆括号()）

s.iloc[3] #数字索引
#output: 'South Korea'
s.loc['Golf'] #键值
#output : 'Scotland'

2.2 关于向量化操作

#this creates a big series of random numbers
s = pd.Series(np.random.randint(0,1000,10000))
s.head()
#output: 
"""
0    396
1    779
2    752
3     30
4    493
dtype: int64
"""

时间对比 :

import numpy as np
%%timeit -n 100
summary = 0
for item in s:
    summary+=item
# 100 loops, best of 3: 1.87 ms per loop

#向量化操作
%%timeit -n 100
summary = np.sum(s)
# 100 loops, best of 3: 100 µs per loop

#broadcasting操作
s+=2 #adds two to each item in s using broadcasting
s.head()
"""
0    398
1    781
2    754
3     32
4    495
dtype: int64
"""

迭代：

for label, value in s.items():  # 注：iteritems()已在pandas 2.0中移除，改用items()
    s.at[label] = value + 2  # 注：set_value()已在pandas 1.0中移除，改用.at按标签赋值
s.head()
#output:
"""
0    400
1    783
2    756
3     34
4    497
dtype: int64
"""

时间对比：

#迭代的方法
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.items():  # 注：iteritems()已在pandas 2.0中移除，改用items()
    s.loc[label]= value+2
#时间: 10 loops, best of 3: 1.62 s per loop
# broadcasting 
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
s+=2
# 10 loops, best of 3: 472 µs per loop

2.3 元素操作

s = pd.Series([1, 2, 3])
s.loc['Animal'] = 'Bears'
s
#output
"""
0             1
1             2
2             3
Animal    Bears
dtype: object
"""

2.4 Series的append()方法

original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'], 
                                   index=['Cricket',
                                          'Cricket',
                                          'Cricket',
                                          'Cricket'])
# 注：Series.append()已在pandas 2.0中移除；pd.concat()同样返回一个新的Series，不修改原对象
all_countries = pd.concat([original_sports, cricket_loving_countries])

original_sports未发生改变

#未发生变化
print(original_sports)
"""
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object
"""

cricket_loving_countries的值：

print(cricket_loving_countries)
"""
Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object
"""

all_countries的值

print(all_countries)
"""
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object
"""

print(all_countries.loc['Cricket'])
"""
Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object
"""

3. DataFrame数据结构

3.1 DataFrame可以看做是由多个Series组成的二维表格。

import pandas as pd
purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00})
df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])
print(df.head())
"""
         Cost Item Purchased   Name
Store 1  22.5       Dog Food  Chris
Store 1   2.5   Kitty Litter  Kevyn
Store 2   5.0      Bird Seed  Vinod
"""

3.2 loc()操作

print(df.loc['Store 2'])
"""
Cost                      5
Item Purchased    Bird Seed
Name                  Vinod
Name: Store 2, dtype: object
"""

print(df.loc['Store 1'])
"""
         Cost Item Purchased   Name
Store 1  22.5       Dog Food  Chris
Store 1   2.5   Kitty Litter  Kevyn
"""

df.loc['Store 1', 'Cost']
"""
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64
"""

3.3 转置

print(df.T)
"""
                 Store 1       Store 1    Store 2
Cost                22.5           2.5          5
Item Purchased  Dog Food  Kitty Litter  Bird Seed
Name               Chris         Kevyn      Vinod
"""

print(df.T.loc['Cost'])
"""
Store 1    22.5
Store 1     2.5
Store 2       5
Name: Cost, dtype: object
"""
print(df['Cost'])
"""
Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64
"""
print(df.loc['Store 1']['Cost'])
"""
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64
"""
print(df.loc[:,['Name', 'Cost']])
"""
          Name  Cost
Store 1  Chris  22.5
Store 1  Kevyn   2.5
Store 2  Vinod   5.0
"""

3.4 关于drop()方法

print(df.drop('Store 1'))
"""
         Cost Item Purchased   Name
Store 2   5.0      Bird Seed  Vinod
"""
#但是原来的df没有发生变化
print(df)
"""
         Cost Item Purchased   Name
Store 1  22.5       Dog Food  Chris
Store 1   2.5   Kitty Litter  Kevyn
Store 2   5.0      Bird Seed  Vinod
"""

3.5 copy()方法

copy_df = df.copy()
copy_df = copy_df.drop('Store 1')
print(copy_df)
"""
         Cost Item Purchased   Name
Store 2   5.0      Bird Seed  Vinod
"""
# copy_df.drop?

3.6 del 操作和加列操作

del copy_df['Name']
print(copy_df) 
"""
         Cost Item Purchased
Store 2   5.0      Bird Seed
"""
df['Location'] = None
print(df)
"""
         Cost Item Purchased   Name Location
Store 1  22.5       Dog Food  Chris     None
Store 1   2.5   Kitty Litter  Kevyn     None
Store 2   5.0      Bird Seed  Vinod     None
"""

猜你喜欢