python_数据_pandas_1

pandas

start

pandas_Series

import numpy as np
from pandas import Series,DataFrame

声明一个Series类型的数据

sr = Series(np.random.normal(175,size=10),index=list('abcdefghij'))
sr
  • a 177.955666
    b 175.659703
    c 174.452552
    d 175.730355
    e 173.443535
    f 175.653806
    g 174.199460
    h 174.011293
    i 175.902427
    j 176.296591
    dtype: float64
sr['a']
sr[1]
sr.loc['a']
sr.iloc[1]
sr[['a','b']]
sr[[1,2]]
sr.loc[['a','b']]
sr.iloc[[1,2]]

pandas_DataFrame

声明与定义

df = DataFrame(data = np.random.randint(0,150,size=(10,3)),index=list('ABCDEFGHIJ'),columns=['Python','Math','Chinese'])
# 列是属性,行是样本
df2 = DataFrame(data={
    'python':np.random.randint(0,150,size = 10),
    'xiaoming':np.random.randint(0,150,size = 10),
    'xiaohong':np.random.randint(0,150,size = 10),
})
df3 = DataFrame(data={
    'python':np.random.randint(0,150,size = 10),
    'xiaoming':np.random.randint(0,150,size = 10),
    'xiaohong':np.random.randint(0,150,size = 10),
    },
    index = list('xiaobingab')
)
df4 = DataFrame(data={
    'python':np.random.randint(0,150,size = 10),
    'xiaoming':np.random.randint(0,150,size = 10),
    'xiaohong':np.random.randint(0,150,size = 10),
    },
    index = list('xiaobingab'),
    columns = ['A', 'xiaoming', 'C', 'D']
)
display(df2,df3,df4)  # 行索引不指定,则用自然数填充

查询

display(type(df2['python']))
df2['python']    # pandas.core.series.Series
df2[['python']]   # 仍然是DataFrame
df2[['python','xiaoming']]    # 相当于定向的切片
df2.python
df2.loc[1]   # 这里的1是index索引名
df2.iloc[1]   # 这里的1是索引顺序为1(从0开始)
df4 = df2.transpose()   # 交换index 与 column   (转置)
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| python | 17 | 110 | 115 | 79 | 52 | 121 | 137 | 41 | 29 | 37 |
| xiaoming | 41 | 104 | 48 | 96 | 108 | 85 | 148 | 42 | 40 | 1 |
| xiaohong | 128 | 48 | 0 | 96 | 5 | 115 | 23 | 64 | 51 | 66 |

df4[2]['python']   # 115
df4.loc['python'][1]  # 110
df4.loc['python',1]   # 110
df4.iloc[2,1]    # 48
data = df4.values
data
  • array([[ 17, 110, 115, 79, 52, 121, 137, 41, 29, 37],
    [ 41, 104, 48, 96, 108, 85, 148, 42, 40, 1],
    [128, 48, 0, 96, 5, 115, 23, 64, 51, 66]])

切片

df5 = df4.iloc[:,2:]
df5 = df4.loc[:,0:5]
df4['python':'xiaoming']    # *** 行切片
df4[1:2]

df1 = DataFrame(np.random.normal(100,scale=20,size=(6,3)),index = list('zxcvnm'),columns=['qq','wechat','email'])
df1
  • qq wechat email
    z 92.386231 88.978414 115.526063
    x 110.847067 105.462850 131.447248
    c 88.559184 75.191029 113.287049
    v 82.812931 105.350996 118.224760
    n 106.242152 114.609146 124.179372
    m 100.107778 85.918584 74.234945
df2 = df.add(df1,fill_value=0)
df2
  • Chinese Math Python email qq wechat
    A 84.0 130.0 95.0 NaN NaN NaN
    B 39.0 69.0 149.0 NaN NaN NaN
    C 71.0 16.0 146.0 NaN NaN NaN
    D 89.0 131.0 7.0 NaN NaN NaN
    E 57.0 30.0 1.0 NaN NaN NaN
    F 89.0 85.0 55.0 NaN NaN NaN
    G 22.0 134.0 99.0 NaN NaN NaN
    H 20.0 40.0 28.0 NaN NaN NaN
    I 90.0 95.0 106.0 NaN NaN NaN
    J 29.0 54.0 106.0 NaN NaN NaN
    c NaN NaN NaN 113.287049 88.559184 75.191029
    m NaN NaN NaN 74.234945 100.107778 85.918584
    n NaN NaN NaN 124.179372 106.242152 114.609146
    v NaN NaN NaN 118.224760 82.812931 105.350996
    x NaN NaN NaN 131.447248 110.847067 105.462850
    z NaN NaN NaN 115.526063 92.386231 88.978414
df2['A':'E'] = df2.loc['A':'E']/2 +25
Python Operator Pandas Method(s)
+ add()
- sub(), subtract()
* mul(), multiply()
/ truediv(), div(), divide()
// floordiv()
% mod()
** pow()

NaN

type(np.NaN)  # float
a = np.array([1,3,np.NaN,np.NaN,6])
a
  • array([ 1., 3., nan, nan, 6.])
np.sum(a)    # nan
np.nansum(a)   # 10    此方法会忽略nan 值

DataFrame中的NaN

df = DataFrame(np.random.normal(100,scale=20,size=(10,3)),index=list('ABCDEFGHIJ'),columns=['Python','Math','Eng'])
df
  • Python Math Eng
    A 107.949245 113.838169 83.597526
    B 87.672478 110.768434 70.705457
    C 115.130985 117.322252 96.552291
    D 124.733534 104.409723 67.797502
    E 69.930626 113.678532 107.663599
    F 96.381046 92.938791 98.805056
    G 95.943738 115.415718 131.666832
    H 83.262271 112.149921 98.384134
    I 129.727339 102.010093 107.216974
    J 101.591134 106.435605 88.042077
df['Python']['F'] = np.NaN
df.iloc[8,2] = np.NaN
df.loc['C','Math'] = np.NaN
df.isnull().any()
  • Python True
    Math True
    Eng True
    dtype: bool
df.isnull().any(axis = 1)  # 判断哪一行有空数据
  • A False
    B False
    C True
    D False
    E False
    F True
    G False
    H False
    I True
    J False
    dtype: bool
cond = df.notnull().all(axis=1)
df[cond]
  • 所有不含空值的行(按index筛选)
  • Python Math Eng
    A 107.949245 113.838169 83.597526
    B 87.672478 110.768434 70.705457
    D 124.733534 104.409723 67.797502
    E 69.930626 113.678532 107.663599
    G 95.943738 115.415718 131.666832
    H 83.262271 112.149921 98.384134
    J 101.591134 106.435605 88.042077
pandas中None与np.nan都被视作缺失值(NaN)
fillna() 与 dropna()
df.dropna()   # 过滤丢失数据的样本
  • Python Math Eng
    A 107.949245 113.838169 83.597526
    B 87.672478 110.768434 70.705457
    D 124.733534 104.409723 67.797502
    E 69.930626 113.678532 107.663599
    G 95.943738 115.415718 131.666832
    H 83.262271 112.149921 98.384134
    J 101.591134 106.435605 88.042077
df.fillna(value=df.mean())   # 填充平均值
  • Python Math Eng
    A 107.949245 113.838169 83.597526
    B 87.672478 110.768434 70.705457
    C 115.130985 107.960554 96.552291
    D 124.733534 104.409723 67.797502
    E 69.930626 113.678532 107.663599
    F 101.771261 92.938791 98.805056
    G 95.943738 115.415718 131.666832
    H 83.262271 112.149921 98.384134
    I 129.727339 102.010093 93.690497
    J 101.591134 106.435605 88.042077
当然也可以填充中位数等等
df.median()    # 中位数

Python 101.6
Math 110.8
Eng 96.6
dtype: float64

猜你喜欢

转载自blog.csdn.net/sinat_39045958/article/details/86513821