In [2]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
有两种丢失数据:
- None
- np.nan(NaN)
In [3]:
type(None)
Out[3]:
NoneType
In [4]:
type(np.nan)
Out[4]:
float
In [5]:
type(1000)
Out[5]:
int
In [6]:
type("hello")
Out[6]:
str
In [7]:
np.nan + 100
Out[7]:
nan
In [8]:
100 + "dsaf"
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-8-fb724d538b75> in <module>() ----> 1 100 + "dsaf" TypeError: unsupported operand type(s) for +: 'int' and 'str'
In [ ]:
None + 100
object类型的运算要比int类型的运算慢得多
计算不同数据类型求和时间
%timeit np.arange(1e5,dtype=xxx).sum()
In [ ]:
1e5
In [ ]:
%timeit np.arange(1e6,dtype="int").sum()
In [ ]:
%timeit np.arange(1e6,dtype="float").sum()
In [ ]:
%timeit np.arange(1e6,dtype="object").sum()
np.nan是浮点类型,能参与到计算中。但计算的结果总是NaN。
但可以使用np.nan*()系列函数(如np.nansum、np.nanmean)来计算,此时会忽略nan:nansum相当于把nan视为0,nanmean则只对非nan的元素求平均。
In [ ]:
nd = np.array([10,20,30,np.nan])
nd
In [ ]:
nd.sum()
In [ ]:
np.nansum(nd)
In [ ]:
np.nanmean(nd)
In [ ]:
np.array([1,2,3,np.nan,None])
In [ ]:
创建DataFrame
In [9]:
df = DataFrame([[10,20,34,None,23,np.nan],
[10,20,134,312,None,20],
[20,56,98,np.nan,np.nan,None]
],
index=list("abc"),
columns=list("ABCDEF"))
df
Out[9]:
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
a | 10 | 20 | 34 | NaN | 23.0 | NaN |
b | 10 | 20 | 134 | 312.0 | NaN | 20.0 |
c | 20 | 56 | 98 | NaN | NaN | NaN |
使用DataFrame行索引与列索引修改DataFrame数据
In [10]:
df.sum(axis=0) # axis=0: add up the rows, giving one total per column (missing values are skipped)
Out[10]:
A 40.0 B 96.0 C 266.0 D 312.0 E 23.0 F 20.0 dtype: float64
In [11]:
df.sum(axis=1) # axis=1: add up the columns, giving one total per row
Out[11]:
a 87.0 b 496.0 c 174.0 dtype: float64
【注】pandas的聚合运算(如sum)默认会跳过nan(skipna=True),因此求和时nan相当于被视作0
- isnull()
- notnull()
- dropna(): 过滤丢失数据
- fillna(): 填充丢失数据
(1)判断函数
isnull()
notnull()
In [12]:
df
Out[12]:
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
a | 10 | 20 | 34 | NaN | 23.0 | NaN |
b | 10 | 20 | 134 | 312.0 | NaN | 20.0 |
c | 20 | 56 | 98 | NaN | NaN | NaN |
In [13]:
df.isnull()
Out[13]:
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
a | False | False | False | True | False | True |
b | False | False | False | False | True | False |
c | False | False | False | True | True | True |
In [14]:
df.isnull().all(axis=0) # per column: True only when every entry in that column is missing
Out[14]:
A False B False C False D False E False F False dtype: bool
In [15]:
df.isnull().any(axis=0)
# Per column: True as soon as at least one entry in that column is missing.
Out[15]:
A False B False C False D True E True F True dtype: bool
In [16]:
df.isnull().all(axis=1)
Out[16]:
a False b False c False dtype: bool
In [17]:
df.isnull().any(axis=1)
Out[17]:
a True b True c True dtype: bool
In [18]:
# Column mask (indexed by column label): True for each column that contains
# at least one missing value. Reducing along axis=0 yields one flag per column,
# so this mask selects columns, not rows.
cond = df.isnull().any(axis=0)
cond
Out[18]:
A False B False C False D True E True F True dtype: bool
In [19]:
# Show the rows that contain at least one missing value.
# Row selection needs a boolean mask indexed by the row labels, so isnull()
# is reduced along axis=1 (one flag per row). A mask reduced along axis=0 is
# indexed by column labels, cannot be aligned with the rows, and makes pandas
# raise IndexingError.
df[df.isnull().any(axis=1)]
In [20]:
df.loc[:, cond] # keep all rows, show only the columns that contain a missing value
Out[20]:
D | E | F | |
---|---|---|---|
a | NaN | 23.0 | NaN |
b | 312.0 | NaN | 20.0 |
c | NaN | NaN | NaN |
判断条件
In [21]:
(df>30).any(axis=0)
Out[21]:
A False B True C True D True E False F False dtype: bool
In [22]:
(df<50).all(axis=1)
Out[22]:
a False b False c False dtype: bool
In [23]:
# Mask of the columns in which every value is greater than 20
# (a NaN never compares greater, so any column with a missing value is False).
c = (df>20).all(axis=0)
In [24]:
df[df.columns[c]]
Out[24]:
C | |
---|---|
a | 34 |
b | 134 |
c | 98 |
In [ ]:
(2) 过滤函数
dropna()
可以选择过滤的是行还是列(默认为行)
也可以选择过滤的方式 how = 'all'
In [25]:
df
Out[25]:
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
a | 10 | 20 | 34 | NaN | 23.0 | NaN |
b | 10 | 20 | 134 | 312.0 | NaN | 20.0 |
c | 20 | 56 | 98 | NaN | NaN | NaN |
In [26]:
df.dropna(axis=1,how="any")
# axis: 0 (the default) drops rows, 1 drops columns.
# how: "any" (the default) drops as soon as one value is missing; "all" drops only when every value is missing.
Out[26]:
A | B | C | |
---|---|---|---|
a | 10 | 20 | 34 |
b | 10 | 20 | 134 |
c | 20 | 56 | 98 |
(3) 填充函数 Series/DataFrame
fillna()
In [27]:
df
Out[27]:
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
a | 10 | 20 | 34 | NaN | 23.0 | NaN |
b | 10 | 20 | 134 | 312.0 | NaN | 20.0 |
c | 20 | 56 | 98 | NaN | NaN | NaN |
In [28]:
df.fillna(1000) # replace every missing value with the constant 1000
Out[28]:
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
a | 10 | 20 | 34 | 1000.0 | 23.0 | 1000.0 |
b | 10 | 20 | 134 | 312.0 | 1000.0 | 20.0 |
c | 20 | 56 | 98 | 1000.0 | 1000.0 | 1000.0 |
可以选择前向填充还是后向填充
In [29]:
# Back-fill down each column: a missing value takes the next valid value from
# a later row of the same column; trailing gaps with nothing after them stay NaN.
# DataFrame.bfill replaces fillna(method="bfill"), whose `method` keyword is deprecated.
df.bfill(axis=0)
Out[29]:
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
a | 10 | 20 | 34 | 312.0 | 23.0 | 20.0 |
b | 10 | 20 | 134 | 312.0 | NaN | 20.0 |
c | 20 | 56 | 98 | NaN | NaN | NaN |
In [30]:
# Forward-fill down each column: a missing value takes the last valid value from
# an earlier row of the same column; leading gaps with nothing before them stay NaN.
# DataFrame.ffill replaces fillna(method="ffill"), whose `method` keyword is deprecated.
df.ffill(axis=0)
Out[30]:
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
a | 10 | 20 | 34 | NaN | 23.0 | NaN |
b | 10 | 20 | 134 | 312.0 | 23.0 | 20.0 |
c | 20 | 56 | 98 | 312.0 | 23.0 | 20.0 |
In [31]:
# Forward-fill along each row: a missing value takes the last valid value from
# the column to its left in the same row.
# DataFrame.ffill replaces fillna(method="ffill"), whose `method` keyword is deprecated.
df.ffill(axis=1)
Out[31]:
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
a | 10.0 | 20.0 | 34.0 | 34.0 | 23.0 | 23.0 |
b | 10.0 | 20.0 | 134.0 | 312.0 | 312.0 | 20.0 |
c | 20.0 | 56.0 | 98.0 | 98.0 | 98.0 | 98.0 |
对于DataFrame来说,还要选择填充的轴axis。记住,对于DataFrame来说:
- axis=0:index/行
- axis=1:columns/列
============================================
练习7:
简述None与NaN的区别
假设张三李四参加模拟考试,但张三因为突然想明白人生放弃了英语考试,因此记为None,请据此创建一个DataFrame,命名为ddd3
老师决定用数学的分数填充张三的英语成绩,如何实现?如果改用李四的英语成绩填充张三的英语成绩,又该如何实现?
============================================
In [ ]: