1.Pandas库简介
-
-
它是一个处理结构化数据的工具集,用于数据分析
2.数据读取
import pandas as pd
#########################################################################
# Read a .csv file into a DataFrame, then show its contents and column dtypes.
# NOTE(review): the sample output below shows numeric-looking column labels
# (e.g. "0.000000000000000000e+00"), which suggests the file may have no header
# row; if so, pass header=None to read_csv -- confirm against the data file.
filecsv = pd.read_csv("E:\\test.csv")
print(filecsv)
print(filecsv.dtypes)
# help() prints the documentation itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
help(pd.read_csv)
#输出
'''
<class 'pandas.core.frame.DataFrame'>
0.000000000000000000e+00 float64
0.000000000000000000e+00.1 float64
0.000000000000000000e+00.2 float64
0.000000000000000000e+00.3 float64
0.000000000000000000e+00.4 float64
0.000000000000000000e+00.5 float64
1.000000000000000000e+00 float64
0.000000000000000000e+00.6 float64
0.000000000000000000e+00.7 float64
0.000000000000000000e+00.8 float64
dtype: object
0.000000000000000000e+00 ... 0.000000000000000000e+00.8
0 1.0 ... 1.0
1 0.0 ... 0.0
[2 rows x 10 columns]
Help on function read_csv in module pandas.io.parsers:
read_csv(filepath_or_buffer: Union[str, pathlib.Path, IO[~AnyStr]], sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression='infer', thousands=None, decimal: str = '.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, dialect=None, error_bad_lines=True, warn_bad_lines=True, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None)
'''
#########################################################################
# Show how many rows and columns the data has, as a (rows, columns) tuple
table_shape = filecsv.shape
print(table_shape)
#输出
'''
(29, 10)
'''
#########################################################################
# Select a single row (or several rows) by index label
row_15 = filecsv.loc[15]
print(row_15)
# To select several rows instead, use a label slice such as filecsv.loc[2:5]
#输出
'''
0.000000000000000000e+00 0.0
0.000000000000000000e+00.1 0.0
0.000000000000000000e+00.2 0.0
0.000000000000000000e+00.3 1.0
0.000000000000000000e+00.4 0.0
0.000000000000000000e+00.5 0.0
1.000000000000000000e+00 1.0
0.000000000000000000e+00.6 0.0
0.000000000000000000e+00.7 0.0
0.000000000000000000e+00.8 0.0
Name: 15, dtype: float64
'''
#########################################################################
#数据类型
#object
#int
#float
#datetime
#bool
3.数据预处理
import pandas as pd
#########################################################################
#基本运算 加减乘除
# + - * /
#########################################################################
# Largest and smallest value of a plain Python list
filehd = [1, 2, 23, 34, 13]
fmax, fmin = max(filehd), min(filehd)
# One value per line: maximum first, then minimum
print(fmax, fmin, sep="\n")
#输出
'''
34
1
'''
#########################################################################
# Sorting (using one column as the sort key)
filecsv = pd.read_csv("E:\\00Going on\\test.csv")
print(filecsv)
# by: name(s) of the column(s) to sort on; inplace=True reorders filecsv itself
# instead of returning a new DataFrame; ascending is left at its default here
# (ascending order).
filecsv.sort_values(by=["first"], inplace=True)
# Print immediately: the next in-place sort reorders the same frame again, so
# the ascending result would otherwise never be shown.
print(filecsv["first"])
# Same call on the second column, with ascending=False for descending order.
filecsv.sort_values(by=["second"], inplace=True, ascending=False)
print(filecsv["second"])
# Output
'''
first second third
0 35.5 1.77 1.23473
1 37.0 1.80 1.72170
2 31.5 1.57 1.32277
3 37.0 1.80 1.33770
4 37.0 1.80 1.47370
5 37.0 1.80 1.37470
2 31.5
0 35.5
1 37.0
3 37.0
4 37.0
5 37.0
Name: first, dtype: float64
1 1.80
3 1.80
4 1.80
5 1.80
0 1.77
2 1.57
Name: second, dtype: float64
'''
#########################################################################
# Missing-value handling
# Missing values: leave them out when aggregating
filecsv = pd.read_csv("E:\\00Going on\\test.csv")
print(filecsv)  # pandas displays missing values as NaN
# mean() leaves the NaN entries out and averages only the remaining values
fa_column = filecsv["fa"]
mean_fa = fa_column.mean()
print(mean_fa)
#输出
'''
fa sa ta
0 35.5 1.77 1.23473
1 NaN 1.80 1.72170
2 31.5 1.57 1.32277
3 37.0 1.80 1.33770
4 NaN 1.80 1.47370
5 37.0 1.80 1.37470
35.25
'''
# Missing values: drop the samples (rows) that contain them, via .dropna
filecsv = pd.read_csv("E:\\00Going on\\000编程项目\\0001Python File\\00003Python入门\\test.csv")
print(filecsv)  # pandas displays missing values as NaN
# axis=0 drops rows; subset limits the missing-value check to the "fa" column,
# so a row is removed only when its "fa" entry is missing
drop_fa = filecsv.dropna(
    subset=["fa"],
    axis=0,
)
print(drop_fa)
#输出
'''
indexs fa sa ta
0 1 35.5 1.77 1.23473
1 2 NaN 1.80 1.72170
2 1 31.5 1.57 1.32277
3 1 37.0 1.80 1.33770
4 3 NaN 1.80 1.47370
5 2 37.0 1.80 1.37470
indexs fa sa ta