# Python pandas入门 (Getting started with pandas)

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# A Series can be thought of as a fixed-length, ordered dictionary.
# Built from a list without an explicit index, the labels default to 0, 1, 2, ...
a = Series([1, 2, 3.0, 'abc'])
print(a)

# Same kind of construction, this time with explicit index labels.
b = Series(data=[1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
for view in (b, b.values, b.index):
    print(view)

# Built from a dict: the keys become the index labels.
c = Series({'a': 1, 'b': 2})
print(c)

# Both the Series itself and its index can be given a name.
c.name = 'cSeries'
c.index.name = 'cIndex'
print(c)

# A DataFrame is a tabular data structure holding an ordered collection of
# columns; each column may have its own value type (unlike an ndarray, which
# has a single dtype).
# It can be viewed as a set of Series that all share one index.
# The dict keys (state, year, pop) become the column names; the row index is
# still the default 0, 1, 2, ...
data = {
    'state': ['Ohino', 'Ohino', 'Ohino', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
df = DataFrame(data)
print(df)

# Full constructor: `data` is the dict, `index` labels each row and `columns`
# selects/orders the columns. A requested column that is missing from the
# data ('debt') is filled with NaN.
df = DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'state', 'pop', 'debt'],
)
print(df)
print(df.index)
print(df.columns)

# Every column is a Series.
print(type(df['pop']))
print(df['pop'])

# Reindexing a Series returns a new object; `b` itself is left untouched.
# The optional `fill_value` / `method` arguments decide what is stored for
# labels that do not exist in the original index.
print(b)

# 'a'..'d' are existing labels of `b`; 'e' is new and receives fill_value.
e = b.reindex(index=['b', 'd', 'a', 'c', 'e'], fill_value=0)
print(e)

# Same labels again, but the missing label is forward-filled instead.
e = b.reindex(index=['b', 'd', 'a', 'c', 'e'], method='ffill')
print(e)

# Reindexing a DataFrame: rows and/or columns can be reordered or extended.
f = DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'b', 'c'],
    columns=['c1', 'c2', 'c3'],
)
print(f)
f1 = f.reindex(columns=['c2', 'c3', 'c1'])  # change the column order
print(f1)
f3 = f.reindex(index=['c', 'b', 'a'])  # change the row order
print(f3)
# Reorder both axes and add a new row 'd'; its cells take fill_value.
f2 = f.reindex(index=['c', 'd', 'a', 'b'], columns=['c2', 'c3', 'c1'], fill_value=0)
print(f2)

# drop() returns a new object; the original is not modified.
# .drop(labels, axis=0): axis 0 = rows, axis 1 = columns; labels may be a list.
# Drop a row.
f4 = f.drop('c')  # axis defaults to 0
print(f4)
# Drop columns. `axis` must be passed by keyword: passing it positionally
# (f.drop(labels, 1)) was deprecated in pandas 1.3 and raises TypeError
# since pandas 2.0.
f5 = f.drop(['c1', 'c2'], axis=1)
print(f5)
f5 = f.drop('c1', axis=1)
print(f5)
# drop() works on a Series as well: remove the entries labelled 'a' and 'c'.
b1 = b.drop(labels=['a', 'c'])
print(b1)

# Slicing and indexing
g = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['a', 'b', 'c', 'd'],
    columns=['c1', 'c2', 'c3', 'c4'],
)
print(g)

# loc selects by label, iloc selects by integer position.
g1 = g.loc[:, :'c2']  # label slice: the end label 'c2' is included
print(g1)
g2 = g.iloc[:, :2]  # positional slice: the first two columns
print(g2)
print(g.iloc[1, 1])  # a single value by position
print(g.loc['b', 'c2'])  # the same cell by label

# Plain indexing with a label selects a column.
print(g['c1'])

# Combined with a boolean condition.
print(g['c2'] > 5)  # a boolean Series
g3 = g[g['c2'] > 5]  # keep the rows whose c2 value is greater than 5
print(g3)

# Plain indexing with a slice selects rows.
print(g[:'c'])

# Fetch a single row -- loc / iloc are the convenient tools for this.
print(g.loc['c', :])
# Fetch a single column.
print(g.loc[:, 'c1'])

# Arithmetic and data alignment
# For DataFrames, alignment happens on both the rows and the columns.
h1 = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['a', 'b', 'c', 'd'],
    columns=['c1', 'c2', 'c3', 'c4'],
)
h2 = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['b', 'c', 'd', 'e'],
    columns=['c1', 'c2', 'c3', 'c4'],
)
print(h1 + h2)
# With fill_value=0, a cell present in only one operand is treated as 0
# on the other side.
h3 = h1.add(h2, fill_value=0)
print(h3)

# Sorting: sort_index
# Ranking: rank

# Statistical methods
# mean, min, max, ...
print(h1.mean())  # mean of each column
print(h1.min())  # minimum of each column

# The `inplace` parameter
# Methods that modify the data and return a new object usually accept an
# optional inplace=False argument. Setting it to True replaces the original
# object's contents instead.
h1.drop(["c2"], axis=1)  # result is discarded; h1 is not changed
print(h1)
h1.drop(["c2"], axis=1, inplace=True)  # h1 itself is modified
print(h1)
# Python pandas入门

# Python pandas入门

# 猜你喜欢