pandas模块是python用于数据导入与整理的模块，对数据挖掘前期数据的处理工作十分有用。
pandas模块的数据结构主要有两种：
1.Series
2.DataFrame

Series

注：以下示例中pandas均简称为pd(import pandas as pd)，numpy简称为np(import numpy as np)，并使用标准库的string模块(import string)

介绍：
Series结构是一个带标签的一维数组，类似于python里面的字典key-value结构：索引(index)相当于key，数据相当于value。
常用方法：

创建Series对象

import string

import numpy as np
import pandas as pd

# Build a Series from a plain list. When no index is given the labels
# default to 0, 1, 2, ... and dtype reports the element type.
array = ["粉条", "粉丝", "粉带"]
s1 = pd.Series(data=array)
print(s1)
# 0    粉条
# 1    粉丝
# 2    粉带
# dtype: object

# Same data, this time with explicit index labels.
ss1 = pd.Series(data=array, index=["A", "B", "C"])
print(ss1)
# A    粉条
# B    粉丝
# C    粉带
# dtype: object

# Build a Series from a NumPy ndarray of five random samples.
n = np.random.randn(5)
s2 = pd.Series(data=n)
print(s2)
# Example output (the values are random and differ on every run):
# 0   -1.387049
# 1   -0.527612
# 2   -0.389382
# 3    0.549090
# 4    0.122328
# dtype: float64

# Convert the element dtype. The builtin int is used because the np.int
# alias no longer exists in current NumPy; the float -> int conversion
# truncates toward zero.
ss2 = s2.astype(int)
print(ss2)
# For the sample values above (int64 on most platforms):
# 0   -1
# 1    0
# 2    0
# 3    0
# 4    0
# dtype: int64

# Build a Series from a dict: the keys become the index labels.
# (Named so that it does not shadow the builtin dict.)
letter_to_number = {
    letter: position
    for position, letter in enumerate(string.ascii_lowercase[:10])
}
s3 = pd.Series(letter_to_number)
print(s3)
# a    0
# b    1
# c    2
# d    3
# e    4
# f    5
# g    6
# h    7
# i    8
# j    9
# dtype: int64

对Series元素进行操作

# Start from a Series that actually holds the sample data: an empty
# Series cannot accept the three-label index assigned below.
array = ["粉条", "粉丝", "粉带"]
s1 = pd.Series(data=array)

# Replace the default 0, 1, 2, ... index with custom labels.
s1.index = ["A", "B", "C"]
print(s1)
# A    粉条
# B    粉丝
# C    粉带
# dtype: object

# Concatenate two Series end to end. Series.append was removed in
# pandas 2.0; pd.concat is the supported way to do this.
s2 = pd.Series(data=array)
s3 = pd.concat([s1, s2])
print(s3)
# A    粉条
# B    粉丝
# C    粉带
# 0    粉条
# 1    粉丝
# 2    粉带
# dtype: object

# Remove the element stored under a given label (drop returns a new Series).
s3 = s3.drop("C")
print(s3)
# A    粉条
# B    粉丝
# 0    粉条
# 1    粉丝
# 2    粉带
# dtype: object

# Look up an element by its label.
print(s3["B"])
# 粉丝

# Positional slicing. The index now mixes strings and integers, so .iloc
# is used to make it explicit that these slices count positions, not labels.
print(s3.iloc[:2])
# A    粉条
# B    粉丝
# dtype: object

# A step of -1 walks the Series backwards.
print(s3.iloc[::-1])
# 2    粉带
# 1    粉丝
# 0    粉条
# B    粉丝
# A    粉条
# dtype: object

# The last two elements.
print(s3.iloc[-2:])
# 1    粉丝
# 2    粉带
# dtype: object

Series运算(Series中的很多运算与numpy的方法一致)

# Two integer Series whose letter indexes overlap only on c, d and e.
letters = string.ascii_lowercase
s1 = pd.Series(np.arange(5), index=list(letters[:5]))
s2 = pd.Series(np.arange(2, 8), index=list(letters[2:8]))
for series in (s1, s2):
    print(series)
"""
a    0
b    1
c    2
d    3
e    4
dtype: int64
c    2
d    3
e    4
f    5
g    6
h    7
dtype: int64
"""

# Arithmetic is aligned on index labels; a label present in only one
# operand yields NaN.
# Addition: the operator and the named method give the same result.
for added in (s1 + s2, s1.add(s2)):
    print(added)
"""
a    NaN
b    NaN
c    4.0
d    6.0
e    8.0
f    NaN
g    NaN
h    NaN
dtype: float64

"""
# Subtraction, multiplication and division follow the same pattern:
# each pair is the operator form followed by the method form.
for via_operator, via_method in (
    (s1 - s2, s1.sub(s2)),
    (s1 * s2, s1.mul(s2)),
    (s1 / s2, s1.div(s2)),
):
    print(via_operator)
    print(via_method)

# Median
print(s1)
middle = s1.median()
print(middle)
"""
a    0
b    1
c    2
d    3
e    4
dtype: int64
2.0
"""

# Sum, maximum and minimum, in that order.
for reducer in (s1.sum, s1.max, s1.min):
    print(reducer())

特殊的where方法（Series.where(cond, other)保留满足条件的元素、替换不满足条件的元素；而numpy.where(cond, x, y)在条件成立处取x，两者的用法刚好相反）

# Integer Series 0..4 labelled a..e.
labels = list(string.ascii_lowercase[:5])
s1 = pd.Series(np.arange(5), index=labels)
print(s1)
"""
a    0
b    1
c    2
d    3
e    4
dtype: int64
"""

# where() keeps the elements that satisfy the condition; with no
# replacement value the remaining elements become NaN.
above_three = s1 > 3
kept = s1.where(above_three)
print(kept)
"""
a    NaN
b    NaN
c    NaN
d    NaN
e    4.0
dtype: float64
"""

# With a replacement value, the elements that are NOT greater than three
# are overwritten -- the reverse of how numpy.where is used.
refilled = s1.where(above_three, 10)
print(refilled)
"""
a    10
b    10
c    10
d    10
e     4
dtype: int64
"""

# mask() is the counterpart: it overwrites the elements that DO satisfy
# the condition (here, those greater than three).
masked = s1.mask(above_three, 10)
print(masked)
"""
a     0
b     1
c     2
d     3
e    10
dtype: int64

"""

DataFrame数据类型

DataFrame是一种表格型的数据结构,包含行索引和列索引,是二维数据结构。

创建DataFrame数据

# Build a DataFrame from a nested list: each inner list is one row.
li = [
    [1, 2, 3, 4],
    [2, 3, 4, 5],
]
# A DataFrame carries two indexes: the row index (axis=0) and the
# column index (axis=1). The row labels are written without padding
# spaces so that they match the printed output and label lookups.
d1 = pd.DataFrame(
    data=li,
    index=["A", "B"],
    columns=["views", "loves", "comments", "tranfers"],
)
print(d1)
#    views  loves  comments  tranfers
# A      1      2         3         4
# B      2      3         4         5

# Build a DataFrame from a 2-D NumPy array.
narr = np.arange(8).reshape(2, 4)
d2 = pd.DataFrame(
    data=narr,
    index=["A", "B"],
    columns=["views", "loves", "comments", "tranfers"],
)
print(d2)
#    views  loves  comments  tranfers
# A      0      1         2         3
# B      4      5         6         7

# Build a DataFrame from a dict: each key is a column name and each value
# holds that column's data. (Named so it does not shadow the builtin dict.)
data_by_column = {
    "views": [1, 2],
    "loves": [2, 3],
    "comments": [3, 4],
}
d3 = pd.DataFrame(data=data_by_column, index=["粉条", "粉丝"])
print(d3)
#     views  loves  comments
# 粉条      1      2         3
# 粉丝      2      3         4

# Row index: six consecutive timestamps starting from today.
dates = pd.date_range(start="today", periods=6)
# Data: a 6 x 4 array of random samples.
data_arr = np.random.randn(6, 4)
# Column index.
columns = ["A", "B", "C", "D"]
d4 = pd.DataFrame(data_arr, index=dates, columns=columns)
print(d4)

# Build a range of dates; freq sets the step between entries ("D" = one day).
dates = pd.date_range(start="1/1/2019", end="12/31/2019", freq="D")
print(dates)
# DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
#                '2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08',
#                '2019-01-09', '2019-01-10',
#                ...
#                '2019-12-22', '2019-12-23', '2019-12-24', '2019-12-25',
#                '2019-12-26', '2019-12-27', '2019-12-28', '2019-12-29',
#                '2019-12-30', '2019-12-31'],
#               dtype='datetime64[ns]', length=365, freq='D')

DataFrame的基本操作

# Sample frame: the integers 0..7 arranged as 2 rows x 4 columns.
narr = np.arange(8).reshape(2, 4)
d2 = pd.DataFrame(
    data=narr,
    index=["A", "B"],
    columns=["views", "loves", "comments", "tranfers"],
)

# Basic attributes.
print(d2.shape)    # (number of rows, number of columns)
print(d2.dtypes)   # dtype of each column
print(d2.ndim)     # number of dimensions
print(d2.index)    # row index
print(d2.columns)  # column index
print(d2.values, type(d2.values))  # the data as a 2-D ndarray

# A quick look at the data.
print(d2.head(1))  # first rows; five by default
print(d2.tail(1))  # last rows; five by default

# Summary of rows, columns, column dtypes and memory usage.
# info() prints the report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
d2.info()
# Example output (the exact layout varies between pandas versions):
# <class 'pandas.core.frame.DataFrame'>
# Index: 2 entries, A to B
# Data columns (total 4 columns):
# views       2 non-null int64
# loves       2 non-null int64
# comments    2 non-null int64
# tranfers    2 non-null int64
# dtypes: int64(4)
# memory usage: 80.0+ bytes

# Descriptive statistics per column: count, mean, standard deviation,
# minimum, 25% / 50% (median) / 75% quantiles and maximum.
print(d2.describe())
#           views     loves  comments  tranfers
# count  2.000000  2.000000  2.000000  2.000000
# mean   2.000000  3.000000  4.000000  5.000000
# std    2.828427  2.828427  2.828427  2.828427
# min    0.000000  1.000000  2.000000  3.000000
# 25%    1.000000  2.000000  3.000000  4.000000
# 50%    2.000000  3.000000  4.000000  5.000000
# 75%    3.000000  4.000000  5.000000  6.000000
# max    4.000000  5.000000  6.000000  7.000000

# Transpose rows and columns.
print(d2.T)

# Sort by a column; ascending is the default, ascending=False reverses it.
print(d2.sort_values(by="views", ascending=False))

# Slicing and selection.
print(d2[:1])                     # a slice selects rows; a bare d2[0] is not a row lookup
print(d2["views"])                # one column by label
print(d2.views)                   # attribute access, equivalent to the line above
print(d2[["views", "comments"]])  # several columns by label

# Index-style row access:
# - iloc selects rows by position
# - loc selects rows by label
print(d2.iloc[0])
print(d2.iloc[-1:])
# views       0
# loves       1
# comments    2
# tranfers    3
# Name: A, dtype: int64
#    views  loves  comments  tranfers
# B      4      5         6         7

print(d2.loc["A"])
# views       0
# loves       1
# comments    2
# tranfers    3
# Name: A, dtype: int64

# Overwrite a whole row. NaN is a float, so the integer columns are cast
# to float explicitly first; recent pandas releases deprecate relying on
# a silent int -> float upcast during assignment.
d2 = d2.astype(float)
d2.loc["A"] = np.nan
print(d2)
#    views  loves  comments  tranfers
# A    NaN    NaN       NaN       NaN
# B    4.0    5.0       6.0       7.0

文件的写入与读取

# csv文件的写入
df = pd.DataFrame(
    {'province': ['陕西', '陕西', '四川', '四川', '陕西'],
     'city': ['咸阳', '宝鸡', '成都', '成都', '宝鸡'],
     'count1': [1, 2, 3, 4, 5],
     'count2': [1, 2, 33, 4, 5]
     }
)

df.to_csv('doc/csvFile.csv')
print("csv文件保存成功")

# csv文件的读取
df2 = pd.read_csv('doc/csvFile.csv')
print(df2)

# excel文件的写入
df2.to_excel("/tmp/excelFile.xlsx", sheet_name="省份统计")
print("excel文件保存成功")

groupby

pandas提供了一个灵活高效的groupby功能，
1). 它使你能以一种自然的方式对数据集进行切片、切块、摘要等操作。
2). 根据一个或多个键（可以是函数、数组或DataFrame列名）拆分pandas对象。
3). 计算分组摘要统计，如计数、平均值、标准差，或用户自定义函数。

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Sample data: two count columns recorded per (province, city) row.
records = {
    'province': ['陕西', '陕西', '四川', '四川', '陕西'],
    'city': ['咸阳', '宝鸡', '成都', '成都', '宝鸡'],
    'count1': [1, 2, 3, 4, 5],
    'count2': [1, 2, 33, 4, 5],
}
df = pd.DataFrame(records)
print(df)

# Group the count1 column by the values of one key column (province)
# and summarise each group.
count1 = df['count1']
grouped = count1.groupby(df['province'])
for summary in (grouped.describe(), grouped.median()):
    print(summary)

# Largest count1 value per city.
grouped = count1.groupby(df['city'])
print(grouped.max())

# Group by several keys at once: province first, then city.
group_keys = [df['province'], df['city']]
grouped = count1.groupby(group_keys)
print(grouped)
for aggregate in (grouped.max, grouped.sum, grouped.count):
    print(aggregate())

# unstack() turns the inner level of the hierarchical index (city)
# into columns.
max_by_province_and_city = grouped.max().unstack()
print(max_by_province_and_city)

pandas模块--Series数据与DataFrame数据