基于pandas数据分析

import pandas as pd
import numpy as np


# 创建一个空的系列
s = pd.Series()
# 从ndarray创建一个Series
data = np.array(['张三','李四','王五','赵柳'])
s = pd.Series(data)
s = pd.Series(data,index=['100','101','102','103'])
# 从字典创建一个Series
data = {'100' : '张三', '101' : '李四', '102' : '王五'}
s = pd.Series(data)
# 从标量创建一个Series
s = pd.Series(5, index=[0, 1, 2, 3])

访问Series中的数据：

# 使用索引检索元素
s = pd.Series([1,2,3,4,5],index = ['a','b','c','d','e'])
print(s[0], s[:3], s[-3:])
# 使用标签检索数据
print(s['a'], s[['a','c','d']])

Series常用属性：

s1.values
s1.index
s1.dtype
s1.size
s1.ndim
s1.shape

pandas日期类型数据处理

# pandas识别的日期字符串格式
dates = pd.Series(['2011', '2011-02', '2011-03-01', '2011/04/01', '2011/05/01 01:01:01', '01 Jun 2011'])
# to_datetime() 转换日期数据类型
dates = pd.to_datetime(dates)
print(dates, dates.dtype, type(dates))
# 获取时间的某个日历字段的数值
print(dates.dt.day)

Series.dt提供了很多日期相关操作，如下：

Series.dt.year  The year of the datetime.
Series.dt.month The month as January=1, December=12.
Series.dt.day   The days of the datetime.
Series.dt.hour  The hours of the datetime.
Series.dt.minute    The minutes of the datetime.
Series.dt.second    The seconds of the datetime.
Series.dt.microsecond   The microseconds of the datetime.
Series.dt.week  The week ordinal of the year.
Series.dt.weekofyear    The week ordinal of the year.
Series.dt.dayofweek The day of the week with Monday=0, Sunday=6.
Series.dt.weekday   The day of the week with Monday=0, Sunday=6.
Series.dt.dayofyear The ordinal day of the year.
Series.dt.quarter   The quarter of the date.
Series.dt.is_month_start    Indicates whether the date is the first day of the month.
Series.dt.is_month_end  Indicates whether the date is the last day of the month.
Series.dt.is_quarter_start  Indicator for whether the date is the first day of a quarter.
Series.dt.is_quarter_end    Indicator for whether the date is the last day of a quarter.
Series.dt.is_year_start Indicate whether the date is the first day of a year.
Series.dt.is_year_end   Indicate whether the date is the last day of the year.
Series.dt.is_leap_year  Boolean indicator if the date belongs to a leap year.
Series.dt.days_in_month The number of days in the month.

日期运算：

# datetime日期运算
delta = dates - pd.to_datetime('1970-01-01')
print(delta, delta.dtype, type(delta))
# 把时间偏移量换算成天数
print(delta.dt.days)

通过指定周期和频率，使用date_range()函数就可以创建日期序列。默认情况下，频率是'D'。

import pandas as pd
# 以日为频率
datelist = pd.date_range('2019/08/21', periods=5)
print(datelist)
# 以月为频率
datelist = pd.date_range('2019/08/21', periods=5,freq='M')
print(datelist)
# 构建某个区间的时间序列
start = pd.datetime(2017, 11, 1)
end = pd.datetime(2017, 11, 5)
dates = pd.date_range(start, end)
print(dates)

bdate_range()用来表示商业日期范围，不同于date_range()，它不包括星期六和星期天。

import pandas as pd
datelist = pd.bdate_range('2011/11/03', periods=5)
print(datelist)

DataFrame

DataFrame是一个类似于表格的数据类型，可以理解为一个二维数组，索引有两个维度，可更改。DataFrame具有以下特点：

列可以是不同的类型
大小可变
标记轴(行和列)
针对行与列进行轴向统计

import pandas as pd

# 创建一个空的DataFrame
df = pd.DataFrame()
print(df)

# 从列表创建DataFrame
data = [1,2,3,4,5]
df = pd.DataFrame(data)
print(df)
data = [['Alex',10],['Bob',12],['Clarke',13]]
df = pd.DataFrame(data,columns=['Name','Age'])
print(df)
data = [['Alex',10],['Bob',12],['Clarke',13]]
df = pd.DataFrame(data,columns=['Name','Age'],dtype=float)
print(df)
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)
print(df)

# 从字典来创建DataFrame
data = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'],'Age':[28,34,29,42]}
df = pd.DataFrame(data, index=['s1','s2','s3','s4'])
print(df)
data = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
        'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(data)
print(df)

DataFrame常用属性

编号	属性或方法	描述
1	`axes`	返回行/列标签（index）列表。
2	`columns`	返回列标签
3	`index`	返回行标签
4	`dtype`	返回对象的数据类型(`dtype`)。
5	`empty`	如果系列为空，则返回`True`。
6	`ndim`	返回底层数据的维数，默认定义：`1`。
7	`size`	返回基础数据中的元素数。
8	`values`	将系列作为`ndarray`返回。
9	`head(n)`	返回前`n`行。
10	`tail(n)`	返回最后`n`行。

实例代码：

import pandas as pd

data = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'],'Age':[28,34,29,42]}
df = pd.DataFrame(data, index=['s1','s2','s3','s4'])
df['score']=pd.Series([90, 80, 70, 60], index=['s1','s2','s3','s4'])
print(df)
print(df.axes)
print(df['Age'].dtype)
print(df.empty)   
print(df.ndim)
print(df.size)
print(df.values)
print(df.head(3)) # df的前三行
print(df.tail(3)) # df的后三行

核心数据结构操作

列访问

DataFrame的单列数据为一个Series。根据DataFrame的定义可以知晓DataFrame是一个带有标签的二维数组，每个标签相当每一列的列名。

d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
     'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']), 
     'three' : pd.Series([1, 3, 4], index=['a', 'c', 'd'])}

df = pd.DataFrame(d)
print(df[df.columns[:2]])

列添加

DataFrame添加一列的方法非常简单，只需要新建一个列索引。并对该索引下的数据进行赋值操作即可。

import pandas as pd

df['four']=pd.Series([90, 80, 70, 60], index=['a', 'b', 'c', 'd'])
print(df)

列删除

删除某列数据需要用到pandas提供的方法pop，pop方法的用法如下：

import pan das as pd

d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']), 
     'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']), 
     'three' : pd.Series([10, 20, 30], index=['a', 'b', 'c'])}
df = pd.DataFrame(d)
print("dataframe is:")
print(df)

#推荐使用这个方法，可以删除多列 axis=1（每一行的后两个字段，其实就是后两列）
print(df.drop(['three', 'four'], axis=1))

# 删除一列： one
del(df['one'])
print(df)

#调用pop方法删除一列
df.pop('two')
print(df)

行访问

如果只是需要访问DataFrame某几行数据的实现方式则采用数组的选取方式，使用 ":" 即可：

import pandas as pd

d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']), 
    'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print(df[2:4])

loc是针对DataFrame索引名称的切片方法。loc方法使用方法如下：

import pandas as pd

d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']), 
     'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print(df.loc['b'])
print(df.loc[['a', 'b']])

iloc和loc区别是iloc接收的必须是行索引和列索引的位置。iloc方法的使用方法如下：

import pandas as pd

d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
     'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print(df.iloc[2])
print(df.iloc[[2, 3]])

行添加

import pandas as pd

df = pd.DataFrame([['zs', 12], ['ls', 4]], columns = ['Name','Age'])
df2 = pd.DataFrame([['ww', 16], ['zl', 8]], columns = ['Name','Age'])

df = df.append(df2)
print(df)

行删除

使用索引标签从DataFrame中删除或删除行。如果标签重复，则会删除多行。

import pandas as pd

df = pd.DataFrame([['zs', 12], ['ls', 4]], columns = ['Name','Age'])
df2 = pd.DataFrame([['ww', 16], ['zl', 8]], columns = ['Name','Age'])
df = df.append(df2)
# 删除index为0的行 默认就是删除行axis=0（沿着列的方向，其实就是删除行）
df = df.drop(0) 
print(df)
# 删除index为0、index为'd'的两行
print(df.drop([0, 'd'], axis=0))

修改DataFrame中的数据

更改DataFrame中的数据，原理是将这部分数据提取出来，重新赋值为新的数据。

import pandas as pd

df = pd.DataFrame([['zs', 12], ['ls', 4]], columns = ['Name','Age'])
df2 = pd.DataFrame([['ww', 16], ['zl', 8]], columns = ['Name','Age'])
df = df.append(df2)
df['Name'][0] = 'Tom'
print(df)

复合索引

DataFrame的行级索引与列级索引都可以设置为复合索引，表示从不同的角度记录数据。

90 80 70 90 80 77 78 79 69 78 79 70

# 生成一组（6，3）6行3列的随机数，服从期望（概率最高）值为5，标准差为3的正态分布（normal）。

data = np.floor(np.random.normal(85, 3, (6,3)))

df = pd.DataFrame(data)

index = [('classA', 'F'), ('classA', 'M'), ('classB', 'F'), ('classB', 'M'), ('classC', 'F'), ('classC', 'M')]

df.index = pd.MultiIndex.from_tuples(index)

columns = [('Age', '20+'), ('Age', '30+'), ('Age', '40+')]

df.columns = pd.MultiIndex.from_tuples(columns)

复合索引的访问：

# 访问行
df.loc['classA']
df.loc['classA', 'F']
df.loc[['classA', 'classC']]

# 访问列
df.Age
df.Age['20+']
df['Age']
df['Age', '20+']

2020-08-30 基于pandas数据分析

基于pandas数据分析

猜你喜欢