《封号码罗》数据分析与人工智能之pandas（三）

第一部分

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from matplotlib import pyplot
import time

# 一、Series是一种类似于一维数组的对象，由下面两部分组成：
#               values：一组数据（ndarray类型）
#               index：相关的数据索引标签
# Build a Series from an ndarray: ten random integers labelled with the
# letters of "abcdefhijk" (note there is no "g") and named "Python".
s1 = pd.Series(
    np.random.randint(0, 150, size=10),
    index=["a", "b", "c", "d", "e", "f", "h", "i", "j", "k"],
    name="Python",
)
# Build Series from dicts: the keys become the index labels.
# type(s2) is <class 'pandas.core.series.Series'>.
s2 = pd.Series(dict(A=110, B=119, C=120, D=130, E=150), name="s2")
s3 = pd.Series(dict(A=89, B=128, C=135, D=148), name="s3")

# 索引和切片
# 显式索引：使用index或者字典的key作为索引值；          使用.loc["A"]推荐
# 隐式索引：使用0，1...                                 使用.iloc[0,1,2...]
# 切片 s2["A":"D"]
# s2.shape  (5,)    返回形状
# s2.size   5   返回长度，元素个数
# s2.index  Index(['A', 'B', 'C', 'D', 'E'], dtype='object')
# s2.values     [110 119 120 130 150]   series的值是ndarray
# s2.head()     s2.tail()   默认头部五个数据  尾部五个数据
# s2.isnull()    不是空值就返回False
# s2.notnull()      不是空值就返回True

# Series运算  Series是numpy的升级版，numpy有的series也有
# s2.add(10)    广播机制 全部加10
# s2["A"] = np.nan  在运算时空值填充为0（NumPy 2.0 起已移除 np.NaN 别名，统一写作 np.nan）
# s2.add(10, fill_value=0)
# s2.astype(int)    修改数据类型
# s2.value_counts() 统计数值出现的次数
# s2.var()  方差
# s2.std()  标准差
# s2 + s3 如果索引没有一一对应，则自动用NaN补充不对应的那一行
# Time the index-aligned addition of s2 and s3 and print the result.
start = time.perf_counter()

print(s2 + s3)

end = time.perf_counter()
# Elapsed time is end minus start, so the reported duration is non-negative.
elapsed = end - start
print(f"程序运行耗时为{round(elapsed, 20)}")

第二部分

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from matplotlib import pyplot
import time

# df: built from a dict, one random score array per subject. The dict keys
# become the column labels, the values become the column data, and `index`
# supplies the row labels.
df = pd.DataFrame(
    {subject: np.random.randint(0, 150, size=5) for subject in ("Python", "Math", "English")},
    index=list("ABCDE"),
)
# DataFrame attributes: values, columns, index, shape
# df.shape (5, 3)

# df2: built from a single 5x4 random array with explicit row and column labels.
df2 = pd.DataFrame(
    np.random.randint(0, 150, size=(5, 4)),
    index=list("ABCDE"),
    columns=["Python", "Math", "English", "Physic"],
)
# df2.columns       Index(['Python', 'Math', 'English', 'Physic'], dtype='object')

# df.to_csv("./data.txt")
# pd.show_versions()显示pandas支持的库 openpyxl安装了才能支持xlsx
# df.to_excel("./data.xlsx")
# dd1 = pd.read_csv("./data.txt")     # 读取数据
# dd2 = dd1.rename(mapper={"Unnamed: 0": "index"}, axis=1)    # 替换某个列索引的名字
# dd3 = dd2.set_index(keys="index")   # 设置某个列为行索引
# dd4 = pd.read_excel("./data.xlsx")
# 结构化数据 数据库里面的数据是结构化数据
# 半结构化数据    如json数据
# 非结构化数据    如网页中杂乱无章的数据是非结构化数据 爬虫提取非结构化数据转为半结构化或者结构化

# DataFrame的索引：字典方式 属性的方式
# 可以将DataFrame的列获取为一个Series，返回的Series拥有原DataFrame相同的索引，且name属性也已经设置好了，就是对应的列名
# df2[["Python", "English"]] # 这都是对列进行索引

# 对行进行索引两种方法  使用.loc["index"]   使用.iloc[0]   （.ix[]已在pandas 1.0中移除，不要再使用）
# df2.loc[["A", "B"]]   df2.iloc[[0,1]]

# 对元素进行索引
# df2["Python"]["A"]    df2["Python"][["A","B"]]    # 先找列
# df2.loc["C"]["Python"]  ==  df2.loc["C", "Python"] # 先找行 不支持df2.loc["Python", "C"]

# 切片
# 直接使用中括号时：索引表示的是列索引，切片表示的行切片
# df2["A":"D"]["Python"]切片后再取某一列  不能使用df2["Python":"English"]做列切片
# df2.loc["A":"D"] == df2.iloc[0:4]
# 列切片有两种方式 df2.iloc[:, 1:3] 或 df2.loc[:, "Math":"English"]  冒号表示选取所有行

# DataFrame的运算  也有广播机制
# df2 - 10    df2.pow(2)幂运算   df2.divide(3).round(2)除以3并保留2位小数
# df2.cov() 协方差
# df2.corr() 相关性系数 -1~1之间（绝对值越接近1相关性越强）    可以筛选出不重要的一些属性
# df2.var() 方差  越小说明数据之间的大小波动越平稳
# df2.info() DataFrame的数据信息
# df2.describe()    数据的描述
# axis =0 ----- index
# axis =1 ----- columns

# Time how long printing df2 takes.
start = time.perf_counter()
print(df2)
end = time.perf_counter()
# Elapsed time is end minus start, so the reported duration is non-negative.
elapsed = end - start
print(f"程序运行耗时为{round(elapsed, 20)}")

第三部分

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import time

# Missing-data handling
# Build a 150x3 frame of random scores. It is stored as float because NaN is
# a float value: writing a missing value into an integer column forces an
# implicit upcast, which pandas >= 2.1 deprecates.
df = DataFrame(
    np.random.randint(0, 150, size=(150, 3)),
    columns=["Python", "Math", "En"],
).astype(float)
for _ in range(15):
    # Pick a random cell (row position first, then column position) and blank
    # it out. Positions may repeat, so at most 15 cells end up missing.
    index = np.random.randint(0, 150)
    column = np.random.randint(0, 3)
    df.iloc[index, column] = np.nan

cond1 = df.isnull().any(axis=1)  # True for rows that contain at least one missing value
# print(df[cond1])  # all rows with missing data

cond2 = df.notnull().all(axis=1)  # True for rows with no missing values
# print(df[cond2])                    # all rows without missing data

# df.dropna()  # 删除空数据行
# 方式一
# df.drop(labels=["En"], axis=1)    # 默认删除行，要删除列时使用axis=1
# df.drop(labels=[5,8])     # 删除第五行和第八行的数据
# 方式二
# cond3 = df.isnull().any(axis=1)
# index = df[cond3].index
# df.drop(labels=index)
# 删除任意一科成绩小于80的行
# cond4 = (df < 80).any(axis=1)
# index2 = df[cond4].index
# df.drop(labels=index2)
# 平均分小于90的过滤
# cond5 = df.mean(axis=1) < 90
# index3 = df[cond5].index
# dd = df.drop(labels=index3)
# cond = cond1 | cond2   还可以多个条件一起使用

# Small 5x3 frame of random scores used for the fillna examples. It is stored
# as float so that NaN can be written without an implicit dtype upcast.
df2 = DataFrame(
    np.random.randint(0, 150, size=(5, 3)),
    columns=["Python", "Math", "En"],
).astype(float)
# Assign through a single .loc call. Chained indexing (df2["Python"][0] = ...)
# writes to an intermediate object and does not update df2 when pandas
# copy-on-write is active.
df2.loc[0, "Python"] = np.nan
df2.loc[3, "En"] = np.nan
# dd = df2.fillna(60)               # fillna()填充数据
# dd = df2.fillna(value=df2.mean()) # 填充平均值
# dd = df2.fillna(value=df2.median())  # 填充中位数
# df.bfill(axis=0)  # 用后一个有效值回填；df.ffill() 用前一个有效值填充（fillna 的 method 参数 {'backfill', 'bfill', 'pad', 'ffill'} 自 pandas 2.1 起已弃用）

# Time an empty print() call (placeholder for whichever example is being run).
start = time.perf_counter()
print()
end = time.perf_counter()
# Elapsed time is end minus start, so the reported duration is non-negative.
elapsed = end - start
print(f"程序运行耗时为{round(elapsed, 20)}")

第四部分

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import time

# Hierarchical (multi-level) indexes.
# Each MultiIndex below is the Cartesian product of its level values, so the
# number of data rows/columns must equal the product of the level lengths.

# Series with a two-level index: 3 students x 2 exams = 6 entries.
s = pd.Series(
    np.random.randint(0, 150, size=6),
    index=pd.MultiIndex.from_product([["A", "B", "C"], ["期中", "期末"]]),
)

# DataFrame with a two-level row index: 5 students x 2 exams = 10 rows.
df2 = pd.DataFrame(
    np.random.randint(0, 150, size=(10, 3)),
    columns=["Python", "Math", "En"],
    index=pd.MultiIndex.from_product([list("ABCDE"), ["期中", "期末"]]),
)

# DataFrame with a three-level row index: 5 * 2 * 2 = 20 rows. The row count
# in `size` has to match, otherwise construction fails with a length mismatch.
df3 = pd.DataFrame(
    np.random.randint(0, 150, size=(20, 3)),
    columns=["Python", "Math", "En"],
    index=pd.MultiIndex.from_product(
        [list("ABCDE"), ["期中", "期末"], ["模拟一", "模拟二"]]
    ),
)

# DataFrame with a two-level column index: 5 rows, 3 subjects x 2 exams = 6 columns.
df4 = pd.DataFrame(
    np.random.randint(0, 150, size=(5, 6)),
    columns=pd.MultiIndex.from_product([["Python", "Math", "En"], ["期中", "期末"]]),
    index=list("ABCDE"),
)

# 多层索引DataFrame的索引和切片
# 获取行和列，与之前的规则类似
# dd = df3.loc["A", "期末", "模拟二"]["Python"]    # 获取元素

# 聚合操作              level的值是指定哪一层操作保留
# dd = df3.groupby(level=2).mean()    （df3.mean(axis=0, level=2) 的 level 参数已在 pandas 2.0 中移除）

# 索引的堆栈 stack
# stack() 列变行   unstack()行变列    dd = df3.unstack().unstack().unstack()
# dd = df3.unstack(level=2)     这个是默认状态，可以指定行变成列的是哪一层
# dd = df3.unstack(level=[1, 2])    还可以是一个列表
# unstack() moves row-index levels into the columns. Here levels 1 and 2 of
# df3's three-level row index are moved, leaving level 0 as the row index.
# (A multi-level Series can be turned into a DataFrame the same way.)
dd = df3.unstack(level=[1, 2])

# 数据聚合（重点）
# 通常是要使每一个数组生成一个单一的数值
# 数据分类处理：
#      分组---先把数据分为几组
#      用函数处理---为不同组的数据应用不同的函数以转换数据
#      合并---把不同组得到的结果合并起来
# 数据分类处理的核心：groupby()函数
# Synthetic sales records: 100 rows of random numeric codes. The "item",
# "sailer" and "weight" columns are bucketed into labels further down.
df5 = pd.DataFrame(
    dict(
        item=np.random.randint(0, 10, size=100),
        sailer=np.random.randint(0, 10, size=100),
        weight=np.random.randint(30, 300, size=100),
        price=np.random.randint(1, 20, size=100),
    )
)


def convert_item(x):
    """Map a numeric item code to a vegetable name.

    Codes below 2 give "萝卜", codes below 7 give "白菜", and every other
    value gives "青椒".
    """
    if x < 2:
        return "萝卜"
    if x < 7:
        return "白菜"
    return "青椒"


# Replace each numeric item code with its vegetable name, element by element.
df5["item"] = df5["item"].map(convert_item)


def convert_sailer(x):
    """Map a numeric seller code to a seller name.

    Codes below 5 give "郭靖", codes below 7 give "黄蓉", and every other
    value gives "欧阳克".
    """
    if x < 5:
        return "郭靖"
    if x < 7:
        return "黄蓉"
    return "欧阳克"


# Replace each numeric seller code with the seller's name, element by element.
df5["sailer"] = df5["sailer"].map(convert_sailer)


def convert_weight(x):
    """Bucket a raw weight into one of three representative values.

    Weights below 80 become 50, weights below 180 become 150, and every
    other value becomes 220.
    """
    if x < 80:
        return 50
    if x < 180:
        return 150
    return 220


# Replace each raw weight with its bucket value (50, 150 or 220).
df5["weight"] = df5["weight"].map(convert_weight)
# Goal: how much vegetable weight each of the three sellers sold this month.
ret = df5.groupby(["sailer"])["weight"]  # grouped by seller only; nothing is computed yet


# ret = ret.sum()  # ==ret.apply(np.sum)
# ret = ret.count() 算次数  # ==ret.apply(np.size)
# 自定义分组聚合
def analysis(x):
    """Summarise one group of values.

    Returns a 4-tuple: (number of elements, sum, mean rounded to two
    decimals, minimum).
    """
    count = x.size
    total = x.sum()
    average = np.round(x.mean(), decimals=2)
    smallest = x.min()
    return count, total, average, smallest


# Apply the custom summary to each seller's weights; every group yields one tuple.
ret = ret.apply(analysis)

# Group by seller and by vegetable at the same time, then total the weight
# sold and average the price, rounded to two decimals.
# (Calling .mean() on the grouped frame instead would average the columns.)
ret2 = (
    df5.groupby(["sailer", "item"])
    .agg({"weight": "sum", "price": "mean"})
    .round(2)
)

# Time how long printing the grouped result takes.
start = time.perf_counter()
print(ret2)
end = time.perf_counter()
# Elapsed time is end minus start, so the reported duration is non-negative.
elapsed = end - start
print(f"程序运行耗时为{round(elapsed, 20)}")

第五部分

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import time

# 数据的拼接操作
# 级联：pd.concat   （DataFrame.append 已在 pandas 2.0 中移除，统一使用 pd.concat）
# 合并：pd.merge / DataFrame.merge    DataFrame.join

# numpy级联回顾
# nd1 = np.random.randint(0, 10, size=(4, 5))
# nd2 = np.random.randint(0, 10, size=(7, 5))
# nd3 = np.random.randint(0, 10, size=(4, 8))
#
# dd1 = np.concatenate([nd1, nd2])
# dd2 = np.concatenate([nd1, nd3], axis=1)

# Four 5x3 frames of random scores for the concat/merge examples:
#   df1, df2 - identical row and column labels
#   df3      - completely different row and column labels
#   df4      - partially overlapping row and column labels
df1 = pd.DataFrame(
    np.random.randint(0, 150, size=(5, 3)),
    index=list("ABCDE"),
    columns=["Python", "Math", "En"],
)
df2 = pd.DataFrame(
    np.random.randint(0, 150, size=(5, 3)),
    index=list("ABCDE"),
    columns=["Python", "Math", "En"],
)
df3 = pd.DataFrame(
    np.random.randint(0, 150, size=(5, 3)),
    index=list("QWRTY"),
    columns=["Chinese", "Physic", "Chem"],
)
df4 = pd.DataFrame(
    np.random.randint(0, 150, size=(5, 3)),
    index=list("ABCFG"),
    columns=["Python", "Math", "Java"],
)

# dd = pd.concat([df1, df2])    # 一般级联都是行方向进行级联，不同表，可以级联必然是属性相同
# dd = pd.concat([df1, df3], axis=1).fillna(0)
# dd = pd.concat([df1, df4], join="inner")    # join默认是outer保存所有索引 inner保留共同索引
# dd = pd.concat([df1, df4]).reindex(columns=df1.columns)   # join_axes 参数已在 pandas 1.0 中移除；改用 reindex 指定以某个数据集的列索引为准
# dd = pd.concat([df1, df4], ignore_index=True)   # 忽略原始索引，用0，1，2，...
# dd = pd.concat([df1, df4], ignore_index=True)   # 忽略原始索引，用0，1，2，...
# dd = pd.concat([df1, df4], keys=["期中", "期末"])  # 定义数据的所属情况
# ds = dd.unstack(level=0).stack()    # 改变结构
# dd = pd.concat([df1, df2])  # 末尾追加；DataFrame.append() 已在 pandas 2.0 中移除，改用 pd.concat

# merge() joins two frames on shared attribute values (for example age or province).
# NOTE(review): no `on=` is given, so this presumably joins on every column the
# two frames have in common; with random data the result may well be empty.
# Confirm that is the intended demonstration.
dd = df1.merge(df2)

# Time how long printing the merged frame takes.
start = time.perf_counter()
print(dd)
end = time.perf_counter()
# Elapsed time is end minus start, so the reported duration is non-negative.
elapsed = end - start
print(f"程序运行耗时为{round(elapsed, 20)}")

第六部分

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import time

# 数据的拼接操作
# 级联：pd.concat   （DataFrame.append 已在 pandas 2.0 中移除，统一使用 pd.concat）
# 合并：pd.merge / DataFrame.merge    DataFrame.join
# merge()融合，根据共同属性值进行融合 如统计省份的人口

# 一对一融合
# df1 = DataFrame({"id": [1, 10, 1024], "name": ["po", "alis", "michael"], "sex": ["male", "female", "male"]})
# df2 = DataFrame({"id": [1, 11, 1025], "salary": [1000, 2000, 3000], "age": [20, 30, 40]})
# dd = df1.merge(df2)

# 多对一的融合
# df3 = DataFrame({"id": [1, 10, 1024], "name": ["po", "alis", "michael"], "sex": ["male", "female", "male"]})
# df4 = DataFrame({"id": [1, 1, 1], "salary": [1000, 2000, 3000], "age": [20, 30, 40]})
# dd = df3.merge(df4)

# 多对多的融合
# df5 = DataFrame({"id": [1, 1, 1], "name": ["po", "alis", "michael"], "sex": ["male", "female", "male"]})
# df6 = DataFrame({"id": [1, 1, 1], "salary": [1000, 2000, 3000], "age": [20, 30, 40]})
# dd = df5.merge(df6)

# key的规范化  使用on显式指定哪一列为key，当有多个同名列时使用
# Frames for the key-selection examples:
#   df7 and df8 share the columns "id" and "name"
#   df9 names its key column "Id" (capital I), so it shares no column with df7
df7 = pd.DataFrame(
    dict(
        id=[1, 10, 1024],
        name=["po", "alisa", "Michael"],
        sex=["male", "female", "male"],
    )
)
df8 = pd.DataFrame(
    dict(
        id=[1, 11, 1025],
        name=["softpo", "alisa", "Michael"],
        age=[20, 30, 40],
    )
)
df9 = pd.DataFrame(
    dict(
        Id=[1, 11, 1025],
        salary=[1000, 2000, 3000],
        age=[20, 30, 40],
    )
)
# dd = df7.merge(df8, on="id")  # 有相同列时的情况
# dd = df7.merge(df9, left_on="id", right_on="Id")  # 没有相同列时的情况，可以指明哪两个列合并

# 内合并与外合并   内合并只保留两者都有的key（默认模式）    how是指定如何合并，可查看源码，还有左合并等等
# dd = df7.merge(df9, how="outer", left_on="id", right_on="Id")   # 无论列名是否对齐都合并

# 列冲突的解决  使用on解决，也可以使用suffixes=自己指定的后缀
# dd = df7.merge(df8, on="id", suffixes=("_A", "_B"))     # 原来是_x,_y

# Small merge example: compute each row's total score, then attach it to the
# original frame by joining on the row index of both sides.
df10 = DataFrame(np.random.randint(0, 150, size=(5, 3)), columns=["Python", "Math", "En"], index=list("ABCDE"))
df11 = DataFrame(df10.sum(axis=1), columns=["sum"])
dd = pd.merge(df10, df11, left_index=True, right_index=True)

# Time how long printing the merged frame takes.
start = time.perf_counter()
print(dd)
end = time.perf_counter()
# Elapsed time is end minus start, so the reported duration is non-negative.
elapsed = end - start
print(f"程序运行耗时为{round(elapsed, 20)}")

第七部分

# 美国人口数据分析

# 查询操作
#  df1 = df.query("列名 == 列值 and 列名 == '列值'")

# 去重
# df["列名"].unique()    # DataFrame 本身没有 unique 方法；按行去重用 df.drop_duplicates()

# 全局设置，float保留两位或自定义几位小数
# pd.set_option('display.float_format',lambda x:'%0.2f'%(x))

Python 键盘上的舞者

发布了30 篇原创文章 · 获赞 5 · 访问量 3322

私信关注

《封号码罗》数据分析与人工智能之pandas（三）

第一部分

第二部分

第三部分

第四部分

第五部分

第六部分

第七部分

猜你喜欢