【python数据处理】matplotlib

最近学习python与数据处理时，发现数据总是很生硬，所以我便开始学习python一个强大的库matplotlib，可以有效将数据转化为直观的图形。

1.一些公共的方法

标签控制 Modify Ticks 调整横纵坐标值

注意要修改坐标名为sting类型时要先使用plt.subplot

ax = plt.subplot()
ax.set_xticks(months)
ax.set_xticklabels(month_names)
ax.set_yticks([0.10, 0.25, 0.5, 0.75])
ax.set_yticklabels(["10%", "25%", "50%", "75%"])

ax2 = plt.subplot()
ax2.set_xticks(range(len(drinks)))
ax2.set_xticklabels(drinks)
plt.show()

2.标题顾名思义

plt.xlabel("Time")
plt.ylabel("Dollars spent on coffee")
plt.title("My Last Twelve Years of Coffee Drinking")

3.plt.legend(legend_labels, loc=8) 标签给每一条线标个名字方便查看

Number Code：String：0.best 1.upper right 2.upper left 3.lower left 4.lower right 5.right 6.center left 7.center right 8.lower center 9.upper center 10.center

4.保存及大小设定

plt.close('all')防止一些未显示的图线捣乱

plt.figure(figsize=(width,height)) 设置大小

plt.savefig('power_generated.png') 保存

import codecademylib
from matplotlib import pyplot as plt

word_length = [8, 11, 12, 11, 13, 12, 9, 9, 7, 9]
power_generated = [753.9, 768.8, 780.1, 763.7, 788.5, 782, 787.2, 806.4, 806.2, 798.9]
years = [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009]

plt.close('all')
plt.figure()
plt.plot(years, word_length)
plt.savefig('winning_word_lengths.png')
plt.figure(figsize=(7, 3))
plt.plot(years, power_generated)
plt.savefig('power_generated.png')

5.plt.axis

显示特定区间可以理解为放大或缩小

plt.axis([x-min, x-max, y-min, y-max])显示这一段线

6.plt.subplot

用来创建多个图像和用来修改x，y轴坐标使用ax

子图

`plt.subplot()`需要传递三个参数：

子图的行数

子图的列数

子图索引

子图间隔控制

plt.subplots_adjust(wspace=0.35, bottom=0.2)

plt.subplot(2, 1, 1)
plt.plot(x, straight_line)

# Subplot 2
plt.subplot(2, 2, 3)
plt.plot(x, parabola)

# Subplot 3
plt.subplot(2, 2, 4)
plt.plot(x, cubic)

plt.subplots_adjust(wspace=0.35, bottom=0.2)

上下两个subplot如果前面的传入值一样会在一起共用plt.show(),如果修改了figsize会变成两张图,独立使用plt.show()

下方就是分别打印

import codecademylib3_seaborn
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

# Bar Graph: Featured Games

games = ["LoL", "Dota 2", "CS:GO", "DayZ", "HOS", "Isaac", "Shows", "Hearth", "WoT", "Agar.io"]

viewers =  [1070, 472, 302, 239, 210, 171, 170, 90, 86, 71]

ax=plt.subplot()
plt.figure(figsize=(5,5))

plt.bar(range(len(games)),viewers,color='slateblue')
ax.set_xticks(range(len(games)))
ax.set_xticklabels(games, rotation=30)

plt.title('Each game viewers')
plt.legend(["Twitch"])
plt.xlabel('Games')
plt.ylabel('Viewers')

plt.show()

# Pie Chart: League of Legends Viewers' Whereabouts

labels = ["US", "DE", "CA", "N/A", "GB", "TR", "BR", "DK", "PL", "BE", "NL", "Others"]

ax2=plt.subplot()
plt.figure(figsize=(5,5))
countries = [447, 66, 64, 49, 45, 28, 25, 20, 19, 17, 17, 279]
plt.pie(countries,labels=labels)
plt.axis('equal')
plt.show()

折线图

1. plt.plot(x, y) 画一条x为横坐标y为纵坐标的直线

可以画多条

2.样式控制

color 颜色，

linstyle='--' 表示线条是虚线，':' 表示是虚线不过是点点点样子，''看不见没有线条

marker='s' 关键节点是square 方块 , 'o'表示圆 , '*'表示星号

plt.plot(time, revenue, color="purple", linestyle='--') 紫色虚线
plt.plot(time, costs, color="#82edc9", marker='s') 蓝色方块结点线

3.Fill Between 类似于barchart 的error bar

import codecademylib
from matplotlib import pyplot as plt

months = range(12)
month_names = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
revenue = [16000, 14000, 17500, 19500, 21500, 21500, 22000, 23000, 20000, 19500, 18000, 16500]

plt.plot(months, revenue)

ax = plt.subplot()
ax.set_xticks(months)
ax.set_xticklabels(month_names)

y_upper = [i + (i*0.10) for i in revenue]
y_lower = [i - (i*0.10) for i in revenue]

plt.fill_between(months, y_lower, y_upper, alpha=0.2)

plt.show()

import codecademylib3_seaborn
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

hour = range(24)

viewers_hour = [30, 17, 34, 29, 19, 14, 3, 2, 4, 9, 5, 48, 62, 58, 40, 51, 69, 55, 76, 81, 102, 120, 71, 63]

plt.title("Codecademy Learners Time Series")

plt.xlabel("Hour")
plt.ylabel("Viewers")

plt.plot(hour, viewers_hour)

plt.legend(['2015-01-01'])

ax = plt.subplot()

ax.set_facecolor('seashell')

ax.set_xticks(hour)
ax.set_yticks([0, 20, 40, 60, 80, 100, 120])

y_upper = [i + (i*0.15) for i in viewers_hour]
y_lower = [i - (i*0.15) for i in viewers_hour]

plt.fill_between(hour, y_lower, y_upper, alpha=0.2)

# Add the code here:
plt.show()

条形图

1.相对于折线图画图为 plt.bar(x,y) 一般x可以用range(len(y))表示

import codecademylib
from matplotlib import pyplot as plt

drinks = ["cappuccino", "latte", "chai", "americano", "mocha", "espresso"]
sales =  [91, 76, 56, 66, 52, 27]

plt.bar(range(len(sales)), sales)
plt.bar(len(sales),sales)
plt.show()

2.Side-By-Side bar

import codecademylib
from matplotlib import pyplot as plt

drinks = ["cappuccino", "latte", "chai", "americano", "mocha", "espresso"]
sales1 = [91, 76, 56, 66, 52, 27]
sales2 = [65, 82, 36, 68, 38, 40]

#Paste the x_values code here
n = 1  # This is our first dataset (out of 2)
t = 2 # Number of dataset
d = 6 # Number of sets of bars
w = 0.8 # Width of each bar
store1_x = [t*element + w*n for element
             in range(d)]

plt.bar(store1_x, sales1)
#Paste the x_values code here
n = 2  # This is our second dataset (out of 2)
t = 2 # Number of dataset
d = 6 # Number of sets of bars
w = 0.8 # Width of each bar
store2_x = [t*element + w*n for element
             in range(d)]

plt.bar(store2_x, sales2)

plt.show()

# 导入绘图模块
import matplotlib.pyplot as plt
import numpy as np
# 构建数据
Y2016 = [15600,12700,11300,4270,3620]
Y2017 = [17400,14800,12000,5200,4020]
labels = ['北京','上海','香港','深圳','广州']
bar_width = 0.35

# 中文乱码的处理
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 绘图
plt.bar(np.arange(5), Y2016, label = '2016', color = 'steelblue', alpha = 0.8, width = bar_width)
plt.bar(np.arange(5)+bar_width, Y2017, label = '2017', color = 'indianred', alpha = 0.8, width = bar_width)
# 添加轴标签
plt.xlabel('Top5城市')
plt.ylabel('家庭数量')
# 添加标题
plt.title('亿万财富家庭数Top5城市分布')
# 添加刻度标签
plt.xticks(np.arange(5)+bar_width/2,labels)
# 设置Y轴的刻度范围
plt.ylim([2500, 19000])

# 为每个条形图添加数值标签
for x2016,y2016 in enumerate(Y2016):
    plt.text(x2016, y2016+100, '%s' %y2016)

for x2017,y2017 in enumerate(Y2017):
    plt.text(x2017+bar_width, y2017+100, '%s' %y2017)
# 显示图例
plt.legend()
# 显示图形
plt.show()

3.Stacked Bars

import codecademylib
from matplotlib import pyplot as plt

drinks = ["cappuccino", "latte", "chai", "americano", "mocha", "espresso"]
sales1 =  [91, 76, 56, 66, 52, 27]
sales2 = [65, 82, 36, 68, 38, 40]
  
plt.bar(range(len(drinks)), sales1)
plt.bar(range(len(drinks)), sales2, bottom=sales1)

plt.legend(["Location 1", "Location 2"])

plt.show()

import codecademylib
from matplotlib import pyplot as plt
import numpy as np

unit_topics = ['Limits', 'Derivatives', 'Integrals', 'Diff Eq', 'Applications']
As = [6, 3, 4, 3, 5]
Bs = [8, 12, 8, 9, 10]
Cs = [13, 12, 15, 13, 14]
Ds = [2, 3, 3, 2, 1]
Fs = [1, 0, 0, 3, 0]

x = range(5)

c_bottom = np.add(As, Bs)
d_bottom = np.add(c_bottom, Cs)
f_bottom = np.add(d_bottom, Ds)
#create d_bottom and f_bottom here

#create your plot here
plt.figure(figsize=(10,8))
plt.bar(x, As)
plt.bar(x, Bs, bottom=As)
plt.bar(x, Cs, bottom=c_bottom)
plt.bar(x, Ds, bottom=d_bottom)
plt.bar(x, Fs, bottom=f_bottom)

ax = plt.subplot()
ax.set_xticks(range(len(unit_topics)))
ax.set_xticklabels(unit_topics)

plt.title('Grade distribution')
plt.xlabel('Unit')
plt.ylabel('Number of Students')
plt.show()
plt.savefig('my_stacked_bar.png')

4.Error bars 允许有偏差值

import codecademylib
from matplotlib import pyplot as plt

drinks = ["cappuccino", "latte", "chai", "americano", "mocha", "espresso"]
ounces_of_milk = [6, 9, 4, 0, 9, 0]
error = [0.6, 0.9, 0.4, 0, 0.9, 0]

# Plot the bar graph here
plt.bar(range(len(drinks)), ounces_of_milk, yerr=error, capsize=5)

plt.show()

中文例子：

# 导入绘图模块
import matplotlib.pyplot as plt
# 构建数据
GDP = [12406.8,13908.57,9386.87,9143.64]

# 中文乱码的处理
#plt.rcParams['font.sans-serif'] =['Microsoft YaHei']
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 绘图                                        颜色和透明值可以修改
plt.bar(range(4), GDP, align = 'center',color='steelblue', alpha = 0.8)
# 添加轴标签
plt.ylabel('GDP')
#plt.xlabel('省市')
# 添加标题
plt.title('四个直辖市GDP大比拼')
# 添加刻度标签
plt.xticks(range(4),['北京市','上海市','天津市','重庆市'])
# 设置Y轴的刻度范围
plt.ylim([5000,15000])

# 为每个条形图添加数值标签
for x,y in enumerate(GDP):
    plt.text(x,y+100,'%s' %round(y,1),ha='center')
    
# 显示图形
plt.show()

# 导入绘图模块
import matplotlib.pyplot as plt
# 构建数据
price = [39.5,39.9,45.4,38.9,33.34]

# 中文乱码的处理
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 绘图
plt.barh(range(5), price, align = 'center',color='steelblue', alpha = 0.8)
# 添加轴标签
plt.xlabel('价格')
# 添加标题
plt.title('不同平台书的最低价比较')
# 添加刻度标签
plt.yticks(range(5),['亚马逊','当当网','中国图书网','京东','天猫'])
# 设置Y轴的刻度范围
plt.xlim([32,47])

# 为每个条形图添加数值标签
for x,y in enumerate(price):
    plt.text(y+0.1,x,'%s' %y,va='center')
# 显示图形    
plt.show()

饼图

1.相对于折线图和bar chart 饼图画图为 plt.pie(x)

plt.axis('equal') 表示正圆否则是椭圆

import codecademylib
from matplotlib import pyplot as plt
import numpy as np

payment_method_names = ["Card Swipe", "Cash", "Apple Pay", "Other"]
payment_method_freqs = [270, 77, 32, 11]

#make your pie chart here
plt.axis('equal')
plt.pie(payment_method_freqs)

plt.show()

2.有两种标签方式

plt.pie(payment_method_freqs,labels=payment_method_names)

plt.legend(payment_method_names)

3.百分比设置

autopct='%0.1f%%'

'%0.2f' ：4.08

'%0.2f%%' :4.08%

'%d%%' : 4%

plt.pie(budget_data,
        labels=budget_categories,
        autopct='%0.1f%%')

例子1：

import codecademylib
from matplotlib import pyplot as plt

unit_topics = ['Limits', 'Derivatives', 'Integrals', 'Diff Eq', 'Applications']
num_hardest_reported = [1, 3, 10, 15, 1]

#Make your plot here
plt.figure(figsize=(10,8))
plt.pie(num_hardest_reported, labels=unit_topics, autopct="%1d%%")

plt.axis('equal')
plt.title('Hardest Topics')

plt.show()
plt.savefig("my_pie_chart.png")

例子2：

# 导入第三方模块
import matplotlib.pyplot as plt

# 设置绘图的主题风格（不妨使用R中的ggplot分隔）
plt.style.use('ggplot')

# 构造数据
edu = [0.2515,0.3724,0.3336,0.0368,0.0057]
labels = ['中专','大专','本科','硕士','其他']

explode = [0,0.1,0,0,0]  # 用于突出显示大专学历人群
colors=['#9999ff','#ff9999','#7777aa','#2442aa','#dd5555'] # 自定义颜色

# 中文乱码和坐标轴负号的处理
#plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.sans-serif'] = ['SimHei']
# 将横、纵坐标轴标准化处理，保证饼图是一个正圆，否则为椭圆
plt.axes(aspect='equal')

# 控制x轴和y轴的范围
plt.xlim(0,4)
plt.ylim(0,4)

# 绘制饼图
plt.pie(x = edu, # 绘图数据
        explode=explode, # 突出显示大专人群
        labels=labels, # 添加教育水平标签
        colors=colors, # 设置饼图的自定义填充色
        autopct='%.1f%%', # 设置百分比的格式，这里保留一位小数
        pctdistance=0.8,  # 设置百分比标签与圆心的距离
        labeldistance = 1.15, # 设置教育水平标签与圆心的距离
        startangle = 180, # 设置饼图的初始角度
        radius = 1.5, # 设置饼图的半径
        counterclock = False, # 是否逆时针，这里设置为顺时针方向
        wedgeprops = {'linewidth': 1.5, 'edgecolor':'green'},# 设置饼图内外边界的属性值
        textprops = {'fontsize':12, 'color':'k'}, # 设置文本标签的属性值
        center = (1.8,1.8), # 设置饼图的原点
        frame = 1 )# 是否显示饼图的图框，这里设置显示

# 删除x轴和y轴的刻度
plt.xticks(())
plt.yticks(())
# 添加图标题
plt.title('芝麻信用失信用户教育水平分布')

# 显示图形
plt.show()

import codecademylib3_seaborn
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

# Bar Graph: Featured Games

games = ["LoL", "Dota 2", "CS:GO", "DayZ", "HOS", "Isaac", "Shows", "Hearth", "WoT", "Agar.io"]

viewers =  [1070, 472, 302, 239, 210, 171, 170, 90, 86, 71]
colors = ['lightskyblue', 'gold', 'lightcoral', 'gainsboro', 'royalblue', 'lightpink', 'darkseagreen', 'sienna', 'khaki', 'gold', 'violet', 'yellowgreen']
labels = ["US", "DE", "CA", "N/A", "GB", "TR", "BR", "DK", "PL", "BE", "NL", "Others"]
explode = (0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

ax2=plt.subplot()
plt.figure(figsize=(5,5))
countries = [447, 66, 64, 49, 45, 28, 25, 20, 19, 17, 17, 279]
plt.pie(countries,labels=labels,colors=colors,explode=explode,shadow=True, startangle=345, pctdistance=1.15)

plt.axis('equal')

plt.title("League of Legends Viewers' Whereabouts")
plt.legend(labels, loc="right")
plt.show()

Histograms 直方图最适合显示数据集的图表

plt.hist

tips:pd中的列直接取出是seriese,要使用 df.column.values当做list用

1.hist()的range与bins可以写可以不写，控制横坐标与直方个数, 可以通过range和bins来使其更好显示

import codecademylib
import numpy as np
# Write matplotlib import here:
from matplotlib import pyplot as plt
commutes = np.genfromtxt('commutes.csv', delimiter=',')

# Plot histogram here:
plt.hist(commutes,range=(20,50),bins=6)
plt.show()

2.alpha=(0~1) 控制显示虚无度，

normed两张表相差过大时用来使相近

plt.hist(sales_times1, bins=20,alpha=0.4,normed=True)
#plot your other histogram here
plt.hist(sales_times2,bins=20,alpha=0.4,normed=True)

随机成成一组正态分布的数据方法

np.random.normal(mean,std,number)

import codecademylib
import numpy as np
from matplotlib import pyplot as plt

# Brachiosaurus
b_data = np.random.normal(6.7,0.7,1000)

# Fictionosaurus
f_data = np.random.normal(7.7,0.3,1000)

plt.hist(b_data,
         bins=30, range=(5, 8.5), histtype='step',
         label='Brachiosaurus')
plt.hist(f_data,
         bins=30, range=(5, 8.5), histtype='step',
         label='Fictionosaurus')
plt.xlabel('Femur Length (ft)')
plt.legend(loc=2)
plt.show()

例子：

import codecademylib
from matplotlib import pyplot as plt

exam_scores1 = [62.58, 67.63, 81.37, 52.53, 62.98, 72.15, 59.05, 73.85, 97.24, 76.81, 89.34, 74.44, 68.52, 85.13, 90.75, 70.29, 75.62, 85.38, 77.82, 98.31, 79.08, 61.72, 71.33, 80.77, 80.31, 78.16, 61.15, 64.99, 72.67, 78.94]
exam_scores2 = [72.38, 71.28, 79.24, 83.86, 84.42, 79.38, 75.51, 76.63, 81.48,78.81,79.23,74.38,79.27,81.07,75.42,90.35,82.93,86.74,81.33,95.1,86.57,83.66,85.58,81.87,92.14,72.15,91.64,74.21,89.04,76.54,81.9,96.5,80.05,74.77,72.26,73.23,92.6,66.22,70.09,77.2]

# Make your plot here
plt.figure(figsize=(10,8))
plt.hist(exam_scores1,bins=12,normed=True,histtype='step',linewidth=2)
plt.hist(exam_scores2,bins=12,normed=True,histtype='step',linewidth=2)

plt.legend(["1st Yr Teaching","2nd Yr Teaching"],loc=2)

plt.title('Final Exam Score Distribution')
plt.xlabel('percentage')
plt.ylabel('Frequency')

plt.show()
plt.savefig('my_historam.png')

散点图

import codecademylib3_seaborn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

mu = 1
std = 0.5
mu2 = 4.188

np.random.seed(100)

xs = np.append(np.append(np.append(np.random.normal(0.25,std,100), np.random.normal(0.75,std,100)), np.random.normal(0.25,std,100)), np.random.normal(0.75,std,100))

ys = np.append(np.append(np.append(np.random.normal(0.25,std,100), np.random.normal(0.25,std,100)), np.random.normal(0.75,std,100)), np.random.normal(0.75,std,100))

values = list(zip(xs, ys))

model = KMeans(init='random', n_clusters=2)

results = model.fit_predict(values)

plt.scatter(xs, ys, c=results, alpha=0.6)

colors = ['#6400e4', '#ffc740']

for i in range(2):
  points = np.array([values[j] for j in range(len(values)) if results[j] == i])
  plt.scatter(points[:, 0], points[:, 1], c=colors[i], alpha=0.6)
  
plt.title('Codecademy Mobile Feedback - Data Science')

plt.xlabel('Learn Python')
plt.ylabel('Learn SQL')
  
plt.show()