[python] matplotlib draws volcano graphs, bar graphs, and histograms

Article Directory

Volcano plot

Draw a volcano plot. The input is two DataFrames in which each row is a sample and each column is a gene. Differential expression of each gene is assessed with a t-test.

def minmax_scale(data):
    """Rescale ``data`` linearly to the range [0, 1] (min-max normalisation).

    Args:
        data: Numeric values (list, NumPy array or pandas Series).

    Returns:
        ``(data - min) / (max - min)``. When every value is identical the
        range is zero; the result is then all zeros instead of the NaN that
        a 0 / 0 division would produce.

    Raises:
        ValueError: If ``data`` is empty (raised by ``np.min``).
    """
    import numpy as np

    min_val = np.min(data)
    max_val = np.max(data)
    span = max_val - min_val
    if span == 0:
        # Constant input: divide by 1 so every value maps to 0 rather than
        # NaN (plus a RuntimeWarning) from 0 / 0.
        span = 1
    return (data - min_val) / span


# t-test on one gene: is its expression up- or down-regulated?
def gene_exp(group1, group2, is_scale=False, alpha=0.05):
    """Classify one gene as up-/down-regulated between two sample groups.

    Args:
        group1: Expression values of the gene in the first group.
        group2: Expression values of the gene in the second group.
        is_scale: If true, each group is passed through ``minmax_scale``
            (independently of the other group) before testing.
        alpha: Significance level the p-value is compared against.

    Returns:
        A tuple ``(label, fold_change, p_value)`` where ``label`` is
        ``'Up'``, ``'Down'`` or ``'Unknown'``, ``fold_change`` is
        ``log2(mean(group2) / mean(group1))`` and ``p_value`` comes from a
        two-sample t-test with ``equal_var=False``. ``fold_change`` and
        ``p_value`` may be NaN or infinite (e.g. a zero mean or constant
        groups); the label is then ``'Unknown'`` unless the sign of the fold
        change is still defined and the p-value is significant.
    """
    from scipy import stats
    import numpy as np

    if is_scale:
        group1 = minmax_scale(group1)
        group2 = minmax_scale(group2)

    # Independent two-sample t-test without assuming equal variances.
    _, p_value = stats.ttest_ind(group1, group2, equal_var=False)

    # A zero or negative mean ratio yields inf/NaN; that is reported through
    # the return value, so the accompanying NumPy warnings are silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        fold_change = np.log2(np.mean(group2) / np.mean(group1))

    if p_value < alpha:
        if fold_change > 0:
            return 'Up', fold_change, p_value
        if fold_change < 0:
            return 'Down', fold_change, p_value
    return 'Unknown', fold_change, p_value

# Build the list of point colours for the scatter plot
def clst(lst):
    """Translate regulation labels into matplotlib colour codes.

    ``'Up'`` becomes ``'r'``, ``'Down'`` becomes ``'g'`` and every other
    label becomes ``'k'``. The result has one entry per input label, in the
    same order.
    """
    return [
        'r' if label == 'Up' else 'g' if label == 'Down' else 'k'
        for label in lst
    ]

           
def data_exp(df1, df2, is_scale=False):
    """Run ``gene_exp`` on every gene (column) of two expression tables.

    Args:
        df1: First DataFrame; rows are samples, columns are genes.
        df2: Second DataFrame with the same columns as ``df1``; the number
            of rows (samples) may differ.
        is_scale: Forwarded to ``gene_exp``.

    Returns:
        Three parallel lists ``(labels, fold_changes, p_values)``. Genes
        whose fold change or p-value is NaN or infinite are left out, so
        the lists can be shorter than ``df1.columns``.

    Every gene, including the skipped ones, is printed as one tab-separated
    line: name, label, fold change, p-value.
    """
    import numpy as np

    rmk_lst = []
    fc_lst = []
    pv_lst = []
    for k in df1.columns:
        rmk, fc, pv = gene_exp(list(df1[k]), list(df2[k]), is_scale=is_scale)

        # Tuple labels (multi-level columns) are joined with ':'; any other
        # label is printed as-is instead of being split into characters.
        name = ':'.join(map(str, k)) if isinstance(k, tuple) else str(k)
        print(name + '\t' + rmk + '\t' + str(fc) + '\t' + str(pv))

        # NaN and inf can occur (e.g. zero means); such genes are not returned.
        if not (np.isfinite(fc) and np.isfinite(pv)):
            continue

        rmk_lst.append(rmk)
        fc_lst.append(fc)
        pv_lst.append(pv)
    return rmk_lst, fc_lst, pv_lst


def plot_volcano(fold_change, p_value, title, pltcolor=False, regulation=None, alpha=0.05):
    """Draw a volcano plot: log2 fold change against -log10(p-value).

    Args:
        fold_change: log2 fold change of each gene (x axis).
        p_value: p-value of each gene, same length as ``fold_change``.
        title: Text inserted into the plot title.
        pltcolor: If true, colour each point according to ``regulation``.
        regulation: One label per gene (``'Up'``, ``'Down'`` or anything
            else), converted to colours by ``clst``. Only used when
            ``pltcolor`` is true.
        alpha: Significance level; a dashed horizontal line is drawn at
            ``-log10(alpha)``.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    fig, ax = plt.subplots()

    # Clamp p-values at machine epsilon so that p == 0 does not become
    # an infinite -log10 value.
    neg_log_pval = -np.log10(np.maximum(p_value, np.finfo(float).eps))

    if pltcolor:
        # Colour points by up-/down-regulation.
        colors = clst(regulation if regulation is not None else [])
        ax.scatter(fold_change, neg_log_pval, c=colors)
    else:
        ax.scatter(fold_change, neg_log_pval)

    ax.set_xlabel('Fold Change (log2)')
    ax.set_ylabel('-log10(p-value)')
    ax.set_title('Volcano Plot[%s]' % title)

    # Horizontal line marking the significance threshold.
    ax.axhline(-np.log10(alpha), color='gray', linestyle='--')

    plt.show()

Usage:

# Get the up/down-regulation labels, fold changes and p-values:
rmklst, fclst, pvlst = data_exp(data_df1, data_df2, is_scale=True)

# Draw the volcano plot
plot_volcano(list(fclst), list(pvlst), title="mytitle", pltcolor=True, regulation=rmklst)

bar chart

Note on this code: the left and right y-axis labels only line up when both y-axes share the same limits.

import pandas as pd
import matplotlib.pyplot as plt

# Example DataFrame 1 (drawn on the left, labelled on the left y-axis)
data1 = {
    '城市': ['北京', '上海', '广州', '深圳', '成都'],
    '人口': [2154, 2423, 1404, 1303, 1682],
}
df1 = pd.DataFrame(data1)

# Example DataFrame 2 (drawn on the right, labelled on the right y-axis)
data2 = {
    '城市': ['纽约', '伦敦', '巴黎', '东京', '新加坡'],
    '人口': [8537, 9304, 2141, 13929, 5894],
}
df2 = pd.DataFrame(data2)

width = 0.35  # thickness of each horizontal bar

# Create the figure and the primary axes
fig, ax = plt.subplots(figsize=(10, 6))

# Left-hand bars: negated so they extend to the left of x = 0,
# next to the left y-axis that carries the df1 labels.
ax.barh(df1.index, -df1['人口'], height=width, color='blue', label='DataFrame 1')

# Right-hand bars: extend to the right of x = 0, next to the df2 labels.
ax.barh(df2.index, df2['人口'], height=width, color='red', label='DataFrame 2')

# Left y-axis: one tick per row, labelled with the df1 names
ax.set_yticks(df1.index)
ax.set_yticklabels(df1['城市'])

# Right y-axis: a twin axis has its own y-limits, so copy the limits of the
# primary axis; otherwise the right-hand ticks do not line up with the bars.
ax2 = ax.twinx()
ax2.set_ylim(ax.get_ylim())
ax2.set_yticks(df2.index)
# Row i holds the bar of df2 row i, so the labels keep the DataFrame order.
ax2.set_yticklabels(df2['城市'])

# Same tick style on both sides
ax.tick_params(axis="y", direction="inout", length=6, pad=10)  # left labels
ax2.tick_params(axis="y", direction="inout", length=6, pad=10)  # right labels

# Show the legend
ax.legend()

# Show the chart
plt.show()

insert image description here

histogram

Custom bin intervals

import numpy as np
import matplotlib.pyplot as plt

def histplot(df0, df1, label, xmax=None):
    """Plot overlaid histograms of every column of two DataFrames.

    One subplot is drawn per column of ``df0``, five per row, with the
    matching column of ``df1`` drawn semi-transparently on top.

    Args:
        df0: First DataFrame; its columns decide which subplots are drawn.
        df1: Second DataFrame; must contain the same columns.
        label: Legend prefix; the two series are named ``label + '-0'``
            and ``label + '-1'``.
        xmax: If given, the x axis of every subplot is limited to
            ``(0, xmax)``.
    """
    # Fixed bin edges; adjust as needed. Values outside the edges are not
    # counted.
    bins = np.arange(0, 1.1, 0.1)

    ncols = 5
    names = df0.columns
    # At least the original 8 rows, more when there are over 40 columns
    # (previously an IndexError).
    nrows = max(8, -(-len(names) // ncols))
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 2.5 * nrows),
                            sharex=True, sharey=True)
    for i, idx_name in enumerate(names):
        ax = axs[i // ncols, i % ncols]

        ax.hist(df0[idx_name], bins=bins, label=label+'-0')
        ax.hist(df1[idx_name], bins=bins, alpha=0.5, label=label+'-1')
        ax.set_title(idx_name, fontsize=8)
        ax.grid(axis="y")
        ax.legend()
        if xmax is not None:
            ax.set_xlim(0, xmax)
    plt.show()

# Example call; data_df0 and data_df1 must already exist as DataFrames, and
# data_df1 needs every column of data_df0.
histplot(data_df0, data_df1, 'label')


def bins_lst(lst1, lst2, n=10):
    """Return the edges of ``n`` equal-width bins covering both inputs.

    Args:
        lst1: First collection of numeric values.
        lst2: Second collection of numeric values.
        n: Number of bins (at least 1).

    Returns:
        A NumPy array of ``n + 1`` increasing edges. The first edge is the
        smallest and the last edge the largest finite value found in
        ``lst1`` and ``lst2``. If all values are equal, the range is widened
        by 0.5 on each side so the bins have a non-zero width.

    Raises:
        ValueError: If ``n`` is smaller than 1 or there is no finite value.
    """
    import numpy as np

    if n < 1:
        raise ValueError("n must be at least 1")

    values = np.concatenate([
        np.asarray(lst1, dtype=float).ravel(),
        np.asarray(lst2, dtype=float).ravel(),
    ])
    # NaN/inf cannot be binned and would otherwise poison min/max.
    values = values[np.isfinite(values)]
    if values.size == 0:
        raise ValueError("bins_lst() needs at least one finite value")

    low = values.min()
    high = values.max()
    if low == high:
        low, high = low - 0.5, high + 0.5

    # linspace hits both end points exactly; rounding the limits or the step
    # could leave data outside the bins or produce a zero step.
    return np.linspace(low, high, n + 1)


def histplot_bin(hl_df, pc_df, label, nrows=2, hsize=6, nbin=10, xmax=None):
    """Plot overlaid per-column histograms with bins fitted to each column.

    One subplot is drawn per column of ``hl_df``, five per row. The bin
    edges of each subplot come from ``bins_lst`` applied to that column of
    both DataFrames.

    Args:
        hl_df: First DataFrame; its columns decide which subplots are drawn.
        pc_df: Second DataFrame; must contain the same columns.
        label: Legend prefix; the two series are named ``label + '-0'``
            and ``label + '-1'``.
        nrows: Number of subplot rows; raised automatically when the columns
            do not fit.
        hsize: Figure height in inches (the width is fixed at 9).
        nbin: Number of bins per subplot, forwarded to ``bins_lst``.
        xmax: If given, the x axis of every subplot is limited to
            ``(0, xmax)``.
    """
    ncols = 5
    names = hl_df.columns
    # Never fewer rows than needed, otherwise indexing the grid fails.
    nrows = max(nrows, -(-len(names) // ncols))
    # squeeze=False keeps axs two-dimensional even for a single row.
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(9, hsize), squeeze=False)

    ax = None
    for i, idx_name in enumerate(names):
        ax = axs[i // ncols, i % ncols]

        bins = bins_lst(hl_df[idx_name], pc_df[idx_name], n=nbin)

        ax.hist(hl_df[idx_name], bins=bins, label=label+'-0')
        ax.hist(pc_df[idx_name], bins=bins, alpha=0.5, label=label+'-1')
        ax.set_title(idx_name, fontsize=8)
        ax.grid(axis="y")
        if xmax is not None:
            ax.set_xlim(0, xmax)

    # A single legend, placed on the last populated subplot.
    if ax is not None:
        ax.legend()
    plt.show()


# Usage
histplot_bin(df1[features], df2[features], label="label", nrows=3, hsize=8)

Guess you like

Origin blog.csdn.net/sinat_32872729/article/details/131382055