admin管理员组

文章数量:1438188

利用小提琴图探索帕尔默企鹅数据

利用小提琴图探索帕尔默企鹅数据

代码语言:python
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st

from palmerpenguins import load_penguins

数据探索

代码语言:python
# Load the Palmer penguins dataset and preview its first rows
penguins = load_penguins()
penguins.head()
image-20240129165915062

image-20240129165915062

数据集包含以下字段:
- species:企鹅的种类,包括 Adelie、Chinstrap 和 Gentoo 三种。
- island:企鹅所在岛屿的名字,包括 Biscoe、Dream 和 Torgersen 三个岛屿。
- bill_length_mm:企鹅的喙长,单位毫米。
- bill_depth_mm:企鹅的喙深,单位毫米。
- flipper_length_mm:企鹅的鳍状肢长度,单位毫米。
- body_mass_g:企鹅的体重,单位克。
- sex:企鹅的性别,包括 Male 和 Female。

代码语言:python
# Data cleaning
penguins = penguins.dropna()  # drop rows that contain missing values
species = sorted(penguins["species"].unique())  # sorted list of species names
# One array of bill_length_mm values per species, in the same order as `species`
y_data = [
    penguins.loc[penguins["species"] == name, "bill_length_mm"].values
    for name in species
]
代码语言:python
# Build jittered x coordinates so that overlapping data points are spread out
jitter = 0.04
# Group i is centred at x = i: one constant array per species
x_data = [np.full(len(values), idx) for idx, values in enumerate(y_data)]
# Horizontal noise drawn from a Student's t distribution (df=6) scaled by `jitter`
noise = st.t(df=6, scale=jitter)
x_jittered = [centers + noise.rvs(len(centers)) for centers in x_data]

绘制基础小提琴图

代码语言:python
# Basic settings: colours, violin positions and horizontal reference lines

# Species palette, taken from the Dark2 palette of the RColorBrewer R library
COLOR_SCALE = ["#1B9E77", "#D95F02", "#7570B3"]

# Background
BG_WHITE = "#fbf9f4"

# Greys and black
GREY_LIGHT = "#b4aea9"
GREY50 = "#7F7F7F"
GREY_DARK = "#747473"
BLACK = "#282724"

# Blues
BLUE = "#2a475e"
BLUE_DARK = "#1B2838"

# Accent
RED_DARK = "#850e00"

# x positions of the three species (any values work, e.g. [-1, 0, 1])
POSITIONS = [0, 1, 2]

# y values of the horizontal reference lines
HLINES = [40, 50, 60]
代码语言:python
# Basic layout: for each species add a violin, a box plot and the jittered points

# Figure and axes
fig, ax = plt.subplots(figsize=(14, 10))

# Same background colour for the figure and the plotting area
fig.patch.set_facecolor(BG_WHITE)
ax.set_facecolor(BG_WHITE)

# Dashed horizontal lines used as a reference for bill_length_mm values
for level in HLINES:
    ax.axhline(level, color=GREY50, linestyle=(0, (5, 5)), alpha=0.8, zorder=0)

# Violin outlines only: means, medians and extrema are switched off
violins = ax.violinplot(
    y_data,
    positions=POSITIONS,
    widths=0.45,
    bw_method="silverman",
    showmeans=False,
    showmedians=False,
    showextrema=False,
)

# Violin appearance: no fill, opaque black outline
for body in violins["bodies"]:
    body.set_facecolor("none")
    body.set_edgecolor(BLACK)
    body.set_linewidth(1.4)
    body.set_alpha(1)

# Box plots: thick median line, thinner box and whiskers
medianprops = {"linewidth": 4, "color": GREY_DARK, "solid_capstyle": "butt"}
boxprops = {"linewidth": 2, "color": GREY_DARK}

ax.boxplot(
    y_data,
    positions=POSITIONS,
    showfliers=False,  # do not show the outliers beyond the caps
    showcaps=False,  # do not show the caps
    medianprops=medianprops,
    whiskerprops=boxprops,
    boxprops=boxprops,
)

# Jittered raw observations, one colour per species
for xs, ys, point_color in zip(x_jittered, y_data, COLOR_SCALE):
    ax.scatter(xs, ys, s=100, color=point_color, alpha=0.4)
    
png

添加自定义注释

众所周知,可视化好不好看,全凭注释是否精(花)准(哨)。接下来就是最考验技术的地方了!

代码语言:python
# Annotate each species with its mean bill length
means = [values.mean() for values in y_data]
label_offset = 0.25  # horizontal distance between a mean marker and its label

# Anchor the annotations at POSITIONS (not at the loop index) so they stay on
# top of the violins and box plots, which are drawn at POSITIONS
for pos, mean in zip(POSITIONS, means):
    # Dot marking the mean
    ax.scatter(pos, mean, s=250, color=RED_DARK, zorder=3)

    # Line connecting the dot to its label
    ax.plot([pos, pos + label_offset], [mean, mean], ls="dashdot", color="black", zorder=3)

    # Boxed label showing the mean rounded to two decimals
    ax.text(
        pos + label_offset,
        mean,
        r"$\hat{\mu}_{\rm{mean}} = $" + str(round(mean, 2)),
        fontsize=13,
        va="center",
        bbox=dict(
            facecolor="white",
            edgecolor="black",
            boxstyle="round",
            pad=0.15,
        ),
        zorder=10,  # keep the label on top
    )

# p-values for the pairwise differences in means (multiple comparisons)
tick_len = 0.25  # length of the tick at each end of a bracket
pad = 0.2  # gap between a bracket line and its text

# p-value labels
label1 = r"$p_{\rm{Holm-corrected}}$ = 8.42e-14"
label2 = r"$p_{\rm{Holm-corrected}}$ = 4.3e-14"
label3 = r"$p_{\rm{Holm-corrected}}$ = 0.031"

# (index of left group, index of right group, bracket height, label)
comparisons = [
    (0, 1, 62.5, label1),
    (0, 2, 65, label2),
    (1, 2, 67.5, label3),
]
for left, right, height, p_label in comparisons:
    x_left, x_right = POSITIONS[left], POSITIONS[right]
    # Bracket: end tick, horizontal bar, end tick
    ax.plot(
        [x_left, x_left, x_right, x_right],
        [height - tick_len, height, height, height - tick_len],
        c="black",
    )
    # Label centred just above the bracket
    ax.text(
        (x_left + x_right) / 2,
        height + pad,
        p_label,
        fontsize=11,
        va="bottom",
        ha="center",
    )

fig
output_14_0

为图表增加更丰富的信息

代码语言:python
# Customise the layout

# Hide the right and top spines
for side in ("right", "top"):
    ax.spines[side].set_color("none")

# Colour and line width of the remaining spines
for side in ("left", "bottom"):
    ax.spines[side].set_color(GREY_LIGHT)
    ax.spines[side].set_linewidth(2)

# Ticks and axis labels
ax.tick_params(length=0)
ax.set_yticks(HLINES)
ax.set_yticklabels(HLINES, size=15)
ax.set_ylabel("Bill Length", size=18, weight="bold")

# x tick labels show each species name together with its sample size
xlabels = [f"{name}\n(n={values.size})" for name, values in zip(species, y_data)]
ax.set_xticks(POSITIONS)
ax.set_xticklabels(xlabels, size=15, ha="center", ma="center")
ax.set_xlabel("Penguin Species", size=18, weight="bold")

# Title (figure-level)
# NOTE(review): the intended look needs the "Lobster Two" font to be installed
fig.suptitle(
    "Distribution of bill length across penguins species",
    x=0.122,
    y=0.975,
    ha="left",
    fontsize=26,
    fontname="Lobster Two",
    color=BLUE,
    weight="bold",
)

# Subtitle (axes-level): Welch F-test summary
stats = [
    r"$F_{\rm{Welch}}$(2, 165.34)=409.93",
    r"p=8.27e-65",
    r"$\widehat{\omega_p^2}$=0.83",
    r"CI$_{95\%}$[0.79, 0.86]",
    r"n$_{\rm{obs}}$=333"
]

ax.set_title(
    ", ".join(stats),
    loc="left",
    ha="left",
    fontsize=20,
    color=BLUE_DARK
)

# Bottom-right note: Bayes-factor summary, kept under its own name so the
# subtitle list cannot overwrite it and the note does not repeat the subtitle
bayes_stats = [
    r"$\log_{\rm{e}}(\rm{BF}_{01})=-195.59$",
    r"$\widehat{R^2}_{\rm{Bayesian}}^{\rm{posterior}}=0.70$",
    r"$\rm{CI}_{95\%}^{\rm{HDI}}[0.67, 0.73]$",
    r"$r^{\rm{Cauchy}}_{\rm{JZS}} = 0.71$",
]

fig.text(
    0.55,
    0.03,
    ", ".join(bayes_stats),
    fontsize=10
)

fig.text(
    0.55,
    0.005,
    r"Pairwise test: $\bf{Games-Howell}$ $\bf{test}$; Comparisons shown: $\bf{Only}$ $\bf{significant}$",
    fontsize=10
)

fig
output_16_0

参考:Palmer Penguins exploration with violinplots in Matplotlib[1]

共勉~

参考资料

[1]

Palmer Penguins exploration with violinplots in Matplotlib: /

本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。原始发表:2025-04-22,如有侵权请联系 cloudcommunity@tencent.com 删除

标签:dataset、布局、基础、数据

本文标签: 利用小提琴图探索帕尔默企鹅数据