Source code for ehrapy.plot._variable_correlation_plot

from __future__ import annotations

from typing import TYPE_CHECKING, Literal

import holoviews as hv
import numpy as np
import pandas as pd

import ehrapy as ep

if TYPE_CHECKING:
    from collections.abc import Sequence

    from ehrdata import EHRData


def variable_correlations(
    edata: EHRData,
    *,
    layer: str,
    var_names: Sequence[str] | None = None,
    method: Literal["spearman", "pearson", "kendall"] = "pearson",
    agg: Literal["mean", "last", "first"] = "mean",
    correction_method: Literal["bonferroni", "fdr_bh", "fdr_tsbh", "holm", "none"] = "bonferroni",
    alpha: float = 0.05,
    width: int = 600,
    height: int = 600,
    cmap: str = "RdBu_r",
    show_values: bool = True,
    title: str | None = None,
) -> hv.HeatMap | hv.Overlay:
    """Plot variable correlations as heatmap.

    Computes a correlation matrix (Pearson, Spearman or Kendall) for the selected variables from the given layer.
    If the layer contains a time dimension, values are first aggregated per variable across time.
    Cells are annotated with the correlation coefficient.
    An asterisk marks statistically significant off-diagonal correlations after correction;
    cells without a defined correlation are labelled ``N/A`` and drawn in the neutral color.

    Args:
        edata: Central data object.
        layer: Layer to extract data from.
        var_names: List of variable names to compute correlation of. If None, uses all numeric variables.
        method: Correlation method: "spearman", "kendall" or "pearson".
        agg: How to aggregate time dimension: "mean", "last" or "first".
        correction_method: Multiple testing correction method:

            * `'bonferroni'` conservative Bonferroni correction.
            * `'fdr_bh'` Benjamini-Hochberg false discovery rate (FDR) control.
            * `'fdr_tsbh'` two-stage Benjamini-Hochberg, better calibrated when many variables are truly correlated.
            * `'holm'` Holm-Bonferroni correction.
            * `'none'` no multiple-testing correction.
        alpha: Significance threshold after correction.
        width: Plot width in pixels.
        height: Plot height in pixels.
        cmap: Colormap for the heatmap.
        show_values: If True, display correlation values on cells.
        title: Set the title of the plot. If None, a title is generated from `method`
            (and `correction_method` / `alpha` unless the correction is `'none'`).

    Returns:
        :class:`holoviews.element.HeatMap` (if show_values=False) or
        :class:`holoviews.core.overlay.Overlay` (if show_values=True).

    Examples:
        >>> import ehrdata as ed
        >>> import ehrapy as ep
        >>> edata = ed.dt.ehrdata_blobs(n_variables=10, n_centers=5, n_observations=200, base_timepoints=3)
        >>> ep.pl.variable_correlations(
        ...     edata, layer="tem_data", method="pearson", agg="mean", correction_method="fdr_bh", width=700
        ... )

        .. image:: /_static/docstring_previews/variable_correlations_heatmap.png
    """
    correlations, _, significance = ep.pp.variable_correlations(
        edata=edata,
        layer=layer,
        var_names=var_names,
        method=method,
        agg=agg,
        correction_method=correction_method,
        alpha=alpha,
    )

    # Reshape both square matrices to long format (one row per variable pair), keeping NaN cells.
    stacked_columns = [
        correlations.stack(dropna=False).rename("correlation"),
        significance.stack(dropna=False).rename("significant"),
    ]
    cells = pd.concat(stacked_columns, axis=1).reset_index()
    cells.columns = ["variable1", "variable2", "correlation", "significant"]

    undefined = cells["correlation"].isna()
    off_diagonal = cells["variable1"] != cells["variable2"]

    # The diagonal is never starred, even if it is flagged as significant.
    star = np.where(cells["significant"] & off_diagonal, "*", "")
    formatted = cells["correlation"].map("{:.2f}".format) + star
    cells["label"] = np.where(undefined, "N/A", formatted)

    # for NaN correlations the neutral color will be shown on the colorscale
    cells["correlation"] = cells["correlation"].fillna(0)

    if title is None:
        correction_note = (
            "" if correction_method == "none" else f"(correction method: {correction_method}, alpha={alpha})"
        )
        title = f"{method.capitalize()} Correlation Matrix " + correction_note

    pair_dims = ["variable1", "variable2"]
    heatmap = hv.HeatMap(cells, kdims=pair_dims, vdims=["correlation", "label"]).opts(
        width=width,
        height=height,
        cmap=cmap,
        clim=(-1, 1),
        colorbar=True,
        title=title,
        xrotation=45,
        toolbar="above",
        fontscale=1.2,
        xlabel="",
        ylabel="",
    )

    if not show_values:
        return heatmap

    cell_labels = hv.Labels(cells, kdims=["variable1", "variable2"], vdims="label").opts(
        text_font_size="10pt",
        text_color="black",
        text_align="center",
    )
    return heatmap * cell_labels
def variable_dependencies(
    edata: EHRData,
    *,
    layer: str,
    var_names: Sequence[str] | None = None,
    method: Literal["spearman", "pearson", "kendall"] = "pearson",
    agg: Literal["mean", "last", "first"] = "mean",
    correction_method: Literal["bonferroni", "fdr_bh", "fdr_tsbh", "holm", "none"] = "bonferroni",
    alpha: float = 0.05,
    abs_correlation_threshold: float = 0.3,
    only_significant: bool = True,
    width: int = 600,
    height: int = 600,
    cmap: str = "RdBu_r",
    title: str | None = None,
) -> hv.Chord:
    """Plot correlation dependencies as a chord diagram.

    Computes pairwise correlations between selected variables from layer and visualizes them as a chord diagram.
    If the layer contains a time dimension, values are aggregated per variable before correlation is computed.
    Each unordered variable pair contributes at most one chord; pairs with an undefined (NaN) correlation
    are skipped.

    Args:
        edata: Central data object.
        layer: Layer to extract data from.
        var_names: List of variable names to compute correlation of. If None, uses all numeric variables.
        method: Correlation method: "spearman", "kendall" or "pearson".
        agg: How to aggregate time dimension: "mean", "last" or "first".
        correction_method: Multiple testing correction method:

            * `'bonferroni'` conservative Bonferroni correction.
            * `'fdr_bh'` Benjamini-Hochberg false discovery rate (FDR) control.
            * `'fdr_tsbh'` two-stage Benjamini-Hochberg, better calibrated when many variables are truly correlated.
            * `'holm'` Holm-Bonferroni correction.
            * `'none'` no multiple-testing correction.
        alpha: Significance threshold after correction.
        abs_correlation_threshold: Minimum absolute correlation to show a chord. Must lie in [0, 1].
        only_significant: If True, only show significant correlations.
        width: Plot width in pixels.
        height: Plot height in pixels.
        cmap: Colormap for the chord diagram.
        title: Set the title of the plot. If None, a title is generated from `method`
            (and `correction_method` / `alpha` unless the correction is `'none'`).

    Returns:
        :class:`holoviews.element.Chord` object.

    Raises:
        ValueError: If `abs_correlation_threshold` is outside [0, 1], or if no variable pair
            satisfies the threshold / significance filters.

    Examples:
        >>> import ehrdata as ed
        >>> import ehrapy as ep
        >>> edata = ed.dt.ehrdata_blobs(n_variables=10, n_centers=5, n_observations=200, base_timepoints=3)
        >>> ep.pl.variable_dependencies(
        ...     edata, layer="tem_data", method="pearson", agg="mean", correction_method="fdr_bh"
        ... )

        .. image:: /_static/docstring_previews/variable_dependencies_chord.png
    """
    # Validate before doing any work; the chained comparison also rejects NaN.
    if not 0 <= abs_correlation_threshold <= 1:
        raise ValueError(f"abs_correlation_threshold must be between 0 and 1, got {abs_correlation_threshold}")

    corr_df, _, sig_df = ep.pp.variable_correlations(
        edata=edata,
        layer=layer,
        var_names=var_names,
        method=method,
        agg=agg,
        correction_method=correction_method,
        alpha=alpha,
    )

    # Long format: one row per ordered (variable1, variable2) pair.
    corr_long = corr_df.stack(dropna=False).rename("correlation")
    sig_long = sig_df.stack().rename("significant")
    edges_df = pd.concat([corr_long, sig_long], axis=1).reset_index()
    edges_df.columns = ["variable1", "variable2", "correlation", "significant"]

    # Chord edges reference nodes by integer position in the correlation matrix.
    variables = corr_df.columns.to_list()
    var_to_idx = {var: idx for idx, var in enumerate(variables)}
    edges_df["source"] = edges_df["variable1"].map(var_to_idx)
    edges_df["target"] = edges_df["variable2"].map(var_to_idx)

    # Keep the strict upper triangle only: drops self-correlations and mirrored duplicates.
    edges_df = edges_df[edges_df["source"] < edges_df["target"]]
    edges_df = edges_df.dropna(subset="correlation")
    # `assign` returns a new frame, so no column is set on a filtered slice of the original.
    edges_df = edges_df.assign(value=edges_df["correlation"].abs())

    if only_significant:
        edges_df = edges_df[edges_df["significant"]]
    edges_df = edges_df[edges_df["value"] >= abs_correlation_threshold]
    edges_df = edges_df[["source", "target", "value", "correlation"]].reset_index(drop=True)

    if edges_df.empty:
        raise ValueError(
            f"No correlations meet criteria (minimum absolute correlation to plot = {abs_correlation_threshold})."
            f"\nTry lowering abs_correlation_threshold or setting only_significant=False."
        )

    nodes_df = pd.DataFrame({"index": range(len(variables)), "name": variables})

    if title is None:
        title = f"{method.capitalize()} Correlation Chord Diagram "
        if correction_method != "none":
            title += f"({correction_method}, alpha={alpha})"

    chord = hv.Chord((edges_df, hv.Dataset(nodes_df, "index")))
    chord = chord.opts(
        width=width,
        height=height,
        node_color="index",
        edge_color="correlation",
        colorbar=True,
        labels="name",
        node_size=15,
        clim=(-1, 1),
        title=title,
        cmap=cmap,
    )
    return chord