# Source code for ehrapy.preprocessing._highly_variable_features
from __future__ import annotations
from typing import TYPE_CHECKING
import scanpy as sc
from ehrapy._compat import function_2D_only, use_ehrdata
if TYPE_CHECKING:
import pandas as pd
from anndata import AnnData
from ehrdata import EHRData
# [docs]
@use_ehrdata(deprecated_after="1.0.0")
@function_2D_only()
def highly_variable_features(
    edata: EHRData | AnnData,
    *,
    layer: str | None = None,
    top_features_percentage: float = 0.2,
    span: float | None = 0.3,
    n_bins: int = 20,
    subset: bool = False,
    inplace: bool = True,
    check_values: bool = True,
) -> pd.DataFrame | None:
    """Annotate highly variable features.

    Thin wrapper around :func:`scanpy.pp.highly_variable_genes` called with
    ``flavor="seurat_v3"``.

    Expects count data. A normalized variance for each feature is computed. First, the data
    are standardized (i.e., z-score normalization per feature) with a regularized
    standard deviation. Next, the normalized variance is computed as the variance
    of each feature after the transformation. Features are ranked by the normalized variance.

    Args:
        edata: Central data object.
        layer: If provided, use `edata.layers[layer]` for the feature values instead of `edata.X`.
        top_features_percentage: Fraction of features to flag as highly variable, given as a
            number between 0 and 1 (e.g. `0.2` for 20 %). The number of selected features is
            `int(top_features_percentage * n_features)`, i.e. it is rounded down.
        span: The fraction of the data used when estimating the variance in the loess model fit.
        n_bins: Number of bins for binning. Normalization is done with respect to each bin.
            If just a single observation falls into a bin, the normalized dispersion is artificially set to 1.
            You'll be informed about this if you set `settings.verbosity = 4`.
        subset: Inplace subset to highly-variable features if `True` otherwise merely indicate highly variable features.
        inplace: Whether to place calculated metrics in `.var` or return them.
        check_values: Whether to check that the values in the selected layer are integers;
            a warning is issued if they are not.

    Returns:
        Depending on `inplace` returns calculated metrics (:class:`~pandas.DataFrame`) or
        updates `.var` with the following fields

        **highly_variable**
            boolean indicator of highly-variable features
        **means**
            means per feature
        **variances**
            variance per feature
        **variances_norm**
            normalized variance per feature
        **highly_variable_rank**
            rank of the feature according to normalized variance

    Raises:
        ValueError: If `top_features_percentage` is not within `[0, 1]`.
    """
    # Reject out-of-range values early: a value such as `20` (meant as "20 %") or a
    # negative number would otherwise be turned into a meaningless feature count and
    # forwarded to scanpy without any error. The negated chained comparison also
    # rejects NaN.
    if not 0 <= top_features_percentage <= 1:
        raise ValueError(
            "top_features_percentage must be a fraction between 0 and 1, "
            f"got {top_features_percentage!r}."
        )

    # int() truncates, so the selection never exceeds the requested fraction.
    n_top_features = int(top_features_percentage * len(edata.var))

    return sc.pp.highly_variable_genes(
        adata=edata,
        layer=layer,
        n_top_genes=n_top_features,
        span=span,
        n_bins=n_bins,
        flavor="seurat_v3",
        subset=subset,
        inplace=inplace,
        check_values=check_values,
    )