# Source code for ehrapy.preprocessing._highly_variable_features

from __future__ import annotations

import pandas as pd
import scanpy as sc
from anndata import AnnData


def highly_variable_features(
    adata: AnnData,
    layer: str | None = None,
    top_features_percentage: float = 0.2,
    span: float | None = 0.3,
    n_bins: int = 20,
    subset: bool = False,
    inplace: bool = True,
    check_values: bool = True,
) -> pd.DataFrame | None:
    """Annotate highly variable features.

    Thin wrapper around :func:`scanpy.pp.highly_variable_genes` that always uses the
    ``"seurat_v3"`` flavor and expresses the number of features to select as a
    fraction of all features instead of an absolute count.

    Expects count data. A normalized variance for each feature is computed. First, the data
    are standardized (i.e., z-score normalization per feature) with a regularized
    standard deviation. Next, the normalized variance is computed as the variance
    of each feature after the transformation. Features are ranked by the normalized variance.

    Args:
        adata: The annotated data matrix of shape `n_obs` × `n_vars`.
        layer: If provided, use `adata.layers[layer]` for expression values instead of `adata.X`.
               Defaults to None.
        top_features_percentage: Fraction (not a 0-100 percentage) of all features to mark as
                                 highly variable, e.g. `0.2` selects the top 20 %. The resulting
                                 count is `int(top_features_percentage * n_vars)`, i.e. it is
                                 truncated towards zero. Must not be negative. Defaults to 0.2.
        span: The fraction of the data used when estimating the variance in the loess model fit.
              Defaults to 0.3.
        n_bins: Number of bins for binning, forwarded unchanged to scanpy. Defaults to 20.
                NOTE(review): scanpy's `seurat_v3` flavor does not appear to use binning, so this
                may have no effect here - confirm against the scanpy documentation.
        subset: Inplace subset to highly-variable features if `True` otherwise merely indicate
                highly variable features. Defaults to False.
        inplace: Whether to place calculated metrics in `.var` or return them. Defaults to True.
        check_values: Whether scanpy should check that the values in the selected layer are
                      integers (and warn otherwise). Defaults to True.

    Returns:
        Depending on `inplace`, either returns the calculated metrics as a
        :class:`~pandas.DataFrame` or updates `.var` and returns `None`.
        The following fields are produced:

        - **highly_variable** (bool): boolean indicator of highly-variable features
        - **means**: means per feature
        - **variances**: variance per feature
        - **variances_norm**: normalized variance per feature
        - **highly_variable_rank** (float): rank of the feature according to normalized variance

    Raises:
        ValueError: If `top_features_percentage` is negative.
    """
    # A negative fraction would turn into a negative `n_top_genes`, which is never a
    # meaningful request; fail loudly here instead of forwarding it to scanpy.
    if top_features_percentage < 0:
        raise ValueError(
            f"top_features_percentage must be non-negative, got {top_features_percentage!r}."
        )

    # scanpy expects an absolute count, so convert the fraction using the number of features.
    n_top_features = int(top_features_percentage * len(adata.var))

    # No batch key is passed, so all observations are treated as a single batch.
    return sc.pp.highly_variable_genes(
        adata=adata,
        layer=layer,
        n_top_genes=n_top_features,
        span=span,
        n_bins=n_bins,
        flavor="seurat_v3",
        subset=subset,
        inplace=inplace,
        check_values=check_values,
    )