# Source code for ehrapy.preprocessing._outliers

from __future__ import annotations

import numpy as np
import pandas as pd
import scipy.stats.mstats
from anndata import AnnData


def winsorize(
    adata: AnnData,
    vars: str | list[str] | set[str] | None = None,
    obs_cols: str | list[str] | set[str] | None = None,
    limits: list[float] | None = None,
    copy: bool = False,
    **kwargs,
) -> AnnData | None:
    """Returns a Winsorized version of the input array.

    The implementation is based on
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.winsorize.html

    Each selected column is winsorized independently with ``nan_policy="omit"``.

    Args:
        adata: AnnData object to winsorize.
        vars: The features (entries of ``adata.var_names``) to winsorize.
            A single name may be passed as a plain string.
        obs_cols: Columns in obs with features to winsorize.
            A single name may be passed as a plain string.
        limits: Pair of floats between 0. and 1. forwarded unchanged to
            :func:`scipy.stats.mstats.winsorize`. Defaults to ``[0.01, 0.99]``.
            NOTE(review): scipy documents each entry as the fraction to cut from
            *that side* of the array, so the default would cut 99% from the upper
            end - confirm whether ``[0.01, 0.01]`` was intended.
        copy: Whether to work on (and return) a copy of ``adata``.
        **kwargs: Keywords arguments get passed to scipy.stats.mstats.winsorize.
            ``limits`` and ``nan_policy`` are already supplied by this function.

    Returns:
        Winsorized AnnData object if copy is True, otherwise None.

    Raises:
        ValueError: If a requested name is missing from ``var_names`` / ``obs``
            (raised by ``_validate_outlier_input``).

    Examples:
        >>> import ehrapy as ep
        >>> adata = ep.data.mimic_2(encoded=True)
        >>> ep.pp.winsorize(adata, ['bmi'])
    """
    # A bare string names exactly one column; wrap it so that it is never
    # iterated (or converted to a set) character by character.
    if isinstance(vars, str):
        vars = [vars]
    if isinstance(obs_cols, str):
        obs_cols = [obs_cols]

    # Keep the normalised collections returned by the validator, the same way
    # clip_quantile does, instead of discarding them.
    obs_cols, vars = _validate_outlier_input(adata, obs_cols, vars)  # type: ignore

    if copy:  # pragma: no cover
        adata = adata.copy()

    if limits is None:
        limits = [0.01, 0.99]

    if vars:
        for var in vars:
            # NOTE(review): the result is assigned through a column view; this
            # relies on anndata writing view assignments of X back to the parent.
            adata[:, var].X = scipy.stats.mstats.winsorize(
                np.array(adata[:, var].X), limits=limits, nan_policy="omit", **kwargs
            )

    if obs_cols:
        for col in obs_cols:
            winsorized_array = scipy.stats.mstats.winsorize(adata.obs[col], limits=limits, nan_policy="omit", **kwargs)
            # Assign raw values so the existing obs index is kept as is.
            adata.obs[col] = pd.Series(winsorized_array).values

    if copy:
        return adata
    return None
def clip_quantile(
    adata: AnnData,
    limits: list[float],
    vars: str | list[str] | set[str] | None = None,
    obs_cols: str | list[str] | set[str] | None = None,
    copy: bool = False,
) -> AnnData | None:
    """Clips (limits) features.

    Given an interval, values outside the interval are clipped to the interval edges.

    The implementation is based on https://numpy.org/doc/stable/reference/generated/numpy.clip.html

    ``limits[0]`` and ``limits[1]`` are handed to :func:`numpy.clip` as the lower
    and upper bound, i.e. they are absolute values and not quantiles.

    Args:
        adata: The AnnData object.
        limits: Interval, values outside of which are clipped to the interval edges.
        vars: Columns in var with features to clip.
            A single name may be passed as a plain string.
        obs_cols: Columns in obs with features to clip.
            A single name may be passed as a plain string.
        copy: Whether to work on (and return) a copy of AnnData or not.

    Returns:
        A copy of original AnnData object with clipped features if copy is True,
        otherwise None.

    Raises:
        ValueError: If a requested name is missing from ``var_names`` / ``obs``
            (raised by ``_validate_outlier_input``).

    Examples:
        >>> import ehrapy as ep
        >>> adata = ep.data.mimic_2(encoded=True)
        >>> ep.pp.clip_quantile(adata, limits=[0, 50], vars=['bmi'])
    """
    # A bare string names exactly one column; wrap it so that it is never
    # iterated (or converted to a set) character by character.
    if isinstance(vars, str):
        vars = [vars]
    if isinstance(obs_cols, str):
        obs_cols = [obs_cols]

    obs_cols, vars = _validate_outlier_input(adata, obs_cols, vars)  # type: ignore

    # Copy BEFORE clipping: copying afterwards (as was done previously) clipped
    # the caller's object as well, which defeats the purpose of copy=True.
    if copy:  # pragma: no cover
        adata = adata.copy()

    if vars:
        for var in vars:
            # NOTE(review): the result is assigned through a column view; this
            # relies on anndata writing view assignments of X back to the parent.
            adata[:, var].X = np.clip(adata[:, var].X, limits[0], limits[1])

    if obs_cols:
        for col in obs_cols:
            obs_array = adata.obs[col].to_numpy()
            clipped_array = np.clip(obs_array, limits[0], limits[1])
            # Assign raw values so the existing obs index is kept as is.
            adata.obs[col] = pd.Series(clipped_array).values

    if copy:
        return adata
    return None
def _validate_outlier_input(
    adata,
    obs_cols: str | list[str] | set[str] | None,
    vars: str | list[str] | set[str] | None,
) -> tuple[set[str] | None, set[str] | None]:
    """Validates the obs/var columns for outlier preprocessing.

    Both selections are normalised to sets of names and checked against
    ``adata.obs.columns`` and ``adata.var_names`` respectively.

    Args:
        adata: AnnData object
        obs_cols: A single obs column name, an iterable of names, or None
        vars: A single var name, an iterable of names, or None

    Returns:
        A tuple ``(obs_cols, vars)`` where each entry is a set of names, or None
        if the corresponding argument was None.

    Raises:
        ValueError: If any requested name is not present in ``adata.var_names``
            or ``adata.obs.columns``.
    """

    def _as_name_set(names):
        # None means "nothing selected" and is passed through untouched.
        if names is None:
            return None
        # set("bmi") would yield {"b", "m", "i"}; a string is ONE name.
        if isinstance(names, str):
            return {names}
        return set(names)

    vars = _as_name_set(vars)
    obs_cols = _as_name_set(obs_cols)

    if vars is not None:
        missing = vars - set(adata.var_names)
        if missing:
            # Sorted so that the message is stable across runs.
            raise ValueError(f"Columns {','.join(sorted(missing))} are not in var_names.")

    if obs_cols is not None:
        missing = obs_cols - set(adata.obs.columns.values)
        if missing:
            raise ValueError(f"Columns {','.join(sorted(missing))} are not in obs.")

    return obs_cols, vars