def _is_consecutive_integer_coding(values):
    """Return True if the unique values are exactly 0..n-1 or 1..n with no gaps.

    ``values`` must be non-empty and free of missing values.
    """
    uniques = np.sort(values.unique())
    smallest = uniques[0]
    if smallest == 0:
        offset = 0
    elif smallest == 1:
        offset = 1
    else:
        return False
    # Element-wise equality with an integer range already implies that every
    # value is integer-valued, so no separate per-element integer test is needed.
    # (The previous `np.all(<generator>)` test never evaluated its elements.)
    return bool(np.all(uniques == np.arange(offset, offset + len(uniques))))


def _detect_feature_type(col, feature):
    """Classify a single feature column as date, categorical or continuous.

    Args:
        col: The column of values for the feature (missing values allowed).
        feature: Name of the feature, used only for error reporting.

    Returns:
        One of DATE_TAG, CATEGORICAL_TAG or CONTINUOUS_TAG.

    Raises:
        ValueError: If the column holds no non-missing values.
    """
    values = col.dropna()
    if values.empty:
        # Without this guard `idxmax()` below fails with an opaque ValueError.
        raise ValueError(
            f"Feature '{feature}' contains only missing values, so its feature type cannot be inferred."
        )

    # The most frequent Python type among the non-missing values decides the branch.
    majority_type = values.apply(type).value_counts().idxmax()

    if majority_type == pd.Timestamp:
        return DATE_TAG
    if majority_type not in [int, float, complex]:
        return CATEGORICAL_TAG
    # Guess categorical if the values are the integers 0..n-1 or 1..n with no gaps.
    if _is_consecutive_integer_coding(values):
        return CATEGORICAL_TAG
    return CONTINUOUS_TAG


def infer_feature_types(
    adata: AnnData,
    layer: str | None = None,
    output: Literal["tree", "dataframe"] | None = "tree",
):
    """Infer feature types from AnnData object.

    For each feature in adata.var_names, the method infers one of the following types:
    date, categorical, or numeric (DATE_TAG, CATEGORICAL_TAG, CONTINUOUS_TAG).
    The inferred types are stored in adata.var[FEATURE_TYPE_KEY].
    Please check the inferred types and adjust them if necessary.

    Be aware that not all features stored numerically are of numeric type, as categorical
    features might be stored in a numerically encoded format. For example, a feature with
    values [0, 1, 2] might be a categorical feature with three categories. This is accounted
    for in the method (gap-free integer codes starting at 0 or 1 are treated as categorical),
    but it is recommended to check the inferred types.

    Args:
        adata: :class:`~anndata.AnnData` object storing the EHR data.
        layer: The layer to use from the AnnData object. If None, the X layer is used.
        output: The output format. Choose between 'tree', 'dataframe', or None.
            If 'tree', the feature types will be printed to the console in a tree format.
            If 'dataframe', the adata.var[FEATURE_TYPE_KEY] column (a pandas Series indexed
            by feature name) will be returned. If None, nothing will be returned.
            Defaults to 'tree'.

    Returns:
        The adata.var[FEATURE_TYPE_KEY] column if ``output`` is 'dataframe', otherwise None.

    Raises:
        ValueError: If ``output`` is not recognized, or if a feature contains only missing values.
    """
    # Validate up front so that an invalid argument does not leave adata half-updated.
    if output is not None and output != "tree" and output != "dataframe":
        raise ValueError(f"Output format {output} not recognized. Choose between 'tree', 'dataframe', or None.")

    df = anndata_to_df(adata, layer=layer)
    feature_types = {feature: _detect_feature_type(df[feature], feature) for feature in adata.var_names}

    # Re-index by var_names so the stored column is aligned with adata.var.
    adata.var[FEATURE_TYPE_KEY] = pd.Series(feature_types)[adata.var_names]

    logger.info(
        f"Stored feature types in adata.var['{FEATURE_TYPE_KEY}']."
        f" Please verify and adjust if necessary using adata.var['{FEATURE_TYPE_KEY}']['feature1']='corrected_type'."
    )

    if output == "tree":
        feature_type_overview(adata)
    elif output == "dataframe":
        return adata.var[FEATURE_TYPE_KEY]
def check_feature_types(func):
    """Decorator that validates the feature types of the AnnData passed as first argument.

    Before calling ``func``, the wrapper checks that adata.var contains the
    FEATURE_TYPE_KEY column and that every entry is one of CATEGORICAL_TAG,
    CONTINUOUS_TAG or DATE_TAG.

    Args:
        func: A callable whose first positional argument is an AnnData object.

    Returns:
        The wrapped callable.

    Raises:
        ValueError: (when the wrapped callable is invoked) if the feature types are
            missing from adata.var or contain a value outside the allowed tags.
    """

    @wraps(func)
    def wrapper(adata, *args, **kwargs):
        if FEATURE_TYPE_KEY not in adata.var.keys():
            raise ValueError("Feature types are not specified in adata.var. Please run `infer_feature_types` first.")

        # The membership check used to be computed and then discarded, so invalid
        # feature types were never rejected; act on the result here.
        is_valid = adata.var[FEATURE_TYPE_KEY].isin([CATEGORICAL_TAG, CONTINUOUS_TAG, DATE_TAG])
        if not is_valid.all():
            invalid_features = is_valid[~is_valid].index.tolist()
            raise ValueError(
                f"Invalid feature types found in adata.var['{FEATURE_TYPE_KEY}'] for features {invalid_features}."
                f" Allowed feature types are '{CATEGORICAL_TAG}', '{CONTINUOUS_TAG}' and '{DATE_TAG}'."
            )

        return func(adata, *args, **kwargs)

    return wrapper


@check_feature_types
def feature_type_overview(adata: AnnData):
    """Print an overview of the feature types in the AnnData object.

    The overview is a tree with one branch each for date, numerical and categorical
    features (sorted by name); categorical features are annotated with their number
    of distinct values.

    Args:
        adata: AnnData object whose adata.var holds the FEATURE_TYPE_KEY column.
    """
    tree = Tree(
        f"[b] Detected feature types for AnnData object with {len(adata.obs_names)} obs and {len(adata.var_names)} vars",
        guide_style="underline2",
    )
    types = adata.var[FEATURE_TYPE_KEY]

    date_branch = tree.add("📅[b] Date features")
    for name in sorted(adata.var_names[types == DATE_TAG]):
        date_branch.add(name)

    numeric_branch = tree.add("📐[b] Numerical features")
    for name in sorted(adata.var_names[types == CONTINUOUS_TAG]):
        numeric_branch.add(name)

    categorical_branch = tree.add("🗂️[b] Categorical features")
    cat_features = adata.var_names[types == CATEGORICAL_TAG]
    # Only materialize the data when there is something to count.
    if len(cat_features) > 0:
        df = anndata_to_df(adata[:, cat_features])
        for name in sorted(cat_features):
            categorical_branch.add(f"{name} ({df.loc[:, name].nunique()} categories)")

    print(tree)