Source code for ehrapy.tools.nlp._translators

from __future__ import annotations

from functools import wraps
from typing import Iterable, List, Union

import deepl
import numpy as np
from anndata import AnnData

try:
    from deep_translator import (
        GoogleTranslator,
        LibreTranslator,
        MicrosoftTranslator,
        MyMemoryTranslator,
        YandexTranslator,
    )
except ConnectionError:  # pragma: no cover
    print("[bold red]Unable to import GoogleTranslator. Do you have an internet connection?")
from deepl import Formality, GlossaryInfo, TextResult
from rich import print

from ehrapy.anndata.anndata_ext import _get_column_values, get_column_indices


[docs]class Translator: """Class providing an interface to all translation functions. Requires a flavour.""" def __init__( self, flavour: str = "deepl", source: str = "de", target: str = "en", token: str = None ) -> None: # pragma: no cover self.translator: DeepL | GoogleTranslate | LibreTranslate | MyMemoryTranslate | MicrosoftTranslate | YandexTranslate = ( None ) if flavour == "deepl": self.translator = DeepL(token) elif flavour == "googletranslate": self.translator = GoogleTranslate(source, target) # type: ignore elif flavour == "libre": self.translator = LibreTranslate(source, target) # type: ignore elif flavour == "mymemory": self.translator = MyMemoryTranslate(source, target) # type: ignore elif flavour == "microsoft": self.translator = MicrosoftTranslate(token, source, target) # type: ignore elif flavour == "yandex": self.translator = YandexTranslate(token, source, target) # type: ignore else: raise NotImplementedError(f"Flavour '{flavour}' is not supported.") self.flavour = flavour self.source_language = source self.target_language = target
[docs] def translate_text(self, text: str | Iterable, target_language: str = None) -> str | list[str]: # pragma: no cover """Translates the provided text into the target language. Args: text: The text to translate target_language: The target language to translate the Text into, e.g. EN-GB Returns: A :class:`~deepl.TextResult` object """ if target_language is None: target_language = self.target_language return self.translator.translate_text(text, target_language=target_language)
[docs] def translate_obs_column( self, adata: AnnData, columns=Union[str, List], translate_column_name: bool = False, inplace: bool = False, ) -> None: """Translates a single obs column and optionally replaces the original values Args: adata: :class:`~anndata.AnnData` object containing the obs column to translate target_language: The target language to translate into (default: EN-US) columns: The columns to translate. Can be either a single column (str) or a list of columns translate_column_name: Whether to translate the column name itself inplace: Whether to replace the obs values or add a new obs column """ if isinstance(columns, str): # pragma: no cover columns = [columns] translate_text = self.translate_text for column in columns: # as of Pandas 1.1.0 the default for new string column is still 'object' if adata.obs[column].dtype != str and adata.obs[column].dtype != object: # pragma: no cover raise ValueError("Attempted to translate column {column} which does not contain only strings.") target_column = column if translate_column_name: # TODO This requires a test target_column = translate_text(column) if not inplace: # pragma: no cover target_column = f"{target_column}_{self.target_language}" adata.obs[target_column] = adata.obs[column].apply(translate_text)
[docs] def translate_var_column( self, adata: AnnData, columns=Union[str, List], translate_column_name: bool = False, inplace: bool = False, ) -> None: """Translates a single var column and optionally replaces the original values Args: adata: :class:`~anndata.AnnData` object containing the obs column to translate target_language: The target language to translate into (default: EN-US) columns: The columns to translate. Can be either a single column (str) or a list of columns translate_column_name: Whether to translate the column name itself inplace: Whether to replace the obs values or add a new obs column """ if isinstance(columns, str): # pragma: no cover columns = [columns] translate_text = self.translate_text for column in columns: # as of Pandas 1.1.0 the default for new string column is still 'object' if adata.var[column].dtype != str and adata.var[column].dtype != object: # pragma: no cover raise ValueError("Attempted to translate column {column} which does not contain only strings.") target_column = column if translate_column_name: # TODO this requires a test target_column = translate_text(column) if not inplace: # pragma: no cover target_column = f"{target_column}_{self.target_language}" adata.var[target_column] = adata.var[column].apply(translate_text)
[docs] def translate_X_column( self, adata: AnnData, columns=Union[str, List], translate_column_name: bool = False, ) -> None: """Translates a X column into the target language in place. Note that the translation of a column in X is **always** in place. Args: adata: :class:`~anndata.AnnData` object containing the var column to translate target_language: The target language to translate into (default: EN-US) columns: The columns to translate. Can be either a single column (str) or a list of columns translate_column_name: Whether to translate the column name itself (only translates var_names, not var) """ if isinstance(columns, str): # pragma: no cover columns = [columns] translate_text = self.translate_text indices = get_column_indices(adata, columns) for column, index in zip(columns, indices): column_values = _get_column_values(adata, index) if column_values.dtype != str and column_values.dtype != object: # pragma: no cover raise ValueError("Attempted to translate column {column} which does not only contain strings.") if translate_column_name: # TODO This requires a test translated_column_name = translate_text(column) index_values = adata.var_names.tolist() index_values[index] = translated_column_name adata.var_names = index_values translated_column_values: str | list[str] = translate_text(column_values) adata.X[:, index] = translated_column_values
class DeepL: """Implementation of the DeepL translator""" def __init__(self, authentication_key: str): # pragma: no cover self.translator = deepl.Translator(authentication_key) def _check_usage(function): # noqa # pragma: no cover """Checks the usage limit of the DeepL Account. Prints a warning if the DeepL usage limit is exceeded. Args: function: The function to actually call """ @wraps(function) def wrapper(self, *args, **kwargs) -> None: usage = self.translator.get_usage() if usage.any_limit_exceeded: print("[bold red]DeepL limit exceeded. Please increase your quota") else: if (usage.character.count / usage.character.limit) > 0.9: print( "[bold yellow]Reached 90% of the character translation limit. " "Ensure that you have enough quota." ) print(f"[bold yellow]{self.translator.get_usage}") elif usage.document.limit is not None and (usage.document.count / usage.document.limit) > 0.9: print( "[bold yellow]Reached 90% of the document translation limit. " "Ensure that you have enough quota." ) print(f"[bold yellow]Current usage: {usage.document.count}") elif ( usage.team_document.limit is not None and (usage.team_document.count / usage.team_document.limit) > 0.9 ): print( "[bold yellow]Reached 90% of the team document translation limit " "Ensure that you have enough quota" ) print(f"[bold yellow]Current usage: {usage.team_document.count}") return function(self, *args, **kwargs) # type: ignore return wrapper # @_check_usage def authenticate(self, authentication_key: str) -> None: # pragma: no cover """Authenticates the DeepL user Args: authentication_key: DeepL authentication key """ self.translator = deepl.Translator(authentication_key) def print_source_languages(self) -> None: # pragma: no cover """prints all possible source languages to translate from Example: "DE (German)" """ for language in self.translator.get_source_languages(): print(f"{language.code} ({language.name})") def print_target_languages(self) -> None: # pragma: no cover """Prints all possible target languages to translate to""" for language in self.translator.get_target_languages(): if language.supports_formality: print(f"{language.code} ({language.name}) supports formality") else: print(f"{language.code} ({language.name})") # @_check_usage def translate_text(self, text: str | Iterable, target_language: str) -> list[str] | str: """Translates the provided text into the target language Args: text: The text to translate target_language: The target language to translate the Text into, e.g. EN-GB Returns: A :class:`~deepl.TextResult` object """ if isinstance(text, List) or isinstance(text, np.ndarray): return [ self.translator.translate_text(translation, target_lang=target_language).text for translation in text ] return self.translator.translate_text(text, target_lang=target_language).text # @_check_usage # pragma: no cover def translate_document( self, input_file_path: str, output_path: str, target_language: str, formality: str = Formality.DEFAULT ) -> None: """Translate a complete document into the target language Args: input_file_path: File path to the document to translate output_path: Output path to write the translation to target_language: Target language to translate the document into formality: Desired formality for translation, as Formality enum, "less" or "more". """ self.translator.translate_document_from_filepath( input_file_path, output_path, target_lang=target_language, formality=formality ) # @_check_usage # pragma: no cover def create_glossary( self, glossary_name: str, source_language: str, target_language: str, entries: dict[str, str] ) -> GlossaryInfo: """Creates a DeepL Glossary to translate with. A Glossary may help ensuring that specific words get translated into specific translations Args: glossary_name: Name of the Glossary source_language: The source language of the Glossary target_language: The target language of the Glossary entries: A Dictionary of Glossary entries Returns: A :class:`~deepl.GlossaryInfo` object """ return self.translator.create_glossary(glossary_name, source_language, target_language, entries) # @_check_usage def translate_with_glossary(self, text: str | list, glossary: GlossaryInfo) -> TextResult | list[TextResult]: """Translates text with a provided Glossary Args: text: Text to translate glossary: A :class:`~deepl.GlossaryInfo` object Returns: A :class:`~deepl.TextResult` object """ return self.translator.translate_text_with_glossary(text, glossary) class GoogleTranslate: def __init__(self, source="auto", target="en"): self.translator = GoogleTranslator(source, target) def print_source_languages(self) -> None: # pragma: no cover """prints all possible source languages to translate from Example: "DE (German)" """ for code, language in self.translator.get_supported_languages(as_dict=True).items(): print(f"{code} ({language})") def print_target_languages(self) -> None: # pragma: no cover """Prints all possible target languages to translate to""" for code, language in self.translator.get_supported_languages(as_dict=True).items(): print(f"{code} ({language})") def translate_text(self, text: str | Iterable, target_language: str) -> str | list[str]: """Translates the provided text into the target language Args: text: The text to translate target_language: The target language to translate the Text into, e.g. EN-GB Returns: The translated text. """ if isinstance(text, List) or isinstance(text, np.ndarray): return [self.translator.translate(word, target_lang=target_language) for word in text] return self.translator.translate(text, target_lang=target_language) class LibreTranslate: def __init__(self, source="auto", target="en"): self.translator = LibreTranslator(source, target) def print_source_languages(self) -> None: # pragma: no cover """prints all possible source languages to translate from Example: "DE (German)" """ for code, language in self.translator.get_supported_languages(as_dict=True).items(): print(f"{code} ({language})") def print_target_languages(self) -> None: # pragma: no cover """Prints all possible target languages to translate to""" for code, language in self.translator.get_supported_languages(as_dict=True).items(): print(f"{code} ({language})") def translate_text(self, text: str | Iterable, target_language: str) -> str | list[str]: """Translates the provided text into the target language Args: text: The text to translate target_language: The target language to translate the Text into, e.g. EN-GB Returns: The translated text. """ if isinstance(text, List) or isinstance(text, np.ndarray): return [self.translator.translate(word, target_lang=target_language) for word in text] return self.translator.translate(text, target_lang=target_language) class MyMemoryTranslate: def __init__(self, source="auto", target="en"): self.translator = MyMemoryTranslator(source, target) def print_source_languages(self) -> None: # pragma: no cover """prints all possible source languages to translate from Example: "DE (German)" """ for code, language in self.translator.get_supported_languages(as_dict=True).items(): print(f"{code} ({language})") def print_target_languages(self) -> None: # pragma: no cover """Prints all possible target languages to translate to""" for code, language in self.translator.get_supported_languages(as_dict=True).items(): print(f"{code} ({language})") def translate_text(self, text: str | Iterable, target_language: str) -> str | list[str]: """Translates the provided text into the target language Args: text: The text to translate target_language: The target language to translate the Text into, e.g. EN-GB Returns: The translated text. """ if isinstance(text, List) or isinstance(text, np.ndarray): return [self.translator.translate(word, target_lang=target_language) for word in text] return self.translator.translate(text, target_lang=target_language) class MicrosoftTranslate: def __init__(self, authentication_key, source="auto", target="en"): self.translator = MicrosoftTranslator(api_key=authentication_key, source=source, target=target) def print_source_languages(self) -> None: # pragma: no cover """prints all possible source languages to translate from Example: "DE (German)" """ for code, language in self.translator.get_supported_languages(as_dict=True).items(): print(f"{code} ({language})") def print_target_languages(self) -> None: # pragma: no cover """Prints all possible target languages to translate to""" for code, language in self.translator.get_supported_languages(as_dict=True).items(): print(f"{code} ({language})") def translate_text(self, text: str | Iterable, target_language: str) -> str | list[str]: """Translates the provided text into the target language Args: text: The text to translate target_language: The target language to translate the Text into, e.g. EN-GB Returns: The translated text. """ if isinstance(text, List) or isinstance(text, np.ndarray): return [self.translator.translate(word, target_lang=target_language) for word in text] return self.translator.translate(text, target_lang=target_language) class YandexTranslate: def __init__(self, authentication_key, source="auto", target="en"): # pragma: no cover self.translator = YandexTranslator(api_key=authentication_key, source=source, target=target) def print_source_languages(self) -> None: # pragma: no cover """prints all possible source languages to translate from Example: "DE (German)" """ for code, language in self.translator.get_supported_languages(as_dict=True).items(): print(f"{code} ({language})") def print_target_languages(self) -> None: # pragma: no cover """Prints all possible target languages to translate to""" for code, language in self.translator.get_supported_languages(as_dict=True).items(): print(f"{code} ({language})") def translate_text(self, text: str | Iterable, target_language: str) -> str | list[str]: """Translates the provided text into the target language Args: text: The text to translate target_language: The target language to translate the Text into, e.g. EN-GB Returns: The translated text. """ if isinstance(text, List) or isinstance(text, np.ndarray): return [self.translator.translate(word, target_lang=target_language) for word in text] return self.translator.translate(text, target_lang=target_language)