Skip to content

Imputer

Bases: TableTransformer

Replace missing values using the given strategy.

Parameters:

Name Type Description Default
strategy Strategy

How to replace missing values.

required
value_to_replace float | str | None

The value that should be replaced.

None

Examples:

>>> from safeds.data.tabular.containers import Column, Table
>>> from safeds.data.tabular.transformation import Imputer
>>>
>>> table = Table.from_columns(
...     [
...         Column("a", [1, 3, None]),
...         Column("b", [None, 2, 3]),
...     ],
... )
>>> transformer = Imputer(Imputer.Strategy.Constant(0))
>>> transformed_table = transformer.fit_and_transform(table)
Source code in src/safeds/data/tabular/transformation/_imputer.py
class Imputer(TableTransformer):
    """
    Replace missing values using the given strategy.

    Parameters
    ----------
    strategy:
        How to replace missing values.
    value_to_replace:
        The value that should be replaced. If `None`, `pd.NA` is used as the marker for missing values.

    Examples
    --------
    >>> from safeds.data.tabular.containers import Column, Table
    >>> from safeds.data.tabular.transformation import Imputer
    >>>
    >>> table = Table.from_columns(
    ...     [
    ...         Column("a", [1, 3, None]),
    ...         Column("b", [None, 2, 3]),
    ...     ],
    ... )
    >>> transformer = Imputer(Imputer.Strategy.Constant(0))
    >>> transformed_table = transformer.fit_and_transform(table)
    """

    class Strategy(ABC):
        """Various strategies to replace missing values. Use the static methods to create instances of this class."""

        @abstractmethod
        def __eq__(self, other: object) -> bool:
            pass  # pragma: no cover

        @abstractmethod
        def __hash__(self) -> int:
            pass  # pragma: no cover

        @abstractmethod
        def _apply(self, imputer: sk_SimpleImputer) -> None:
            """
            Set the imputer strategy of the given imputer.

            Parameters
            ----------
            imputer:
                The imputer to augment.
            """

        @staticmethod
        def Constant(value: Any) -> Imputer.Strategy:  # noqa: N802
            """
            Replace missing values with the given constant value.

            Parameters
            ----------
            value:
                The value to replace missing values.
            """
            return _Constant(value)  # pragma: no cover

        @staticmethod
        def Mean() -> Imputer.Strategy:  # noqa: N802
            """Replace missing values with the mean of each column."""
            return _Mean()  # pragma: no cover

        @staticmethod
        def Median() -> Imputer.Strategy:  # noqa: N802
            """Replace missing values with the median of each column."""
            return _Median()  # pragma: no cover

        @staticmethod
        def Mode() -> Imputer.Strategy:  # noqa: N802
            """Replace missing values with the mode of each column."""
            return _Mode()  # pragma: no cover

    def __init__(self, strategy: Imputer.Strategy, *, value_to_replace: float | str | None = None):
        # `None` means "use the default missing-value marker", which is `pd.NA`.
        if value_to_replace is None:
            value_to_replace = pd.NA

        self._strategy = strategy
        self._value_to_replace = value_to_replace

        # Both attributes stay `None` until `fit` produces a fitted copy.
        self._wrapped_transformer: sk_SimpleImputer | None = None
        self._column_names: list[str] | None = None

    @property
    def strategy(self) -> Imputer.Strategy:
        """The strategy used to replace missing values."""
        return self._strategy

    @property
    def value_to_replace(self) -> Any:
        """The value that should be replaced."""
        return self._value_to_replace

    @property
    def is_fitted(self) -> bool:
        """Whether the transformer is fitted."""
        return self._wrapped_transformer is not None

    def fit(self, table: Table, column_names: list[str] | None) -> Imputer:
        """
        Learn a transformation for a set of columns in a table.

        This transformer is not modified. A new, fitted `Imputer` with the same strategy and the same
        `value_to_replace` is returned instead.

        Parameters
        ----------
        table:
            The table used to fit the transformer.
        column_names:
            The list of columns from the table used to fit the transformer. If `None`, all columns are used.

        Returns
        -------
        fitted_transformer:
            The fitted transformer.

        Raises
        ------
        UnknownColumnNameError
            If column_names contains a column name that is missing in the table.
        ValueError
            If the table contains 0 rows.
        NonNumericColumnError
            If the strategy is set to either Mean or Median and the specified columns of the table contain
            non-numerical data.

        Warns
        -----
        UserWarning
            If the strategy is Mode and at least one of the columns has multiple most frequent values.
        """
        from sklearn.impute import SimpleImputer as sk_SimpleImputer

        if column_names is None:
            column_names = table.column_names
        else:
            missing_columns = sorted(set(column_names) - set(table.column_names))
            if len(missing_columns) > 0:
                raise UnknownColumnNameError(missing_columns)

        if table.number_of_rows == 0:
            raise ValueError("The Imputer cannot be fitted because the table contains 0 rows")

        if isinstance(self._strategy, _Mean | _Median):
            # Mean and median need numeric data. Build the sub-tables once and reuse them for the check
            # and for the error message.
            selected_table = table.keep_only_columns(column_names)
            numeric_table = selected_table.remove_columns_with_non_numerical_values()
            if numeric_table.number_of_columns < len(column_names):
                raise NonNumericColumnError(
                    str(sorted(set(selected_table.column_names) - set(numeric_table.column_names))),
                )

        if isinstance(self._strategy, _Mode):
            # Collect the columns whose mode is ambiguous so the user can be told about all of them at once.
            multiple_most_frequent: dict[str, Any] = {}
            for name in column_names:
                modes = table.get_column(name).mode()
                if len(modes) > 1:
                    multiple_most_frequent[name] = modes
            if len(multiple_most_frequent) > 0:
                warnings.warn(
                    "There are multiple most frequent values in a column given to the Imputer.\nThe lowest values"
                    " are being chosen in this cases. The following columns have multiple most frequent"
                    f" values:\n{multiple_most_frequent}",
                    UserWarning,
                    stacklevel=2,
                )

        wrapped_transformer = sk_SimpleImputer()
        self._strategy._apply(wrapped_transformer)
        wrapped_transformer.missing_values = self._value_to_replace
        wrapped_transformer.fit(table._data[column_names])

        # Carry `value_to_replace` over to the fitted copy; otherwise its `value_to_replace` property would
        # silently fall back to the default even though the wrapped transformer uses the configured value.
        result = Imputer(self._strategy, value_to_replace=self._value_to_replace)
        result._wrapped_transformer = wrapped_transformer
        result._column_names = column_names

        return result

    def transform(self, table: Table) -> Table:
        """
        Apply the learned transformation to a table.

        The table is not modified.

        Parameters
        ----------
        table:
            The table to which the learned transformation is applied.

        Returns
        -------
        transformed_table:
            The transformed table.

        Raises
        ------
        TransformerNotFittedError
            If the transformer has not been fitted yet.
        UnknownColumnNameError
            If the input table does not contain all columns used to fit the transformer.
        ValueError
            If the table contains 0 rows.
        """
        import pandas as pd

        # Transformer has not been fitted yet
        if self._wrapped_transformer is None or self._column_names is None:
            raise TransformerNotFittedError

        # Input table does not contain all columns used to fit the transformer
        missing_columns = sorted(set(self._column_names) - set(table.column_names))
        if len(missing_columns) > 0:
            raise UnknownColumnNameError(missing_columns)

        if table.number_of_rows == 0:
            raise ValueError("The Imputer cannot transform the table because it contains 0 rows")

        # Work on a re-indexed frame so the freshly built frame of imputed values (default 0..n-1 index)
        # lines up row by row on assignment.
        data = table._data.reset_index(drop=True)
        data[self._column_names] = pd.DataFrame(
            self._wrapped_transformer.transform(data[self._column_names]),
            columns=self._column_names,
        )
        return Table._from_pandas_dataframe(data, table.schema)

    def get_names_of_added_columns(self) -> list[str]:
        """
        Get the names of all new columns that have been added by the Imputer.

        Returns
        -------
        added_columns:
            A list of names of the added columns, ordered as they will appear in the table.

        Raises
        ------
        TransformerNotFittedError
            If the transformer has not been fitted yet.
        """
        if not self.is_fitted:
            raise TransformerNotFittedError
        return []

    def get_names_of_changed_columns(self) -> list[str]:
        """
        Get the names of all columns that may have been changed by the Imputer.

        Returns
        -------
        changed_columns:
            The list of (potentially) changed column names, as passed to fit.

        Raises
        ------
        TransformerNotFittedError
            If the transformer has not been fitted yet.
        """
        if self._column_names is None:
            raise TransformerNotFittedError
        return self._column_names

    def get_names_of_removed_columns(self) -> list[str]:
        """
        Get the names of all columns that have been removed by the Imputer.

        Returns
        -------
        removed_columns:
            A list of names of the removed columns, ordered as they appear in the table the Imputer was fitted on.

        Raises
        ------
        TransformerNotFittedError
            If the transformer has not been fitted yet.
        """
        if not self.is_fitted:
            raise TransformerNotFittedError
        return []

is_fitted: bool property

Whether the transformer is fitted.

strategy: Imputer.Strategy property

The strategy used to replace missing values.

value_to_replace: Any property

The value that should be replaced.

Strategy

Bases: ABC

Various strategies to replace missing values. Use the static methods to create instances of this class.

Source code in src/safeds/data/tabular/transformation/_imputer.py
class Strategy(ABC):
    """
    Various strategies to replace missing values. Use the static methods to create instances of this class.

    Concrete strategies must implement `__eq__`, `__hash__` and `_apply`. The static factory methods
    `Constant`, `Mean`, `Median` and `Mode` each return an instance of one concrete strategy.
    """

    @abstractmethod
    def __eq__(self, other: object) -> bool:
        """Compare this strategy with another object. Must be implemented by subclasses."""
        pass  # pragma: no cover

    @abstractmethod
    def __hash__(self) -> int:
        """Return a hash of this strategy. Must be implemented by subclasses."""
        pass  # pragma: no cover

    @abstractmethod
    def _apply(self, imputer: sk_SimpleImputer) -> None:
        """
        Set the imputer strategy of the given imputer.

        Parameters
        ----------
        imputer:
            The imputer to augment.
        """

    @staticmethod
    def Constant(value: Any) -> Imputer.Strategy:  # noqa: N802
        """
        Replace missing values with the given constant value.

        Parameters
        ----------
        value:
            The value to replace missing values.

        Returns
        -------
        strategy:
            A `_Constant` strategy created from `value`.
        """
        return _Constant(value)  # pragma: no cover

    @staticmethod
    def Mean() -> Imputer.Strategy:  # noqa: N802
        """
        Replace missing values with the mean of each column.

        Returns
        -------
        strategy:
            A new `_Mean` strategy.
        """
        return _Mean()  # pragma: no cover

    @staticmethod
    def Median() -> Imputer.Strategy:  # noqa: N802
        """
        Replace missing values with the median of each column.

        Returns
        -------
        strategy:
            A new `_Median` strategy.
        """
        return _Median()  # pragma: no cover

    @staticmethod
    def Mode() -> Imputer.Strategy:  # noqa: N802
        """
        Replace missing values with the mode of each column.

        Returns
        -------
        strategy:
            A new `_Mode` strategy.
        """
        return _Mode()  # pragma: no cover

Constant(value) staticmethod

Replace missing values with the given constant value.

Parameters:

Name Type Description Default
value Any

The value to replace missing values.

required
Source code in src/safeds/data/tabular/transformation/_imputer.py
@staticmethod
def Constant(value: Any) -> Imputer.Strategy:  # noqa: N802
    """
    Replace missing values with the given constant value.

    Parameters
    ----------
    value:
        The value to replace missing values.

    Returns
    -------
    strategy:
        A `_Constant` strategy created from `value`.
    """
    return _Constant(value)  # pragma: no cover

Mean() staticmethod

Replace missing values with the mean of each column.

Source code in src/safeds/data/tabular/transformation/_imputer.py
@staticmethod
def Mean() -> Imputer.Strategy:  # noqa: N802
    """
    Replace missing values with the mean of each column.

    Returns
    -------
    strategy:
        A new `_Mean` strategy.
    """
    return _Mean()  # pragma: no cover

Median() staticmethod

Replace missing values with the median of each column.

Source code in src/safeds/data/tabular/transformation/_imputer.py
@staticmethod
def Median() -> Imputer.Strategy:  # noqa: N802
    """
    Replace missing values with the median of each column.

    Returns
    -------
    strategy:
        A new `_Median` strategy.
    """
    return _Median()  # pragma: no cover

Mode() staticmethod

Replace missing values with the mode of each column.

Source code in src/safeds/data/tabular/transformation/_imputer.py
@staticmethod
def Mode() -> Imputer.Strategy:  # noqa: N802
    """
    Replace missing values with the mode of each column.

    Returns
    -------
    strategy:
        A new `_Mode` strategy.
    """
    return _Mode()  # pragma: no cover

fit(table, column_names)

Learn a transformation for a set of columns in a table.

This transformer is not modified.

Parameters:

Name Type Description Default
table Table

The table used to fit the transformer.

required
column_names list[str] | None

The list of columns from the table used to fit the transformer. If None, all columns are used.

required

Returns:

Name Type Description
fitted_transformer Imputer

The fitted transformer.

Raises:

Type Description
UnknownColumnNameError

If column_names contains a column name that is missing in the table.

ValueError

If the table contains 0 rows.

NonNumericColumnError

If the strategy is set to either Mean or Median and the specified columns of the table contain non-numerical data.

Source code in src/safeds/data/tabular/transformation/_imputer.py
def fit(self, table: Table, column_names: list[str] | None) -> Imputer:
    """
    Learn a transformation for a set of columns in a table.

    This transformer is not modified. A new, fitted `Imputer` with the same strategy and the same
    `value_to_replace` is returned instead.

    Parameters
    ----------
    table:
        The table used to fit the transformer.
    column_names:
        The list of columns from the table used to fit the transformer. If `None`, all columns are used.

    Returns
    -------
    fitted_transformer:
        The fitted transformer.

    Raises
    ------
    UnknownColumnNameError
        If column_names contains a column name that is missing in the table.
    ValueError
        If the table contains 0 rows.
    NonNumericColumnError
        If the strategy is set to either Mean or Median and the specified columns of the table contain
        non-numerical data.

    Warns
    -----
    UserWarning
        If the strategy is Mode and at least one of the columns has multiple most frequent values.
    """
    from sklearn.impute import SimpleImputer as sk_SimpleImputer

    if column_names is None:
        column_names = table.column_names
    else:
        missing_columns = sorted(set(column_names) - set(table.column_names))
        if len(missing_columns) > 0:
            raise UnknownColumnNameError(missing_columns)

    if table.number_of_rows == 0:
        raise ValueError("The Imputer cannot be fitted because the table contains 0 rows")

    if isinstance(self._strategy, _Mean | _Median):
        # Mean and median need numeric data. Build the sub-tables once and reuse them for the check
        # and for the error message.
        selected_table = table.keep_only_columns(column_names)
        numeric_table = selected_table.remove_columns_with_non_numerical_values()
        if numeric_table.number_of_columns < len(column_names):
            raise NonNumericColumnError(
                str(sorted(set(selected_table.column_names) - set(numeric_table.column_names))),
            )

    if isinstance(self._strategy, _Mode):
        # Collect the columns whose mode is ambiguous so the user can be told about all of them at once.
        multiple_most_frequent = {}
        for name in column_names:
            modes = table.get_column(name).mode()
            if len(modes) > 1:
                multiple_most_frequent[name] = modes
        if len(multiple_most_frequent) > 0:
            warnings.warn(
                "There are multiple most frequent values in a column given to the Imputer.\nThe lowest values"
                " are being chosen in this cases. The following columns have multiple most frequent"
                f" values:\n{multiple_most_frequent}",
                UserWarning,
                stacklevel=2,
            )

    wrapped_transformer = sk_SimpleImputer()
    self._strategy._apply(wrapped_transformer)
    wrapped_transformer.missing_values = self._value_to_replace
    wrapped_transformer.fit(table._data[column_names])

    # Carry `value_to_replace` over to the fitted copy; otherwise its `value_to_replace` property would
    # silently fall back to the default even though the wrapped transformer uses the configured value.
    result = Imputer(self._strategy, value_to_replace=self._value_to_replace)
    result._wrapped_transformer = wrapped_transformer
    result._column_names = column_names

    return result

get_names_of_added_columns()

Get the names of all new columns that have been added by the Imputer.

Returns:

Name Type Description
added_columns list[str]

A list of names of the added columns, ordered as they will appear in the table.

Raises:

Type Description
TransformerNotFittedError

If the transformer has not been fitted yet.

Source code in src/safeds/data/tabular/transformation/_imputer.py
def get_names_of_added_columns(self) -> list[str]:
    """
    Report which columns the Imputer adds to a table.

    Returns
    -------
    added_columns:
        The names of the added columns in the order they will appear in the table. This is always an empty
        list for a fitted Imputer.

    Raises
    ------
    TransformerNotFittedError
        If the transformer has not been fitted yet.
    """
    if self.is_fitted:
        return []
    raise TransformerNotFittedError

get_names_of_changed_columns()

Get the names of all columns that may have been changed by the Imputer.

Returns:

Name Type Description
changed_columns list[str]

The list of (potentially) changed column names, as passed to fit.

Raises:

Type Description
TransformerNotFittedError

If the transformer has not been fitted yet.

Source code in src/safeds/data/tabular/transformation/_imputer.py
def get_names_of_changed_columns(self) -> list[str]:
    """
    Report which columns the Imputer may change.

    Returns
    -------
    changed_columns:
        The (potentially) changed column names, exactly as they were stored when the transformer was fitted.

    Raises
    ------
    TransformerNotFittedError
        If the transformer has not been fitted yet.
    """
    fitted_columns = self._column_names
    if fitted_columns is None:
        raise TransformerNotFittedError
    return fitted_columns

get_names_of_removed_columns()

Get the names of all columns that have been removed by the Imputer.

Returns:

Name Type Description
removed_columns list[str]

A list of names of the removed columns, ordered as they appear in the table the Imputer was fitted on.

Raises:

Type Description
TransformerNotFittedError

If the transformer has not been fitted yet.

Source code in src/safeds/data/tabular/transformation/_imputer.py
def get_names_of_removed_columns(self) -> list[str]:
    """
    Report which columns the Imputer removes from a table.

    Returns
    -------
    removed_columns:
        The names of the removed columns in the order they appear in the table the Imputer was fitted on.
        This is always an empty list for a fitted Imputer.

    Raises
    ------
    TransformerNotFittedError
        If the transformer has not been fitted yet.
    """
    if self.is_fitted:
        return []
    raise TransformerNotFittedError

transform(table)

Apply the learned transformation to a table.

The table is not modified.

Parameters:

Name Type Description Default
table Table

The table to which the learned transformation is applied.

required

Returns:

Name Type Description
transformed_table Table

The transformed table.

Raises:

Type Description
TransformerNotFittedError

If the transformer has not been fitted yet.

UnknownColumnNameError

If the input table does not contain all columns used to fit the transformer.

ValueError

If the table contains 0 rows.

Source code in src/safeds/data/tabular/transformation/_imputer.py
def transform(self, table: Table) -> Table:
    """
    Replace the missing values in a table using the fitted imputer.

    The input table is left untouched; the imputed values are written into a re-indexed copy of its data.

    Parameters
    ----------
    table:
        The table to which the learned transformation is applied.

    Returns
    -------
    transformed_table:
        The transformed table, built with the schema of the input table.

    Raises
    ------
    TransformerNotFittedError
        If the transformer has not been fitted yet.
    UnknownColumnNameError
        If the input table does not contain all columns used to fit the transformer.
    ValueError
        If the table contains 0 rows.
    """
    import pandas as pd

    fitted_imputer = self._wrapped_transformer
    fitted_columns = self._column_names

    # Guard: both attributes are only set once the transformer has been fitted.
    if fitted_imputer is None or fitted_columns is None:
        raise TransformerNotFittedError

    # Guard: every column seen during fitting must be present in the input table.
    absent_columns = sorted(set(fitted_columns) - set(table.column_names))
    if absent_columns:
        raise UnknownColumnNameError(absent_columns)

    # Guard: an empty table cannot be transformed.
    if table.number_of_rows == 0:
        raise ValueError("The Imputer cannot transform the table because it contains 0 rows")

    # A fresh 0..n-1 index lets the frame of imputed values align row by row on assignment.
    frame = table._data.reset_index(drop=True)
    imputed_values = fitted_imputer.transform(frame[fitted_columns])
    frame[fitted_columns] = pd.DataFrame(imputed_values, columns=fitted_columns)

    return Table._from_pandas_dataframe(frame, table.schema)