Skip to content

BaselineClassifier

Baseline Classifier.

Get a baseline by fitting multiple different models to the data and comparing the best metrics.

Parameters:

Name Type Description Default
extended_search bool

If set to true, an extended set of models will be used to fit the classifier. This might result in significantly higher runtime.

False

Methods:

Name Description
fit

Train the Classifier with given training data.

predict

Make a prediction for the given test data and calculate the best metrics.

Attributes:

Name Type Description
is_fitted bool

Whether the model is fitted.

Source code in src/safeds/ml/classical/classification/_baseline_classifier.py
class BaselineClassifier:
    """
    Baseline Classifier.

    Get a baseline by fitting multiple different models to the data and comparing the best metrics.

    Parameters
    ----------
    extended_search:
        If set to true, an extended set of models will be used to fit the classifier.
        This might result in significantly higher runtime.
    """

    def __init__(self, extended_search: bool = False):
        self._is_fitted = False

        # Unfitted candidate models; `fit` trains each of them on the same data.
        self._list_of_model_types = [
            AdaBoostClassifier(),
            DecisionTreeClassifier(),
            SupportVectorClassifier(),
            RandomForestClassifier(),
        ]
        if extended_search:
            self._list_of_model_types.append(GradientBoostingClassifier())  # pragma: no cover

        # Populated by `fit` (on the returned copy, never on the instance `fit` was called on).
        self._fitted_models: list[Classifier] = []
        self._feature_names: list[str] | None = None
        self._target_name: str = "none"

    def fit(self, train_data: TabularDataset) -> Self:
        """
        Train the Classifier with given training data.

        The original model is not modified. Fitting an already fitted classifier discards the previously fitted
        models of the returned copy, so the result only depends on the given training data.

        Parameters
        ----------
        train_data:
            The data the classifier should be trained on.

        Returns
        -------
        trained_classifier:
            The trained Classifier

        Raises
        ------
        DatasetMissesDataError
            If the given train_data contains no data.
        ColumnTypeError
            If one or more columns contain non-numeric values.
        """
        from concurrent.futures import ProcessPoolExecutor

        # Validate data
        train_data_as_table = train_data.to_table()
        if train_data_as_table.row_count == 0:
            raise DatasetMissesDataError
        _check_columns_are_numeric(train_data_as_table, train_data.features.add_columns(train_data.target).column_names)

        copied_model = copy.deepcopy(self)

        # One worker process per candidate model. Leaving the `with` block shuts the pool down, so no explicit
        # `shutdown()` call is needed afterwards.
        with ProcessPoolExecutor(
            max_workers=len(self._list_of_model_types),
            mp_context=mp.get_context("spawn"),
        ) as executor:
            futures = [executor.submit(_fit_single_model, model, train_data) for model in self._list_of_model_types]
            # Collect in submission order so the fitted models line up with `_list_of_model_types`.
            # `Future.result()` blocks until the task is done and re-raises any exception from the worker.
            fitted_models = [future.result() for future in futures]

        # Replace instead of append: the deep copy of an already fitted classifier still carries the models of the
        # previous fit, which must not leak into the new one.
        copied_model._fitted_models = fitted_models
        copied_model._is_fitted = True
        copied_model._feature_names = train_data.features.column_names
        copied_model._target_name = train_data.target.name
        return copied_model

    def predict(self, test_data: TabularDataset) -> dict[str, float]:
        """
        Make a prediction for the given test data and calculate the best metrics.

        Every fitted model predicts on the test data. For each metric, the highest value achieved by any model is
        reported, so the individual entries may stem from different models.

        The original Model is not modified.

        Parameters
        ----------
        test_data:
            The data the Classifier should predict.

        Returns
        -------
        best_metrics:
            A dictionary with the best metrics that were achieved. Its keys are "accuracy", "f1score", "precision",
            and "recall".

        Raises
        ------
        NotFittedError
            If the model has not been fitted yet.
        FeatureDataMismatchError
            If the features of the test data do not match with the features of the trained Classifier.
        DatasetMissesDataError
            If the given test_data contains no data.
        TargetDataMismatchError
            If the target column of the test data does not match the target column of the training data.
        ColumnTypeError
            If one or more columns contain non-numeric values.
        """
        from concurrent.futures import ProcessPoolExecutor

        from safeds.ml.metrics import ClassificationMetrics

        if not self._is_fitted:
            raise NotFittedError(kind="model")

        # Validate data
        if not self._feature_names == test_data.features.column_names:
            raise FeatureDataMismatchError
        if not self._target_name == test_data.target.name:
            raise TargetDataMismatchError(
                actual_target_name=test_data.target.name,
                missing_target_name=self._target_name,
            )
        test_data_as_table = test_data.to_table()
        if test_data_as_table.row_count == 0:
            raise DatasetMissesDataError
        _check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names)

        # Leaving the `with` block shuts the pool down, so no explicit `shutdown()` call is needed afterwards.
        with ProcessPoolExecutor(
            max_workers=len(self._list_of_model_types),
            mp_context=mp.get_context("spawn"),
        ) as executor:
            futures = [executor.submit(_predict_single_model, model, test_data) for model in self._fitted_models]
            # `Future.result()` blocks until the task is done and re-raises any exception from the worker.
            results = [future.result() for future in futures]

        # The target value of the first row is used as the positive class. It does not depend on the model, so it
        # is looked up once instead of once per result.
        positive_class = test_data.target.get_value(0)

        max_metrics = {"accuracy": 0.0, "f1score": 0.0, "precision": 0.0, "recall": 0.0}
        for result in results:
            metrics = {
                "accuracy": ClassificationMetrics.accuracy(result, test_data),
                "f1score": ClassificationMetrics.f1_score(result, test_data, positive_class),
                "precision": ClassificationMetrics.precision(result, test_data, positive_class),
                "recall": ClassificationMetrics.recall(result, test_data, positive_class),
            }

            # Keep the per-metric maximum over all models.
            for name, value in metrics.items():
                if max_metrics[name] < value:
                    max_metrics[name] = value

        return max_metrics

    @property
    def is_fitted(self) -> bool:
        """Whether the model is fitted."""
        return self._is_fitted

is_fitted

Whether the model is fitted.

fit

Train the Classifier with given training data.

The original model is not modified.

Parameters:

Name Type Description Default
train_data TabularDataset

The data the classifier should be trained on.

required

Returns:

Name Type Description
trained_classifier Self

The trained Classifier

Raises:

Type Description
DatasetMissesDataError

If the given train_data contains no data.

ColumnTypeError

If one or more columns contain non-numeric values.

Source code in src/safeds/ml/classical/classification/_baseline_classifier.py
def fit(self, train_data: TabularDataset) -> Self:
    """
    Train the Classifier with given training data.

    The original model is not modified. Fitting an already fitted classifier discards the previously fitted
    models of the returned copy, so the result only depends on the given training data.

    Parameters
    ----------
    train_data:
        The data the classifier should be trained on.

    Returns
    -------
    trained_classifier:
        The trained Classifier

    Raises
    ------
    DatasetMissesDataError
        If the given train_data contains no data.
    ColumnTypeError
        If one or more columns contain non-numeric values.
    """
    from concurrent.futures import ProcessPoolExecutor

    # Validate data
    train_data_as_table = train_data.to_table()
    if train_data_as_table.row_count == 0:
        raise DatasetMissesDataError
    _check_columns_are_numeric(train_data_as_table, train_data.features.add_columns(train_data.target).column_names)

    copied_model = copy.deepcopy(self)

    # One worker process per candidate model. Leaving the `with` block shuts the pool down, so no explicit
    # `shutdown()` call is needed afterwards.
    with ProcessPoolExecutor(
        max_workers=len(self._list_of_model_types),
        mp_context=mp.get_context("spawn"),
    ) as executor:
        futures = [executor.submit(_fit_single_model, model, train_data) for model in self._list_of_model_types]
        # Collect in submission order so the fitted models line up with `_list_of_model_types`.
        # `Future.result()` blocks until the task is done and re-raises any exception from the worker.
        fitted_models = [future.result() for future in futures]

    # Replace instead of append: the deep copy of an already fitted classifier still carries the models of the
    # previous fit, which must not leak into the new one.
    copied_model._fitted_models = fitted_models
    copied_model._is_fitted = True
    copied_model._feature_names = train_data.features.column_names
    copied_model._target_name = train_data.target.name
    return copied_model

predict

Make a prediction for the given test data and calculate the best metrics.

The original Model is not modified.

Parameters:

Name Type Description Default
test_data TabularDataset

The data the Classifier should predict.

required

Returns:

Name Type Description
best_metrics dict[str, float]

A dictionary with the best metrics that were achieved.

Raises:

Type Description
NotFittedError

If the model has not been fitted yet.

FeatureDataMismatchError

If the features of the test data do not match with the features of the trained Classifier.

DatasetMissesDataError

If the given test_data contains no data.

TargetDataMismatchError

If the target column of the test data does not match the target column of the training data.

ColumnTypeError

If one or more columns contain non-numeric values.

Source code in src/safeds/ml/classical/classification/_baseline_classifier.py
def predict(self, test_data: TabularDataset) -> dict[str, float]:
    """
    Make a prediction for the given test data and calculate the best metrics.

    Every fitted model predicts on the test data. For each metric, the highest value achieved by any model is
    reported, so the individual entries may stem from different models.

    The original Model is not modified.

    Parameters
    ----------
    test_data:
        The data the Classifier should predict.

    Returns
    -------
    best_metrics:
        A dictionary with the best metrics that were achieved. Its keys are "accuracy", "f1score", "precision",
        and "recall".

    Raises
    ------
    NotFittedError
        If the model has not been fitted yet.
    FeatureDataMismatchError
        If the features of the test data do not match with the features of the trained Classifier.
    DatasetMissesDataError
        If the given test_data contains no data.
    TargetDataMismatchError
        If the target column of the test data does not match the target column of the training data.
    ColumnTypeError
        If one or more columns contain non-numeric values.
    """
    from concurrent.futures import ProcessPoolExecutor

    from safeds.ml.metrics import ClassificationMetrics

    if not self._is_fitted:
        raise NotFittedError(kind="model")

    # Validate data
    if not self._feature_names == test_data.features.column_names:
        raise FeatureDataMismatchError
    if not self._target_name == test_data.target.name:
        raise TargetDataMismatchError(
            actual_target_name=test_data.target.name,
            missing_target_name=self._target_name,
        )
    test_data_as_table = test_data.to_table()
    if test_data_as_table.row_count == 0:
        raise DatasetMissesDataError
    _check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names)

    # Leaving the `with` block shuts the pool down, so no explicit `shutdown()` call is needed afterwards.
    with ProcessPoolExecutor(
        max_workers=len(self._list_of_model_types),
        mp_context=mp.get_context("spawn"),
    ) as executor:
        futures = [executor.submit(_predict_single_model, model, test_data) for model in self._fitted_models]
        # `Future.result()` blocks until the task is done and re-raises any exception from the worker.
        results = [future.result() for future in futures]

    # The target value of the first row is used as the positive class. It does not depend on the model, so it
    # is looked up once instead of once per result.
    positive_class = test_data.target.get_value(0)

    max_metrics = {"accuracy": 0.0, "f1score": 0.0, "precision": 0.0, "recall": 0.0}
    for result in results:
        metrics = {
            "accuracy": ClassificationMetrics.accuracy(result, test_data),
            "f1score": ClassificationMetrics.f1_score(result, test_data, positive_class),
            "precision": ClassificationMetrics.precision(result, test_data, positive_class),
            "recall": ClassificationMetrics.recall(result, test_data, positive_class),
        }

        # Keep the per-metric maximum over all models.
        for name, value in metrics.items():
            if max_metrics[name] < value:
                max_metrics[name] = value

    return max_metrics