ImageDataset

Bases: Dataset[ImageList, Out_co]

A dataset that takes an ImageList as input and an ImageList, a Table, or a Column as output.

Parameters:

Name	Type	Description	Default
`input_data`	`ImageList`	the input ImageList	required
`output_data`	`Out_co`	the output data	required
`batch_size`	`int`	the batch size used for training	`1`
`shuffle`	`bool`	whether the data should be shuffled after each epoch of training	`False`

Methods:

Name	Description
`get_input`	Get the input data of this dataset.
`get_output`	Get the output data of this dataset.
`shuffle`	Return a new `ImageDataset` with shuffled data.
`split`	Create two image datasets by splitting the data of the current dataset.

Attributes:

Name	Type	Description
`input_size`	`ImageSize`	Get the input `ImageSize` of this dataset.
`output_size`	`ImageSize \| int`	Get the output size of this dataset.

Source code in src/safeds/data/labeled/containers/_image_dataset.py

class ImageDataset(Dataset[ImageList, Out_co]):
    """
    A Dataset for ImageLists as input and ImageLists, Tables or Columns as output.

    Parameters
    ----------
    input_data:
        the input ImageList
    output_data:
        the output data
    batch_size:
        the batch size used for training
    shuffle:
        whether the data should be shuffled after each epoch of training
    """

    def __init__(self, input_data: ImageList, output_data: Out_co, batch_size: int = 1, shuffle: bool = False) -> None:
        """
        Validate the given data and store it for batched access.

        Parameters
        ----------
        input_data:
            the input ImageList; it must not be empty and all of its images must have the same size
        output_data:
            the output data; a Table, a Column, or an ImageList whose images all have the same size
        batch_size:
            the batch size used for training
        shuffle:
            whether the data should be shuffled after each epoch of training

        Raises
        ------
        ValueError
            If the input ImageList is empty or contains images of different sizes, if a column of an output Table has
            values outside of the interval [0, 1], or if the output is neither a Table, a Column, nor an ImageList
            with images of a single size.
        OutputLengthMismatchError
            If the input and the output do not contain the same number of entries.
        NonNumericColumnError
            If an output Table contains non-numerical columns.
        """
        import torch

        _init_default_device()

        # Reject unusable inputs before any state is stored.
        if isinstance(input_data, _MultiSizeImageList):
            raise ValueError("The given input ImageList contains images of different sizes.")  # noqa: TRY004
        if isinstance(input_data, _EmptyImageList):
            raise ValueError("The given input ImageList contains no images.")  # noqa: TRY004

        input_length = len(input_data)

        # Initially the dataset contains every entry in its original order.
        self._shuffle_tensor_indices: torch.Tensor = torch.arange(input_length, dtype=torch.int64)
        self._shuffle_after_epoch: bool = shuffle
        self._batch_size: int = batch_size
        self._next_batch_index: int = 0

        self._input_size: ImageSize = ImageSize(input_data.widths[0], input_data.heights[0], input_data.channel)
        self._input: _SingleSizeImageList = input_data._as_single_size_image_list()

        # Input and output must pair up one-to-one. Columns and Tables are measured with `row_count` both for the
        # comparison and for the error message; unsupported output types are rejected further below.
        if isinstance(output_data, Column | Table):
            output_length: int | None = output_data.row_count
        elif isinstance(output_data, ImageList):
            output_length = len(output_data)
        else:
            output_length = None
        if output_length is not None and input_length != output_length:
            raise OutputLengthMismatchError(f"{input_length} != {output_length}")

        if isinstance(output_data, Table):
            non_numerical_columns = []
            wrong_interval_columns = []
            for column_name in output_data.column_names:
                if not output_data.get_column_type(column_name).is_numeric:
                    non_numerical_columns.append(column_name)
                    continue
                column = output_data.get_column(column_name)
                # A missing minimum/maximum is treated as 0, which lies inside the allowed interval.
                if (column.min() or 0) < 0 or (column.max() or 0) > 1:
                    wrong_interval_columns.append(column_name)
            if len(non_numerical_columns) > 0:
                raise NonNumericColumnError(f"Columns {non_numerical_columns} are not numerical.")
            if len(wrong_interval_columns) > 0:
                raise ValueError(f"Columns {wrong_interval_columns} have values outside of the interval [0, 1].")
            _output: _TableAsTensor | _ColumnAsTensor | _SingleSizeImageList = _TableAsTensor(output_data)
            _output_size: int | ImageSize = output_data.column_count
        elif isinstance(output_data, Column):
            _column_as_tensor = _ColumnAsTensor(output_data)
            # The output size is the number of columns produced by the one-hot encoding of the column.
            _output_size = len(_column_as_tensor._one_hot_encoder._get_names_of_added_columns())
            _output = _column_as_tensor
        elif isinstance(output_data, _SingleSizeImageList):
            # Clone so that this dataset does not share the caller's image list object.
            _output = output_data._clone()._as_single_size_image_list()
            _output_size = ImageSize(output_data.widths[0], output_data.heights[0], output_data.channel)
        else:
            raise ValueError("The given output ImageList contains images of different sizes.")  # noqa: TRY004
        self._output = _output  # type: ignore[var-annotated]  # TODO: check what the type should be
        self._output_size = _output_size  # type: ignore[var-annotated]  # TODO: check what the type should be

    def __iter__(self) -> ImageDataset:
        """
        Start a new pass over the batches of this dataset.

        Returns
        -------
        iterator:
            A shuffled copy if shuffling after each epoch is enabled, otherwise a shallow copy. In both cases the
            batch counter of the returned dataset starts at 0.
        """
        epoch_dataset = self.shuffle() if self._shuffle_after_epoch else copy.copy(self)
        epoch_dataset._next_batch_index = 0
        return epoch_dataset

    def __next__(self) -> tuple[Tensor, Tensor]:
        """
        Return the next batch as a pair of input tensor and output tensor.

        Raises
        ------
        StopIteration
            If all entries of this dataset have already been returned.
        """
        current_batch = self._next_batch_index
        if current_batch * self._batch_size >= len(self._shuffle_tensor_indices):
            raise StopIteration
        # Advance the counter before fetching, so it moves on even if fetching the batch fails.
        self._next_batch_index = current_batch + 1
        return self._get_batch(current_batch)

    def __len__(self) -> int:
        """Return the number of entries that are currently part of this dataset."""
        selected_indices = self._shuffle_tensor_indices
        return len(selected_indices)

    def __eq__(self, other: object) -> bool:
        """
        Check whether this image dataset equals another object.

        Parameters
        ----------
        other:
            The object to compare this image dataset to.

        Returns
        -------
        equals:
            Whether both image datasets have the same settings, the same data and the same index order.
            `NotImplemented` is returned if `other` is not an image dataset.
        """
        if not isinstance(other, ImageDataset):
            return NotImplemented
        if self is other:
            return True

        # Compare the cheap settings first; the data comparisons only run if these match.
        if not (self._shuffle_after_epoch == other._shuffle_after_epoch):
            return False
        if not (self._batch_size == other._batch_size):
            return False
        if not isinstance(other._output, type(self._output)):
            return False
        if not (self._input == other._input):
            return False
        if not (self._output == other._output):
            return False
        return self._shuffle_tensor_indices.tolist() == other._shuffle_tensor_indices.tolist()

    def __hash__(self) -> int:
        """
        Compute a deterministic hash value for this image dataset.

        The hash is built from the input, the output, the shuffle setting, the batch size and the current index order.

        Returns
        -------
        hash:
            the hash value
        """
        index_order = self._shuffle_tensor_indices.tolist()
        hashed_parts = (
            self._input,
            self._output,
            self._shuffle_after_epoch,
            self._batch_size,
            index_order,
        )
        return _structural_hash(*hashed_parts)

    def __sizeof__(self) -> int:
        """
        Compute the complete size of this object.

        Returns
        -------
        size:
            Size of this object in bytes.
        """
        indices = self._shuffle_tensor_indices
        # Bytes taken by the elements of the index tensor, added on top of `sys.getsizeof(indices)` below.
        index_payload = indices.element_size() * indices.nelement()
        members = (
            indices,
            self._input,
            self._output,
            self._input_size,
            self._output_size,
            self._shuffle_after_epoch,
            self._batch_size,
            self._next_batch_index,
        )
        return index_payload + sum(sys.getsizeof(member) for member in members)

    @property
    def input_size(self) -> ImageSize:
        """
        The `ImageSize` of the input images of this dataset.

        Returns
        -------
        input_size:
            the input `ImageSize`
        """
        size_of_input = self._input_size
        return size_of_input

    @property
    def output_size(self) -> ImageSize | int:
        """
        The size of the output of this dataset.

        Returns
        -------
        output_size:
            the output size; an `ImageSize` for image outputs, otherwise an `int`
        """
        size_of_output = self._output_size
        return size_of_output

    def get_input(self) -> ImageList:
        """
        Return the input images that are currently part of this dataset.

        Returns
        -------
        input:
            the input data of this dataset
        """
        arrange = self._sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary
        return arrange(self._input)

    def get_output(self) -> Out_co:
        """
        Return the output data that is currently part of this dataset.

        Returns
        -------
        output:
            the output data of this dataset
        """
        stored_output = self._output
        selected_indices = self._shuffle_tensor_indices

        # Tables and columns are rebuilt from their tensor form; image lists are rearranged directly.
        if isinstance(stored_output, _TableAsTensor):
            return stored_output._to_table(selected_indices)  # type: ignore[return-value]
        if isinstance(stored_output, _ColumnAsTensor):
            return stored_output._to_column(selected_indices)  # type: ignore[return-value]
        return self._sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary(stored_output)  # type: ignore[return-value]

    def _sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary(
        self,
        image_list: _SingleSizeImageList,
    ) -> _SingleSizeImageList:
        """
        Build a new image list from the images of `image_list` that are selected by `_shuffle_tensor_indices`.

        Only the indices contained in `_shuffle_tensor_indices` are taken over. The image that belongs to the i-th
        entry of `_shuffle_tensor_indices` is registered under index i in the new list.

        Parameters
        ----------
        image_list:
            The image list to take the images from.

        Returns
        -------
        shuffled_image_list:
            A new image list with the selected images.
        """
        shuffled_image_list = _SingleSizeImageList()
        # Tensor positions of the selected images, visited in ascending order of their index in `image_list`.
        tensor_pos = [
            image_list._indices_to_tensor_positions[shuffled_index]
            for shuffled_index in sorted(self._shuffle_tensor_indices.tolist())
        ]
        # Maps each selected index of `image_list` to its position within `_shuffle_tensor_indices`.
        temp_pos = {
            shuffled_index: new_index for new_index, shuffled_index in enumerate(self._shuffle_tensor_indices.tolist())
        }
        # The rows of the new tensor follow the ascending index order used for `tensor_pos`.
        shuffled_image_list._tensor = image_list._tensor[tensor_pos]
        # For each row of the new tensor (same ascending order), store the new index of that image.
        shuffled_image_list._tensor_positions_to_indices = [
            new_index for _, new_index in sorted(temp_pos.items(), key=lambda item: item[0])
        ]
        # Derive the reverse lookup from the mapping that was just assigned.
        shuffled_image_list._indices_to_tensor_positions = shuffled_image_list._calc_new_indices_to_tensor_positions()
        return shuffled_image_list

    def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[Tensor, Tensor]:
        """
        Return one batch of this dataset as tensors.

        Parameters
        ----------
        batch_number:
            The zero-based number of the batch.
        batch_size:
            The number of entries per batch. If None, the batch size of this dataset is used.

        Returns
        -------
        input_tensor:
            The input images of the batch as float32 tensor, divided by 255.
        output_tensor:
            The outputs of the batch. Image outputs are converted like the input images; table and column outputs
            are taken from their tensor as they are.

        Raises
        ------
        OutOfBoundsError
            If `batch_size` is less than 1.
        IndexOutOfBoundsError
            If `batch_number` is negative or the batch starts behind the last entry of this dataset.
        """
        import torch

        _init_default_device()

        if batch_size is None:
            batch_size = self._batch_size

        _check_bounds("batch_size", batch_size, lower_bound=_ClosedBound(1))

        entry_count = len(self._shuffle_tensor_indices)
        first_index = batch_size * batch_number
        if batch_number < 0 or first_index >= entry_count:
            raise IndexOutOfBoundsError(first_index)
        # The last batch may be shorter than `batch_size`.
        max_index = min(first_index + batch_size, entry_count)

        batch_indices = self._shuffle_tensor_indices[first_index:max_index]
        batch_index_list = batch_indices.tolist()

        input_positions = [self._input._indices_to_tensor_positions[index] for index in batch_index_list]
        input_tensor = self._input._tensor[input_positions].to(torch.float32) / 255

        output_tensor: Tensor
        if isinstance(self._output, _SingleSizeImageList):
            # The output image list has its own mapping from indices to tensor positions. It must be used here,
            # because it is not guaranteed to match the mapping of the input image list.
            output_positions = [self._output._indices_to_tensor_positions[index] for index in batch_index_list]
            output_tensor = self._output._tensor[output_positions].to(torch.float32) / 255
        else:  # _output is instance of _TableAsTensor or _ColumnAsTensor
            output_tensor = self._output._tensor[batch_indices]
        return input_tensor, output_tensor

    def shuffle(self) -> ImageDataset[Out_co]:
        """
        Create a copy of this `ImageDataset` whose entries are in random order.

        The original dataset is not modified.

        Returns
        -------
        image_dataset:
            the shuffled `ImageDataset`
        """
        import torch

        _init_default_device()

        shuffled_copy: ImageDataset[Out_co] = copy.copy(self)
        current_indices = self._shuffle_tensor_indices
        random_order = torch.randperm(len(current_indices))
        # Reorder the existing indices, so the copy contains exactly the same entries as this dataset.
        shuffled_copy._shuffle_tensor_indices = current_indices[random_order]
        shuffled_copy._next_batch_index = 0
        return shuffled_copy

    def split(
        self,
        percentage_in_first: float,
        *,
        shuffle: bool = True,
    ) -> tuple[ImageDataset[Out_co], ImageDataset[Out_co]]:
        """
        Create two image datasets by splitting the data of the current dataset.

        The first dataset contains a percentage of the data specified by `percentage_in_first`, and the second dataset
        contains the remaining data. Only the entries that are currently part of this dataset are distributed, so
        splitting a dataset that is itself the result of a split works as expected.

        The original dataset is not modified.
        By default, the data is shuffled before splitting. You can disable this by setting `shuffle` to False.

        Parameters
        ----------
        percentage_in_first:
            The percentage of data to include in the first dataset. Must be between 0 and 1.
        shuffle:
            Whether to shuffle the data before splitting.

        Returns
        -------
        first_dataset:
            The first dataset.
        second_dataset:
            The second dataset.

        Raises
        ------
        OutOfBoundsError
            If `percentage_in_first` is not between 0 and 1.
        """
        import torch

        _check_bounds(
            "percentage_in_first",
            percentage_in_first,
            lower_bound=_ClosedBound(0),
            upper_bound=_ClosedBound(1),
        )

        _init_default_device()

        first_dataset: ImageDataset[Out_co] = copy.copy(self)
        second_dataset: ImageDataset[Out_co] = copy.copy(self)

        entry_count = len(self._shuffle_tensor_indices)
        if shuffle:
            order = torch.randperm(entry_count)
        else:
            order = torch.arange(entry_count)

        # `order` only contains positions within this dataset. Map them through the current indices, like
        # `shuffle()` does; otherwise a dataset that is a subset or was shuffled would select the wrong entries.
        ordered_indices = self._shuffle_tensor_indices[order]

        first_count = round(percentage_in_first * entry_count)
        first_dataset._shuffle_tensor_indices, second_dataset._shuffle_tensor_indices = ordered_indices.split(
            [
                first_count,
                entry_count - first_count,
            ],
        )
        # Both results start iterating from their first batch, matching `shuffle()`.
        first_dataset._next_batch_index = 0
        second_dataset._next_batch_index = 0
        return first_dataset, second_dataset

`input_size` ¶

Get the input ImageSize of this dataset.

Returns:

Name	Type	Description
`input_size`	`ImageSize`	the input `ImageSize`

`output_size` ¶

Get the output size of this dataset.

Returns:

Name	Type	Description
`output_size`	`ImageSize \| int`	the output size

`get_input` ¶

Get the input data of this dataset.

Returns:

Name	Type	Description
`input`	`ImageList`	the input data of this dataset

Source code in src/safeds/data/labeled/containers/_image_dataset.py

def get_input(self) -> ImageList:
    """
    Return the input images that are currently part of this dataset.

    Returns
    -------
    input:
        the input data of this dataset
    """
    arrange = self._sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary
    return arrange(self._input)

`get_output` ¶

Get the output data of this dataset.

Returns:

Name	Type	Description
`output`	`Out_co`	the output data of this dataset

Source code in src/safeds/data/labeled/containers/_image_dataset.py

def get_output(self) -> Out_co:
    """
    Return the output data that is currently part of this dataset.

    Returns
    -------
    output:
        the output data of this dataset
    """
    stored_output = self._output
    selected_indices = self._shuffle_tensor_indices

    # Tables and columns are rebuilt from their tensor form; image lists are rearranged directly.
    if isinstance(stored_output, _TableAsTensor):
        return stored_output._to_table(selected_indices)  # type: ignore[return-value]
    if isinstance(stored_output, _ColumnAsTensor):
        return stored_output._to_column(selected_indices)  # type: ignore[return-value]
    return self._sort_image_list_with_shuffle_tensor_indices_reduce_if_necessary(stored_output)  # type: ignore[return-value]

`shuffle` ¶

Return a new ImageDataset with shuffled data.

The original dataset is not modified.

Returns:

Name	Type	Description
`image_dataset`	`ImageDataset[Out_co]`	the shuffled `ImageDataset`

Source code in src/safeds/data/labeled/containers/_image_dataset.py

def shuffle(self) -> ImageDataset[Out_co]:
    """
    Create a copy of this `ImageDataset` whose entries are in random order.

    The original dataset is not modified.

    Returns
    -------
    image_dataset:
        the shuffled `ImageDataset`
    """
    import torch

    _init_default_device()

    shuffled_copy: ImageDataset[Out_co] = copy.copy(self)
    current_indices = self._shuffle_tensor_indices
    random_order = torch.randperm(len(current_indices))
    # Reorder the existing indices, so the copy contains exactly the same entries as this dataset.
    shuffled_copy._shuffle_tensor_indices = current_indices[random_order]
    shuffled_copy._next_batch_index = 0
    return shuffled_copy

`split` ¶

Create two image datasets by splitting the data of the current dataset.

The first dataset contains a percentage of the data specified by percentage_in_first, and the second dataset contains the remaining data.

The original dataset is not modified. By default, the data is shuffled before splitting. You can disable this by setting shuffle to False.

Parameters:

Name	Type	Description	Default
`percentage_in_first`	`float`	The percentage of data to include in the first dataset. Must be between 0 and 1.	required
`shuffle`	`bool`	Whether to shuffle the data before splitting.	`True`

Returns:

Name	Type	Description
`first_dataset`	`ImageDataset[Out_co]`	The first dataset.
`second_dataset`	`ImageDataset[Out_co]`	The second dataset.

Raises:

Type	Description
`OutOfBoundsError`	If `percentage_in_first` is not between 0 and 1.

Source code in src/safeds/data/labeled/containers/_image_dataset.py

def split(
    self,
    percentage_in_first: float,
    *,
    shuffle: bool = True,
) -> tuple[ImageDataset[Out_co], ImageDataset[Out_co]]:
    """
    Create two image datasets by splitting the data of the current dataset.

    The first dataset contains a percentage of the data specified by `percentage_in_first`, and the second dataset
    contains the remaining data. Only the entries that are currently part of this dataset are distributed, so
    splitting a dataset that is itself the result of a split works as expected.

    The original dataset is not modified.
    By default, the data is shuffled before splitting. You can disable this by setting `shuffle` to False.

    Parameters
    ----------
    percentage_in_first:
        The percentage of data to include in the first dataset. Must be between 0 and 1.
    shuffle:
        Whether to shuffle the data before splitting.

    Returns
    -------
    first_dataset:
        The first dataset.
    second_dataset:
        The second dataset.

    Raises
    ------
    OutOfBoundsError
        If `percentage_in_first` is not between 0 and 1.
    """
    import torch

    _check_bounds(
        "percentage_in_first",
        percentage_in_first,
        lower_bound=_ClosedBound(0),
        upper_bound=_ClosedBound(1),
    )

    _init_default_device()

    first_dataset: ImageDataset[Out_co] = copy.copy(self)
    second_dataset: ImageDataset[Out_co] = copy.copy(self)

    entry_count = len(self._shuffle_tensor_indices)
    if shuffle:
        order = torch.randperm(entry_count)
    else:
        order = torch.arange(entry_count)

    # `order` only contains positions within this dataset. Map them through the current indices, like
    # `shuffle()` does; otherwise a dataset that is a subset or was shuffled would select the wrong entries.
    ordered_indices = self._shuffle_tensor_indices[order]

    first_count = round(percentage_in_first * entry_count)
    first_dataset._shuffle_tensor_indices, second_dataset._shuffle_tensor_indices = ordered_indices.split(
        [
            first_count,
            entry_count - first_count,
        ],
    )
    # Both results start iterating from their first batch, matching `shuffle()`.
    first_dataset._next_batch_index = 0
    second_dataset._next_batch_index = 0
    return first_dataset, second_dataset

ImageDataset

input_size ¶

output_size ¶

get_input ¶

get_output ¶

shuffle ¶

split ¶

`input_size` ¶

`output_size` ¶

`get_input` ¶

`get_output` ¶

`shuffle` ¶

`split` ¶