Skip to content

stats

Bases: FileOperation

Orchestrates dataset analysis to extract geometric, spatial, and quality features.

This class processes annotations to provide insights into object distribution, area variance, and potential dataset biases. It helps identify issues like class imbalance or feature outliers before the model training phase.

Source code in file_operations/stats_operation.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
class StatsOperation(FileOperation):
    """
    Orchestrates dataset analysis to extract geometric, spatial, and quality features.

    This class processes annotations to provide insights into object distribution,
    area variance, and potential dataset biases. It helps identify issues like
    class imbalance or feature outliers before the model training phase.
    """

    def __init__(self, settings: AppSettings, **kwargs):
        """
        Initializes the StatsOperation with analytical engines and reporters.

        Args:
            settings (AppSettings): Global application configuration object.
            **kwargs (dict): Additional parameters. The keys read here are
                'ext' (image extensions, falls back to ``settings.extensions``),
                'img_path' and 'target_format' (falls back to
                ``settings.destination_type``).

        Raises:
            ValueError: If ``settings.datatype`` has no registered reporter or
                the target format has no registered statistics engine.
            TypeError: If 'img_path' or 'ext' have an unsupported type
                (raised by the corresponding property setters).
        """
        super().__init__(settings, **kwargs)
        self.extensions = kwargs.get("ext", self.settings.extensions)
        self.img_path = kwargs.get('img_path')
        self.target_format: Union[str, None] = kwargs.get('target_format', self.settings.destination_type)

        # Both mappings store classes (not instances): BaseStats subclasses and
        # BaseDatasetReporter subclasses respectively.
        self.stats_mapping: Dict[str, type] = {
            "yolo": YoloStats,
            "voc": VOCStats,
        }
        self.reporter_mapping: Dict[str, type] = {
            "image": ImageDatasetReporter,
        }

        # Resolve the classes explicitly so that an unknown key produces a
        # readable error instead of "'NoneType' object is not callable" or a
        # bare KeyError.
        reporter_cls = self.reporter_mapping.get(self.settings.datatype)
        if reporter_cls is None:
            msg = (
                f"Unsupported datatype {self.settings.datatype!r}; "
                f"expected one of {sorted(self.reporter_mapping)}"
            )
            self.logger.error(msg)
            raise ValueError(msg)

        stats_cls = self.stats_mapping.get(self.target_format)
        if stats_cls is None:
            msg = (
                f"Unsupported target format {self.target_format!r}; "
                f"expected one of {sorted(self.stats_mapping)}"
            )
            self.logger.error(msg)
            raise ValueError(msg)

        self.reporter: BaseDatasetReporter = reporter_cls(settings=self.settings)
        self.stats_method: BaseStats = stats_cls(
            settings=self.settings,
            source_format=self.target_format,
            img_path=self.img_path,
            extensions=self.extensions,
        )

    @staticmethod
    def add_arguments(settings: AppSettings, parser: argparse.ArgumentParser) -> None:
        """
        Defines CLI arguments required for dataset statistics calculation.

        Registers the destination type, image path, job count, margin and
        report path options on ``parser``.

        Args:
            settings (AppSettings): Global settings for default values
                (``n_jobs``, ``margin_threshold`` and ``report_path``).
            parser (argparse.ArgumentParser): CLI argument parser instance.
        """
        parser.add_argument(
            Arguments.destination_type,
            help=HelpStrings.destination_type,
        )
        parser.add_argument(
            Arguments.img_path,
            help=HelpStrings.img_path,
            default=None,
        )
        parser.add_argument(
            Arguments.n_jobs,
            help=HelpStrings.n_jobs,
            default=settings.n_jobs,
        )
        parser.add_argument(
            Arguments.margin,
            help=HelpStrings.margin,
            default=settings.margin_threshold,
        )
        parser.add_argument(
            Arguments.report_path,
            help=HelpStrings.report_path,
            default=settings.report_path,
        )

    def do_task(self):
        """
        Executes the main analytical pipeline for the dataset.

        The process includes:
            1. Loading annotations and setting up class mappings.
            2. Extracting a feature matrix (geometry, brightness, etc.).
            3. Logging a summary report to the console.
            4. Generating visual analytics (Plots, Heatmaps, and UMAP projections).

        If the extracted feature table is empty, a warning is logged and the
        method returns without producing any report.
        """
        if self.target_format == "yolo":
            classes_mapping = self.stats_method.set_class_mapping(file_paths=self.files_for_task)
            # classes.txt is excluded from the files handed to feature extraction.
            self.files_for_task = tuple(f for f in self.files_for_task if f.name != "classes.txt")
        else:
            classes_mapping = None
        df = self.stats_method.get_features(file_paths=self.files_for_task, class_mapping=classes_mapping)

        if df.empty:
            self.logger.warning(f"No annotations found in {self.src}")
            return

        self.logger.info(f"Found {len(self.files_for_task)} annotations in {self.src}")

        self.reporter.show_console_report(df=df, target_format=self.target_format)

        report_path = generate_directory_name(src=self.settings.report_path)
        features = self.stats_method.get_umap_features(df=df)
        self.reporter.generate_visual_report(df=df, destination=report_path, features=features)

    @property
    def img_path(self) -> Path:
        """Path: The directory where source images are located."""
        return self._img_path

    @img_path.setter
    def img_path(self, img_path: Union[Path, str, None]) -> None:
        """
        Sets and validates the path to images folder.

        Args:
            img_path (Union[Path, str, None]): Path to the images.
                If None, defaults to the source directory (common for YOLO)
                and a warning is logged.

        Raises:
            TypeError: If the provided path is not a string, Path, or None.
        """
        if isinstance(img_path, Path):
            self._img_path = img_path
        elif isinstance(img_path, str):
            self._img_path = Path(img_path)
        elif img_path is None:
            self._img_path = self.source_directory
            self.logger.warning(f"Dataset images path is not defined. Set same annotations path: {self.source_directory}")
        else:
            msg = f"img_path must be Path or str, not {type(img_path)}"
            self.logger.error(msg)
            raise TypeError(msg)

    @property
    def extensions(self) -> Tuple[str, ...]:
        """Tuple[str, ...]: Returns the supported image file extensions."""
        return self._extensions

    @extensions.setter
    def extensions(self, value: Tuple[str, ...]) -> None:
        """
        Sets the valid image extensions for the converter.

        Args:
            value (Tuple[str, ...]): A tuple of extension strings (e.g., ('.jpg',)).
                Any other iterable is converted to a tuple; a single string is
                treated as one extension.

        Raises:
            TypeError: If the input cannot be converted into a tuple.
        """
        if isinstance(value, tuple):
            self._extensions = value
        elif isinstance(value, str):
            # A str is iterable: tuple(".jpg") would yield ('.', 'j', 'p', 'g').
            self._extensions = (value,)
        else:
            try:
                self._extensions = tuple(value)
            except TypeError as e:
                msg = f"extensions must be convertable into tuple, got {type(value)}"
                self.logger.error(msg)
                # Chain the original error so the root cause stays in the traceback.
                raise TypeError(msg) from e

extensions property writable

Tuple[str, ...]: Returns the supported image file extensions.

img_path property writable

Path: The directory where source images are located.

__init__(settings, **kwargs)

Initializes the StatsOperation with analytical engines and reporters.

Parameters:

Name Type Description Default
settings AppSettings

Global application configuration object.

required
**kwargs dict

Additional parameters such as 'img_path', 'target_format', and file extensions.

{}
Source code in file_operations/stats_operation.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def __init__(self, settings: AppSettings, **kwargs):
    """
    Initializes the StatsOperation with analytical engines and reporters.

    Args:
        settings (AppSettings): Global application configuration object.
        **kwargs (dict): Additional parameters. The keys read here are
            'ext' (image extensions, falls back to ``settings.extensions``),
            'img_path' and 'target_format' (falls back to
            ``settings.destination_type``).

    Raises:
        ValueError: If ``settings.datatype`` has no registered reporter or
            the target format has no registered statistics engine.
    """
    super().__init__(settings, **kwargs)
    self.extensions = kwargs.get("ext", self.settings.extensions)
    self.img_path = kwargs.get('img_path')
    self.target_format: Union[str, None] = kwargs.get('target_format', self.settings.destination_type)

    # Both mappings store classes (not instances): BaseStats subclasses and
    # BaseDatasetReporter subclasses respectively.
    self.stats_mapping: Dict[str, type] = {
        "yolo": YoloStats,
        "voc": VOCStats,
    }
    self.reporter_mapping: Dict[str, type] = {
        "image": ImageDatasetReporter,
    }

    # Resolve the classes explicitly so that an unknown key produces a
    # readable error instead of "'NoneType' object is not callable" or a
    # bare KeyError.
    reporter_cls = self.reporter_mapping.get(self.settings.datatype)
    if reporter_cls is None:
        msg = (
            f"Unsupported datatype {self.settings.datatype!r}; "
            f"expected one of {sorted(self.reporter_mapping)}"
        )
        self.logger.error(msg)
        raise ValueError(msg)

    stats_cls = self.stats_mapping.get(self.target_format)
    if stats_cls is None:
        msg = (
            f"Unsupported target format {self.target_format!r}; "
            f"expected one of {sorted(self.stats_mapping)}"
        )
        self.logger.error(msg)
        raise ValueError(msg)

    self.reporter: BaseDatasetReporter = reporter_cls(settings=self.settings)
    self.stats_method: BaseStats = stats_cls(
        settings=self.settings,
        source_format=self.target_format,
        img_path=self.img_path,
        extensions=self.extensions,
    )

add_arguments(settings, parser) staticmethod

Defines CLI arguments required for dataset statistics calculation.

Parameters:

Name Type Description Default
settings AppSettings

Global settings for default values.

required
parser ArgumentParser

CLI argument parser instance.

required
Source code in file_operations/stats_operation.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
@staticmethod
def add_arguments(settings: AppSettings, parser: argparse.ArgumentParser) -> None:
    """
    Register the command-line options used by the statistics operation.

    Options are added in this order: destination type, image path, job
    count, margin and report path.

    Args:
        settings (AppSettings): Supplies the defaults for the job count
            (``n_jobs``), margin (``margin_threshold``) and report path
            (``report_path``) options.
        parser (argparse.ArgumentParser): Parser that receives the options.
    """
    register = parser.add_argument

    # The destination type is registered without an explicit default.
    register(Arguments.destination_type, help=HelpStrings.destination_type)
    register(Arguments.img_path, help=HelpStrings.img_path, default=None)
    register(Arguments.n_jobs, help=HelpStrings.n_jobs, default=settings.n_jobs)
    register(Arguments.margin, help=HelpStrings.margin, default=settings.margin_threshold)
    register(Arguments.report_path, help=HelpStrings.report_path, default=settings.report_path)

do_task()

Executes the main analytical pipeline for the dataset.

The process includes:
  1. Loading annotations and setting up class mappings.
  2. Extracting a feature matrix (geometry, brightness, etc.).
  3. Logging a summary report to the console.
  4. Generating visual analytics (Plots, Heatmaps, and UMAP projections).
Source code in file_operations/stats_operation.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def do_task(self):
    """
    Run the dataset statistics pipeline from feature extraction to reports.

    Steps, in order:
        1. For the "yolo" format, build the class mapping from the task files
           and drop ``classes.txt`` from the file list; other formats use no
           class mapping.
        2. Extract the feature table from the remaining files.
        3. Stop with a warning if the table is empty.
        4. Print the console report, then generate the visual report
           (using the UMAP features) into a directory derived from
           ``settings.report_path``.
    """
    class_lookup = None
    if self.target_format == "yolo":
        class_lookup = self.stats_method.set_class_mapping(file_paths=self.files_for_task)
        # classes.txt is excluded from the files handed to feature extraction.
        self.files_for_task = tuple(
            path for path in self.files_for_task if path.name != "classes.txt"
        )

    feature_frame = self.stats_method.get_features(
        file_paths=self.files_for_task,
        class_mapping=class_lookup,
    )

    # Nothing to report on: warn and leave before any report is produced.
    if feature_frame.empty:
        self.logger.warning(f"No annotations found in {self.src}")
        return

    self.logger.info(f"Found {len(self.files_for_task)} annotations in {self.src}")
    self.reporter.show_console_report(df=feature_frame, target_format=self.target_format)

    output_dir = generate_directory_name(src=self.settings.report_path)
    umap_features = self.stats_method.get_umap_features(df=feature_frame)
    self.reporter.generate_visual_report(
        df=feature_frame,
        destination=output_dir,
        features=umap_features,
    )