Skip to content

dedup

Bases: FileOperation, FileRemoverMixin

An operation to find and remove visual duplicates in a dataset.

This class compares images in the source folder using hashing algorithms (like dHash). It identifies similar images based on a similarity threshold and can either delete them automatically or ask the user for confirmation.

Attributes:

Name Type Description
filetype str

The type of files to process (e.g., 'image').

method str

The hashing method used for comparison (e.g., 'dhash').

remove bool

If True, duplicates are deleted automatically without asking.

comparer ImageComparer

The engine that performs the actual image comparison.

Source code in file_operations/deduplicate.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
class DedupOperation(FileOperation, FileRemoverMixin):
    """
    An operation to find and remove visual duplicates in a dataset.

    This class compares images in the source folder using hashing algorithms
    (like dHash). It identifies similar images based on a similarity threshold
    and can either delete them automatically or ask the user for confirmation.

    Attributes:
        mapping (dict): Maps a file type to the comparer class that handles it.
            Only 'Constants.image' is registered.
        filetype (str): The type of files to process (e.g., 'image').
        method (str): The hashing method used for comparison (e.g., 'dhash').
        remove (bool): If True, duplicates are deleted automatically without asking.
        comparer (ImageComparer): The engine that performs the actual image comparison.
    """
    def __init__(self, **kwargs):
        """
        Initializes the deduplication operation.

        Args:
            **kwargs (dict): Parameters from the command line or settings. They are
                forwarded unchanged to the parent initializer. 'filetype', 'method'
                and 'remove' are also read here; when one of them is missing, the
                matching value from 'self.settings' is used instead.

        Raises:
            KeyError: If the resolved file type has no entry in 'mapping'.
        """
        super().__init__(**kwargs)
        self.mapping = {
            Constants.image: ImageComparer
        }

        self.filetype = kwargs.get("filetype", self.settings.datatype)
        self.method = kwargs.get("method", self.settings.method)
        self.remove = kwargs.get("remove", self.settings.remove)
        # The comparer is built from the settings object, not from kwargs.
        self.comparer: ImageComparer = self.mapping[self.filetype](self.settings)

    @staticmethod
    def add_arguments(settings: AppSettings, parser: argparse.ArgumentParser) -> None:
        """
        Defines CLI arguments for the deduplication task.

        Every option except the 'remove' switch and the cache name takes its
        default from 'settings'. No 'type=' converter is given, so a value typed
        on the command line reaches the operation as a string.

        Args:
            settings (AppSettings): Global configuration for default values.
            parser (argparse.ArgumentParser): The parser to which arguments are added.
        """
        parser.add_argument(
            Arguments.threshold,
            help=HelpStrings.threshold,
            default=settings.hash_threshold
        )
        parser.add_argument(
            Arguments.datatype,
            help=HelpStrings.datatype,
            default=settings.datatype
        )
        parser.add_argument(
            Arguments.method, Arguments.m,
            help=HelpStrings.method,
            default=settings.method
        )
        # Boolean switch: absent means False, present means True.
        parser.add_argument(
            Arguments.remove, Arguments.rm,
            help=HelpStrings.remove,
            action="store_true"
        )
        parser.add_argument(
            Arguments.core_size,
            help=HelpStrings.core_size,
            default=settings.core_size
        )
        parser.add_argument(
            Arguments.n_jobs,
            help=HelpStrings.n_jobs,
            default=settings.n_jobs
        )
        parser.add_argument(
            Arguments.cache_name,
            help=HelpStrings.cache_name,
            default=None
        )

    def do_task(self):
        """
        Executes the deduplication process.

        This method uses the comparer to find duplicates among the collected
        files and logs how many were found. If there is at least one duplicate
        and removal is confirmed (see 'confirm_removing'), the duplicates are
        passed to 'remove_all'. Finally 'wait' is called with the operation's
        logger and 'self.sleep' as the timeout.
        """
        duplicates = self.comparer.compare(self.files_for_task)
        duplicates_count = len(duplicates)
        self.logger.info(f"Found {duplicates_count} duplicates in {len(self.files_for_task)} files")

        # The user is only prompted when there is something to delete.
        if duplicates_count > 0 and self.confirm_removing():
            self.remove_all(duplicates)

        wait(logger=self.logger, timeout=self.sleep)

    def confirm_removing(self) -> bool:
        """
        Checks if the operation has permission to delete the found duplicates.

        If the 'remove' flag is set, deletion is confirmed without asking.
        Otherwise the user is prompted in the console; the answer is trimmed,
        lower-cased and accepted only if it is non-empty and contained in
        'self.settings.confirm_choice'.

        Returns:
            bool: True if deletion is confirmed, False otherwise.
        """
        if self.remove:
            return True

        answer = input("for deleting founded duplicate files type 'yes': ").strip().lower()
        # Deleting files is destructive, so an empty answer (just pressing Enter)
        # must never count as consent. Without this guard, '"" in confirm_choice'
        # would be True whenever 'confirm_choice' is a plain string.
        return bool(answer) and answer in self.settings.confirm_choice

__init__(**kwargs)

Initializes the deduplication operation.

Parameters:

Name Type Description Default
**kwargs dict

Parameters from the command line or settings, including 'filetype', 'method', 'threshold', and 'core_size'.

{}
Source code in file_operations/deduplicate.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def __init__(self, **kwargs):
    """
    Builds the deduplication operation from keyword options.

    Args:
        **kwargs (dict): Options forwarded unchanged to the parent initializer.
            'filetype', 'method' and 'remove' are also read here; when one of
            them is absent, the matching value from 'self.settings' is used.
    """
    super().__init__(**kwargs)
    # Registry of comparer classes keyed by file type.
    self.mapping = {Constants.image: ImageComparer}

    settings = self.settings
    self.filetype = kwargs.get("filetype", settings.datatype)
    self.method = kwargs.get("method", settings.method)
    self.remove = kwargs.get("remove", settings.remove)

    # Look up the comparer class for the chosen file type and build it
    # from the settings object.
    comparer_cls = self.mapping[self.filetype]
    self.comparer: ImageComparer = comparer_cls(settings)

add_arguments(settings, parser) staticmethod

Defines CLI arguments for the deduplication task.

Parameters:

Name Type Description Default
settings AppSettings

Global configuration for default values.

required
parser ArgumentParser

The parser to which arguments are added.

required
Source code in file_operations/deduplicate.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
@staticmethod
def add_arguments(settings: AppSettings, parser: argparse.ArgumentParser) -> None:
    """
    Registers the command-line options of the deduplication task.

    Options are added in this order: threshold, datatype, method, remove,
    core size, number of jobs, cache name. All but the 'remove' switch and the
    cache name take their default from 'settings'.

    Args:
        settings (AppSettings): Global configuration supplying the defaults.
        parser (argparse.ArgumentParser): The parser that receives the options.
    """
    register = parser.add_argument

    register(Arguments.threshold, help=HelpStrings.threshold, default=settings.hash_threshold)
    register(Arguments.datatype, help=HelpStrings.datatype, default=settings.datatype)
    register(Arguments.method, Arguments.m, help=HelpStrings.method, default=settings.method)
    # Boolean switch: present on the command line means True.
    register(Arguments.remove, Arguments.rm, help=HelpStrings.remove, action="store_true")
    register(Arguments.core_size, help=HelpStrings.core_size, default=settings.core_size)
    register(Arguments.n_jobs, help=HelpStrings.n_jobs, default=settings.n_jobs)
    register(Arguments.cache_name, help=HelpStrings.cache_name, default=None)

confirm_removing()

Checks if the operation has permission to delete the found duplicates.

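If the 'remove' flag is not set, the method prompts the user in the console and proceeds only if the lower-cased answer is one of the accepted confirmations in settings.confirm_choice (the prompt suggests typing 'yes').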

Returns:

Name Type Description
bool bool

True if deletion is confirmed, False otherwise.

Source code in file_operations/deduplicate.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def confirm_removing(self) -> bool:
    """
    Checks if the operation has permission to delete the found duplicates.

    If the 'remove' flag is set, deletion is confirmed without asking.
    Otherwise the user is prompted in the console; the answer is trimmed,
    lower-cased and accepted only if it is non-empty and contained in
    'self.settings.confirm_choice'.

    Returns:
        bool: True if deletion is confirmed, False otherwise.
    """
    if self.remove:
        return True

    answer = input("for deleting founded duplicate files type 'yes': ").strip().lower()
    # Deleting files is destructive, so an empty answer (just pressing Enter)
    # must never count as consent. Without this guard, '"" in confirm_choice'
    # would be True whenever 'confirm_choice' is a plain string.
    return bool(answer) and answer in self.settings.confirm_choice

do_task()

Executes the deduplication process.

This method uses the 'ImageComparer' to find duplicates among the collected files. If duplicates are found, it checks for user confirmation (or uses the 'remove' flag) and deletes the files using 'FileRemoverMixin'.

Source code in file_operations/deduplicate.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def do_task(self):
    """
    Runs one deduplication pass over the collected files.

    The comparer is asked for the duplicates among 'self.files_for_task' and
    the result size is logged. When at least one duplicate exists and
    'confirm_removing' returns True, the duplicates are handed to
    'remove_all'. The method always ends by calling 'wait' with the
    operation's logger and 'self.sleep' as the timeout.
    """
    duplicates = self.comparer.compare(self.files_for_task)
    duplicates_count = len(duplicates)
    self.logger.info(f"Found {duplicates_count} duplicates in {len(self.files_for_task)} files")

    # Confirmation is requested only when there is something to delete.
    if duplicates_count:
        if self.confirm_removing():
            self.remove_all(duplicates)

    wait(logger=self.logger, timeout=self.sleep)