markusressel/py-image-dedup

View on GitHub
py_image_dedup/config.py

Summary

Maintainability
A
2 hrs
Test Coverage
import os
from datetime import timedelta

from container_app_conf import ConfigBase
from container_app_conf.entry.bool import BoolConfigEntry
from container_app_conf.entry.file import DirectoryConfigEntry
from container_app_conf.entry.float import FloatConfigEntry
from container_app_conf.entry.int import IntConfigEntry
from container_app_conf.entry.list import ListConfigEntry
from container_app_conf.entry.regex import RegexConfigEntry
from container_app_conf.entry.string import StringConfigEntry
from container_app_conf.entry.timedelta import TimeDeltaConfigEntry
from container_app_conf.source.env_source import EnvSource
from container_app_conf.source.yaml_source import YamlSource
from py_range_parse import Range

# Names of the nodes that make up the configuration key paths used by
# DeduplicatorConfig. They are grouped by the section that uses them.

# Root node shared by every key path.
NODE_MAIN = "py_image_dedup"

# Options located directly below the root node.
NODE_DRY_RUN = "dry_run"
NODE_REMOVE_EMPTY_FOLDERS = "remove_empty_folders"

# "daemon" section, including the accepted file observer type values.
NODE_DAEMON = "daemon"
NODE_PROCESSING_TIMEOUT = "processing_timeout"
NODE_FILE_OBSERVER_TYPE = "file_observer"
FILE_OBSERVER_TYPE_POLLING = "polling"
FILE_OBSERVER_TYPE_INOTIFY = "inotify"

# "elasticsearch" section.
NODE_ELASTICSEARCH = "elasticsearch"
NODE_HOST = "host"
NODE_MAX_DISTANCE = "max_distance"
NODE_AUTO_CREATE_INDEX = "auto_create_index"
NODE_INDEX = "index"

# "analysis" section.
NODE_ANALYSIS = "analysis"
NODE_SOURCE_DIRECTORIES = "source_directories"
NODE_RECURSIVE = "recursive"
NODE_SEARCH_ACROSS_ROOT_DIRS = "across_dirs"
NODE_FILE_EXTENSIONS = "file_extensions"
NODE_EXCLUSIONS = "exclusions"
NODE_USE_EXIF_DATA = "use_exif_data"
NODE_THREADS = "threads"

# "deduplication" section.
NODE_DEDUPLICATION = "deduplication"
NODE_PRIORITIZATION_RULES = "prioritization_rules"
NODE_MAX_FILE_MODIFICATION_TIME_DIFF = "max_file_modification_time_diff"
NODE_DUPLICATES_TARGET_DIRECTORY = "duplicates_target_directory"

# "stats" section. NODE_PORT is also used by the "elasticsearch" section.
NODE_STATS = "stats"
NODE_ENABLED = "enabled"
NODE_PORT = "port"


class DeduplicatorConfig(ConfigBase):
    """
    Declarative configuration of py-image-dedup.

    Every class attribute below is a config entry whose ``key_path`` places it
    below the ``NODE_MAIN`` root node. Values are read from the data sources
    set up in ``__new__`` (environment variables and a YAML file named
    "py_image_dedup").
    """

    def __new__(cls, *args, **kwargs):
        """
        Create the config object with the fixed set of data sources.

        Positional and keyword arguments are accepted for call compatibility
        but are not forwarded; only ``data_sources`` is passed on to
        ``ConfigBase.__new__``.
        """
        # NOTE(review): how the order of the sources affects precedence is
        # defined by container_app_conf and is not visible here - confirm
        # before reordering.
        data_sources = [
            EnvSource(),
            YamlSource("py_image_dedup"),
        ]
        return super().__new__(cls, data_sources=data_sources)

    # ------------------------------------------------------------------
    # General
    # ------------------------------------------------------------------

    DRY_RUN = BoolConfigEntry(
        description="If enabled no source file will be touched",
        key_path=[
            NODE_MAIN,
            NODE_DRY_RUN,
        ],
        default=True,
    )

    # ------------------------------------------------------------------
    # Elasticsearch backend
    # ------------------------------------------------------------------

    ELASTICSEARCH_HOST = StringConfigEntry(
        description="Hostname of the elasticsearch backend instance to use.",
        key_path=[
            NODE_MAIN,
            NODE_ELASTICSEARCH,
            NODE_HOST,
        ],
        default="127.0.0.1",
    )

    ELASTICSEARCH_PORT = IntConfigEntry(
        description="Port of the elasticsearch backend instance to use.",
        key_path=[
            NODE_MAIN,
            NODE_ELASTICSEARCH,
            NODE_PORT,
        ],
        range=Range(1, 65535),
        default=9200,
    )

    ELASTICSEARCH_MAX_DISTANCE = FloatConfigEntry(
        description="Maximum signature distance [0..1] to query from elasticsearch backend.",
        key_path=[
            NODE_MAIN,
            NODE_ELASTICSEARCH,
            NODE_MAX_DISTANCE,
        ],
        default=0.10,
    )

    ELASTICSEARCH_AUTO_CREATE_INDEX = BoolConfigEntry(
        description="Whether to automatically create an index in the target database.",
        key_path=[
            NODE_MAIN,
            NODE_ELASTICSEARCH,
            NODE_AUTO_CREATE_INDEX,
        ],
        default=True,
    )

    ELASTICSEARCH_INDEX = StringConfigEntry(
        description="The index name to use for storing and querying image analysis data.",
        key_path=[
            NODE_MAIN,
            NODE_ELASTICSEARCH,
            NODE_INDEX,
        ],
        default="images",
    )

    # ------------------------------------------------------------------
    # Analysis
    # ------------------------------------------------------------------

    ANALYSIS_USE_EXIF_DATA = BoolConfigEntry(
        description="Whether to scan for EXIF data or not.",
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_USE_EXIF_DATA,
        ],
        default=True,
    )

    # Required and without a default: the user has to name the directories.
    SOURCE_DIRECTORIES = ListConfigEntry(
        description="Comma separated list of source paths to analyse and deduplicate.",
        item_type=DirectoryConfigEntry,
        item_args={
            "check_existence": True
        },
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_SOURCE_DIRECTORIES,
        ],
        required=True,
        example=[
            "/home/myuser/pictures/"
        ],
    )

    RECURSIVE = BoolConfigEntry(
        description="When set all directories will be recursively analyzed.",
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_RECURSIVE,
        ],
        default=True,
    )

    SEARCH_ACROSS_ROOT_DIRS = BoolConfigEntry(
        description="When set duplicates will be found even if they are located in different root directories.",
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_SEARCH_ACROSS_ROOT_DIRS,
        ],
        default=False,
    )

    FILE_EXTENSION_FILTER = ListConfigEntry(
        description="Comma separated list of file extensions.",
        item_type=StringConfigEntry,
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_FILE_EXTENSIONS,
        ],
        required=True,
        default=[
            ".png",
            ".jpg",
            ".jpeg",
        ],
    )

    EXCLUSIONS = ListConfigEntry(
        description="Comma separated list of regular expression filters.",
        item_type=RegexConfigEntry,
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_EXCLUSIONS,
        ],
        default=[],
    )

    ANALYSIS_THREADS = IntConfigEntry(
        description="Number of threads to use for image analysis phase.",
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_THREADS,
        ],
        # os.cpu_count() returns None when the number of CPUs cannot be
        # determined; fall back to a single thread so the default is always
        # a usable integer. Evaluated once, when the class body is executed.
        default=os.cpu_count() or 1,
    )

    # ------------------------------------------------------------------
    # Deduplication
    # ------------------------------------------------------------------

    MAX_FILE_MODIFICATION_TIME_DELTA = TimeDeltaConfigEntry(
        description="Maximum file modification date difference between multiple "
                    "duplicates to be considered the same image",
        key_path=[
            NODE_MAIN,
            NODE_DEDUPLICATION,
            NODE_MAX_FILE_MODIFICATION_TIME_DIFF,
        ],
        default=None,
        example=timedelta(minutes=5),
    )

    PRIORITIZATION_RULES = ListConfigEntry(
        description="Comma separated list of prioritization rules to use for ordering duplicate "
                    "images before proceeding with the deduplication process.",
        item_type=StringConfigEntry,
        key_path=[
            NODE_MAIN,
            NODE_DEDUPLICATION,
            NODE_PRIORITIZATION_RULES,
        ],
        required=False,
        default=[
            "higher-pixel-count",
            "more-exif-data",
            "bigger-file-size",
            "newer-file-modification-date",
            "smaller-distance",
            "doesnt-contain-copy-in-file-name",
            "longer-file-name",
            "shorter-folder-path",
            "higher-score",
        ],
    )

    # Located directly below the root node, not inside the deduplication node.
    REMOVE_EMPTY_FOLDERS = BoolConfigEntry(
        description="Whether to remove empty folders or not.",
        key_path=[
            NODE_MAIN,
            NODE_REMOVE_EMPTY_FOLDERS,
        ],
        default=False,
    )

    DEDUPLICATOR_DUPLICATES_TARGET_DIRECTORY = DirectoryConfigEntry(
        description="Directory path to move duplicates to instead of deleting them.",
        key_path=[
            NODE_MAIN,
            NODE_DEDUPLICATION,
            NODE_DUPLICATES_TARGET_DIRECTORY,
        ],
        check_existence=True,
        default=None,
        example="/home/myuser/pictures/duplicates/",
    )

    # ------------------------------------------------------------------
    # Daemon
    # ------------------------------------------------------------------

    DAEMON_PROCESSING_TIMEOUT = TimeDeltaConfigEntry(
        description="Time to wait for filesystems changes to settle before analysing.",
        key_path=[
            NODE_MAIN,
            NODE_DAEMON,
            NODE_PROCESSING_TIMEOUT,
        ],
        default="30s",
    )

    DAEMON_FILE_OBSERVER_TYPE = StringConfigEntry(
        description="Type of file observer to use.",
        key_path=[
            NODE_MAIN,
            NODE_DAEMON,
            NODE_FILE_OBSERVER_TYPE,
        ],
        # Alternation of the two supported observer type names.
        regex="|".join([FILE_OBSERVER_TYPE_POLLING, FILE_OBSERVER_TYPE_INOTIFY]),
        default=FILE_OBSERVER_TYPE_POLLING,
        required=True,
    )

    # ------------------------------------------------------------------
    # Statistics
    # ------------------------------------------------------------------

    STATS_ENABLED = BoolConfigEntry(
        description="Whether to enable prometheus statistics or not.",
        key_path=[
            NODE_MAIN,
            NODE_STATS,
            NODE_ENABLED,
        ],
        default=True,
    )

    STATS_PORT = IntConfigEntry(
        description="The port to expose statistics on.",
        key_path=[
            NODE_MAIN,
            NODE_STATS,
            NODE_PORT,
        ],
        # Same bounds as ELASTICSEARCH_PORT so an out-of-range port is
        # rejected by the config entry.
        range=Range(1, 65535),
        default=8000,
    )