dmyersturnbull/pocketutils

View on GitHub
src/pocketutils/tools/path_tools.py

Summary

Maintainability
B
5 hrs
Test Coverage
# SPDX-FileCopyrightText: Copyright 2020-2023, Contributors to pocketutils
# SPDX-PackageHomePage: https://github.com/dmyersturnbull/pocketutils
# SPDX-License-Identifier: Apache-2.0
"""

"""

import logging
import os
import re
import sys
from collections.abc import Callable, Sequence
from copy import copy
from dataclasses import dataclass
from pathlib import Path, PurePath
from typing import Any, Self

from pocketutils import ValueIllegalError

# Public API: the helper class and the shared instance created at the bottom of the module.
__all__ = ["PathUtils", "PathTools"]

# Package-wide logger; sanitize_path() uses it to report paths it had to change.
logger = logging.getLogger("pocketutils")

# Characters that sanitize_node() replaces with "_":
# reserved punctuation, code points 0-31 (which include the tab), and code points 128-160.
_bad_chars = (
    set('<>:"|?*\\/')
    | {chr(code) for code in range(32)}
    | {chr(code) for code in range(128, 128 + 33)}
)

# note that we can't call WindowsPath.is_reserved because it can't be instantiated on non-Linux
# also, these appear to be different from the ones defined there

# don't handle Long UNC paths
# also cannot be blank or whitespace
# the $ suffixed ones are for FAT
# no CLOCK$, even with an ext
# also no SCREEN$
# Reserved device names (upper-case); sanitize_node() compares them against
# the upper-cased node and against the upper-cased node without its extension.
_bad_strs = {
    "CON",
    "PRN",
    "AUX",
    "NUL",
    *(f"COM{port}" for port in range(1, 10)),
    *(f"LPT{port}" for port in range(1, 10)),
}
# Superset used when fat=True: the names above plus the FAT-only ones.
_bad_strs_fat = _bad_strs | {"$IDLE$", "CONFIG$", "KEYBD$", "SCREEN$", "CLOCK$", "LST"}


@dataclass(slots=True, frozen=True)
class PathUtils:
    def is_path_like(self: Self, value: Any) -> bool:
        return isinstance(value, str | PurePath | os.PathLike)

    def up_dir(self: Self, n: int, *parts) -> Path:
        """
        Get an absolute path `n` parents from `os.getcwd()`.
        Does not sanitize.

        Ex: In dir '/home/john/dir_a/dir_b':
            updir(2, 'dir1', 'dir2')  # returns Path('/home/john/dir1/dir2')
        """
        base = Path(os.getcwd())
        for _ in range(n):
            base = base.parent
        for part in parts:
            base = base / part
        return base.resolve()

    def guess_trash(self: Self) -> Path:
        """
        Chooses a reasonable path for trash based on the OS.
        This is not reliable.
        For a more sophisticated solution, see https://github.com/hsoft/send2trash
        However, even that can fail.
        """
        plat = sys.platform.lower()
        if "darwin" in plat:
            return Path.home() / ".Trash"
        elif "win" in plat:
            return Path(Path.home().root) / "$Recycle.Bin"
        else:
            return Path.home() / ".trash"

    def sanitize_path(
        self: Self,
        path: PurePath | str,
        *,
        is_file: bool | None = None,
        fat: bool = False,
        trim: bool = False,
        warn: bool | Callable[[str], Any] = True,
    ) -> Path:
        r"""
        Sanitizes a path for major OSes and filesystems.
        Also see sanitize_path_nodes and sanitize_path_node.
        Mostly platform-independent.

        The idea is to sanitize for both Windows and Posix, regardless of the platform in use.
        The sanitization should be as uniform as possible for both platforms.
        This works for at least Windows+NTFS.
        Tilde substitution for long filenames in Windows is unsupported.

        A corner case is drive letters in Linux:
        "C:\\Users\\john" is converted to '/C:/users/john' if os.name=='posix'
        """
        w = {True: logger.warning, False: lambda _: None}.get(warn, warn)
        path = str(path)
        if path.startswith("\\\\?"):
            msg = f"Long UNC Windows paths (\\\\? prefix) are not supported (path '{path}')"
            raise ValueIllegalError(msg, value=str(path))
        bits = str(path).strip().replace("\\", "/").split("/")
        new_nodes = list(self.sanitize_nodes(bits, is_file=is_file, fat=fat, trim=trim))
        # unfortunately POSIX turns Path('C:\', '5') into C:\/5
        # this isn't an ideal way to fix it, but it works
        pat = re.compile(r"^([A-Z]:)\\?$")
        if os.name == "posix" and len(new_nodes) > 0 and pat.fullmatch(new_nodes[0]):
            new_nodes[0] = new_nodes[0].rstrip("\\")
            new_nodes.insert(0, "/")
        new_path = Path(*new_nodes)
        if new_path != path:
            w(f"Sanitized filename {path} → {new_path}")
        return Path(new_path)

    def sanitize_nodes(
        self: Self,
        bits: Sequence[PurePath | str],
        *,
        is_file: bool | None = None,
        fat: bool = False,
        trim: bool = False,
    ) -> Sequence[str]:
        fixed_bits = [
            bit + os.sep
            if i == 0 and bit.strip() in ["", ".", ".."]
            else self.sanitize_node(
                bit,
                is_file=(False if i < len(bits) - 1 else is_file),
                trim=trim,
                fat=fat,
                is_root_or_drive=(None if i == 0 else False),
            )
            for i, bit in enumerate(bits)
            if bit.strip() not in ["", "."]
            or i == 0  # ignore // (empty) just like Path does (but fail on sanitize_path_node(' '))
        ]
        return [bit for i, bit in enumerate(fixed_bits) if i == 0 or bit not in ["", "."]]

    def sanitize_node(
        self: Self,
        bit: PurePath | str,
        *,
        is_file: bool | None = None,
        is_root_or_drive: bool | None = None,
        fat: bool = False,
        trim: bool = False,
    ) -> str:
        r"""
        Sanitizes a path node such that it will be fine for major OSes and filesystems.
        For example:
            - 'plums;and/or;apples' becomes 'plums_and_or_apples' (escaped ; and /)
            - 'null.txt' becomes '_null_.txt' ('null' is forbidden in Windows)
            - 'abc  ' becomes 'abc' (no trailing spaces)

        The behavior is platform-independent -- os, sys, and pathlib are not used.
        For ex, calling sanitize_path_node(r'C:\') returns r'C:\' on both Windows and Linux
        If you want to sanitize a whole path, see sanitize_path instead.

        Args:
            bit: The node
            is_file: False for directories, True otherwise, None if unknown
            is_root_or_drive: True if known to be the root ('/') or a drive ('C:\'), None if unknown
            fat: Also make compatible with FAT filesystems
            trim: Truncate to 254 chars (otherwise fails)
        """
        # since is_file and is_root_or_drive are both Optional[bool], let's be explicit and use 'is' for clarity
        if is_file is True and is_root_or_drive is True:
            msg = "is_file and is_root_or_drive are both true"
            raise ValueIllegalError(msg)
        if is_file is True and is_root_or_drive is None:
            is_root_or_drive = False
        if is_root_or_drive is True and is_file is None:
            is_file = False
        source_bit = copy(str(bit))
        bit = str(bit).strip()
        # first, catch root or drive as long as is_root_or_drive is not false
        # if is_root_or_drive is True (which is a weird call), then fail if it's not
        # otherwise, it's not a root or drive letter, so keep going
        if is_root_or_drive is not False:
            # \ is allowed in Windows
            if bit in ["/", "\\"]:
                return bit
            m = re.compile(r"^([A-Z]:)(?:\\)?$").fullmatch(bit)
            # this is interesting
            # for bit=='C:' and is_root_or_drive=None,
            # it could be either a drive letter
            # or a file path that should be corrected to 'C_'
            # I guess here we're going with a drive letter
            if m is not None:
                # we need C:\ and not C: because:
                # Path('C:\\', '5').is_absolute() is True
                # but Path('C:', '5').is_absolute() is False
                # unfortunately, doing Path('C:\\', '5') on Linux gives 'C:\\/5'
                # I can't handle that here, but sanitize_path() will account for it
                return m.group(1) + "\\"
            if is_root_or_drive is True:
                msg = f"Node '{bit}' is not the root or a drive letter"
                raise ValueIllegalError(msg, value=bit)
        # just dots is invalid
        if set(bit.replace(" ", "")) == "." and bit not in ["..", "."]:
            bit = "_" + bit + "_"
            # raise IllegalPathError(f"Node '{source_bit}' is invalid")
        for q in _bad_chars:
            bit = bit.replace(q, "_")
        bad_strs = _bad_strs_fat if fat else _bad_strs
        if bit.upper() in bad_strs:
            # arbitrary decision
            bit = "_" + bit + "_"
        else:
            stub, ext = os.path.splitext(bit)
            if stub.upper() in bad_strs:
                bit = "_" + stub + "_" + ext
        if bit.strip() == "":
            bit = "_" + bit + "_"
            # raise IllegalPathError(f"Node '{source_bit}' is empty or contains only whitespace")
        # "." cannot end a node
        bit = bit.rstrip()
        if is_file is not True and (bit == "." or bit == ".."):
            return bit
        # never allow '.' or ' ' to end a filename
        bit = bit.rstrip(". ")
        # do this after
        if len(bit) > 254 and trim:
            bit = bit[:254]
        elif len(bit) > 254:
            msg = f"Node '{source_bit}' has more than 254 characters"
            raise ValueIllegalError(msg, value=bit)
        return bit


# Shared ready-to-use instance; PathUtils declares no fields, so a single instance is enough.
PathTools = PathUtils()