sjoerdk/anonapi

View on GitHub
anonapi/selection.py

Summary

Maintainability
A
45 mins
Test Coverage
"""Functions to filter and select files from folders.
Useful for example for selecting only DICOM files in a folder.
"""
import re
from fnmatch import fnmatch
from pathlib import Path

from pydicom.misc import is_dicom

from tqdm import tqdm

from fileselection.fileselection import FileSelectionFile, FileSelectionFolder

from anonapi.logging import get_module_logger

logger = get_module_logger(__name__)


class FileFolder:
    """Thin wrapper around a folder path that offers filtered iteration
    over the files found inside it.
    """

    def __init__(self, path):
        self.path = Path(path)

    def iterate(
        self,
        pattern="*",
        recurse=True,
        exclude_patterns=None,
        ignore_dotfiles=True,
    ):
        """Lazily yield the files in this folder that pass all filters.

        Being a generator, this can be wrapped directly in a progress bar.

        Parameters
        ----------
        pattern: str, optional
            Glob file pattern. Default is '*' (match all)
        recurse: bool, optional
            Also search all underlying directories. Default is True
        exclude_patterns: List[str], optional
            Skip any path that matches one of these patterns. Matching is
            done with fnmatch (unix-style, * as wildcard) against the path
            relative to this folder. Defaults to no exclusions
        ignore_dotfiles: bool, optional
            Skip any file whose name starts with '.'. Default is True

        Returns
        -------
        generator
            Yields a Path for each matching file. Directories and filtered
            out paths are skipped; nothing is yielded for them.

        """
        excluded_patterns = exclude_patterns or []
        glob_pattern = f"**/{pattern}" if recurse else f"{pattern}"

        for candidate in self.path.glob(glob_pattern):
            # glob returns directories as well; only files are of interest
            if not candidate.is_file():
                continue
            if ignore_dotfiles and candidate.name.startswith("."):
                continue
            if any(
                fnmatch(candidate.relative_to(self.path), excluded)
                for excluded in excluded_patterns
            ):
                continue
            yield candidate


def create_dicom_selection(path, check_dicom=True) -> FileSelectionFile:
    """Find all DICOM files in path (recursive) and save them as a
    FileSelectionFile.

    Parameters
    ----------
    path: PathLike
        Search for DICOM files in the path, recursively. May be a str or a
        Path instance
    check_dicom: bool, optional
        If True, open each file to see whether it is valid DICOM (thorough).
        If False, will only select based on filename (fast). Defaults to True

    Returns
    -------
    FileSelectionFile
        The created file selection that has been saved to disk
    """
    # FileFolder normalises path to a Path instance. Use folder.path from
    # here on, so that plain strings are handled as well as Path objects
    folder = FileFolder(path)
    logger.info(f"Finding all files in {path}")
    files = list(tqdm(folder.iterate()))
    if check_dicom:
        dicom_files = find_dicom_files(files)
    else:
        logger.info(
            f"Found {len(files)} files. Adding all that look like DICOM"
        )
        dicom_files = [x for x in files if looks_like_dicom_file(x)]

    logger.info(f"Found {len(dicom_files)} DICOM files")
    # record dicom files as fileselection. Selected paths are stored relative
    # to the folder that is searched
    selection_folder = FileSelectionFolder(path=folder.path.absolute())
    selection = FileSelectionFile(
        data_file_path=selection_folder.get_data_file_path(),
        description=folder.path.name + " auto-generated by anonapi",
        selected_paths=[x.relative_to(folder.path) for x in dicom_files],
    )
    selection_folder.save_file_selection(selection)
    return selection


def find_dicom_files(files):
    """Keep only those files that pydicom's is_dicom recognises as DICOM

    Shows a progress bar while checking, as each file is inspected in turn.

    Parameters
    ----------
    files: Sequence[str]
        Full file paths to check

    Returns
    -------
    List[str]
        Full file paths for which is_dicom returned a true value, in the
        order in which they were given
    """
    logger.info(f"Found {len(files)} files. Finding out which ones are DICOM")
    return list(filter(is_dicom, tqdm(files)))


def looks_like_dicom_file(path) -> bool:
    """Judge from the file name alone whether this could be a DICOM file.

    Meant as a quick first pass over files to include for deidentification;
    the file itself is never opened.

    A path is accepted when its final extension is '.dcm' or '.dicom'
    (case-insensitive), or when that extension holds nothing but digits.
    A name without any extension is accepted as well, because the empty
    suffix also satisfies the digits-only pattern.
    """
    suffix = Path(path).suffix
    if suffix.lower() in (".dicom", ".dcm"):
        return True
    # only numbers in the extension: this might be a DICOM file
    return re.match(r"^(\.[0-9]*)*$", suffix) is not None