Source code for pathvalidate._filename

"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""

import itertools
import ntpath
import posixpath
import re
import warnings
from pathlib import Path, PurePath
from typing import Optional, Pattern, Sequence, Tuple

from ._base import AbstractSanitizer, AbstractValidator, BaseFile, BaseValidator
from ._common import findall_to_str, to_str, validate_pathtype
from ._const import DEFAULT_MIN_LEN, INVALID_CHAR_ERR_MSG_TMPL, Platform
from ._types import PathType, PlatformType
from .error import ErrorAttrKey, ErrorReason, InvalidCharError, ValidationError
from .handler import ReservedNameHandler, ValidationErrorHandler


_DEFAULT_MAX_FILENAME_LEN = 255
_RE_INVALID_FILENAME = re.compile(f"[{re.escape(BaseFile._INVALID_FILENAME_CHARS):s}]", re.UNICODE)
_RE_INVALID_WIN_FILENAME = re.compile(
    f"[{re.escape(BaseFile._INVALID_WIN_FILENAME_CHARS):s}]", re.UNICODE
)


class FileNameSanitizer(AbstractSanitizer):
    def __init__(
        self,
        max_len: int = _DEFAULT_MAX_FILENAME_LEN,
        fs_encoding: Optional[str] = None,
        platform: Optional[PlatformType] = None,
        null_value_handler: Optional[ValidationErrorHandler] = None,
        reserved_name_handler: Optional[ValidationErrorHandler] = None,
        additional_reserved_names: Optional[Sequence[str]] = None,
        validate_after_sanitize: bool = False,
        validator: Optional[AbstractValidator] = None,
    ) -> None:
        if validator:
            fname_validator = validator
        else:
            fname_validator = FileNameValidator(
                min_len=DEFAULT_MIN_LEN,
                max_len=max_len,
                fs_encoding=fs_encoding,
                check_reserved=True,
                additional_reserved_names=additional_reserved_names,
                platform=platform,
            )

        super().__init__(
            max_len=max_len,
            fs_encoding=fs_encoding,
            null_value_handler=null_value_handler,
            reserved_name_handler=reserved_name_handler,
            additional_reserved_names=additional_reserved_names,
            platform_max_len=_DEFAULT_MAX_FILENAME_LEN,
            platform=platform,
            validate_after_sanitize=validate_after_sanitize,
            validator=fname_validator,
        )

        self._sanitize_regexp = self._get_sanitize_regexp()

    def sanitize(self, value: PathType, replacement_text: str = "") -> PathType:
        try:
            validate_pathtype(value, allow_whitespaces=not self._is_windows(include_universal=True))
        except ValidationError as e:
            if e.reason == ErrorReason.NULL_NAME:
                if isinstance(value, PurePath):
                    raise

                return self._null_value_handler(e)
            raise

        sanitized_filename = self._sanitize_regexp.sub(replacement_text, str(value))
        sanitized_filename = sanitized_filename[: self.max_len]

        try:
            self._validator.validate(sanitized_filename)
        except ValidationError as e:
            if e.reason == ErrorReason.RESERVED_NAME:
                replacement_word = self._reserved_name_handler(e)
                if e.reserved_name != replacement_word:
                    sanitized_filename = re.sub(
                        re.escape(e.reserved_name), replacement_word, sanitized_filename
                    )
            elif e.reason == ErrorReason.INVALID_CHARACTER and self._is_windows(
                include_universal=True
            ):
                # Do not start a file or directory name with a space
                sanitized_filename = sanitized_filename.lstrip(" ")

                # Do not end a file or directory name with a space or a period
                sanitized_filename = sanitized_filename.rstrip(" ")
                if sanitized_filename not in (".", ".."):
                    sanitized_filename = sanitized_filename.rstrip(" .")
            elif e.reason == ErrorReason.NULL_NAME:
                sanitized_filename = self._null_value_handler(e)

        if self._validate_after_sanitize:
            try:
                self._validator.validate(sanitized_filename)
            except ValidationError as e:
                raise ValidationError(
                    description=str(e),
                    reason=ErrorReason.INVALID_AFTER_SANITIZE,
                    platform=self.platform,
                )

        if isinstance(value, PurePath):
            return Path(sanitized_filename)

        return sanitized_filename

    def _get_sanitize_regexp(self) -> Pattern[str]:
        if self._is_windows(include_universal=True):
            return _RE_INVALID_WIN_FILENAME

        return _RE_INVALID_FILENAME


class FileNameValidator(BaseValidator):
    _WINDOWS_RESERVED_FILE_NAMES = ("CON", "PRN", "AUX", "CLOCK$", "NUL") + tuple(
        f"{name:s}{num:d}" for name, num in itertools.product(("COM", "LPT"), range(1, 10))
    )
    _MACOS_RESERVED_FILE_NAMES = (":",)

    @property
    def reserved_keywords(self) -> Tuple[str, ...]:
        common_keywords = super().reserved_keywords

        if self._is_universal():
            word_set = set(
                common_keywords
                + self._WINDOWS_RESERVED_FILE_NAMES
                + self._MACOS_RESERVED_FILE_NAMES
            )
        elif self._is_windows():
            word_set = set(common_keywords + self._WINDOWS_RESERVED_FILE_NAMES)
        elif self._is_posix() or self._is_macos():
            word_set = set(common_keywords + self._MACOS_RESERVED_FILE_NAMES)
        else:
            word_set = set(common_keywords)

        return tuple(sorted(word_set))

    def __init__(
        self,
        min_len: int = DEFAULT_MIN_LEN,
        max_len: int = _DEFAULT_MAX_FILENAME_LEN,
        fs_encoding: Optional[str] = None,
        platform: Optional[PlatformType] = None,
        check_reserved: bool = True,
        additional_reserved_names: Optional[Sequence[str]] = None,
    ) -> None:
        super().__init__(
            min_len=min_len,
            max_len=max_len,
            fs_encoding=fs_encoding,
            check_reserved=check_reserved,
            additional_reserved_names=additional_reserved_names,
            platform_max_len=_DEFAULT_MAX_FILENAME_LEN,
            platform=platform,
        )

    def validate(self, value: PathType) -> None:
        validate_pathtype(value, allow_whitespaces=not self._is_windows(include_universal=True))

        unicode_filename = to_str(value)
        byte_ct = len(unicode_filename.encode(self._fs_encoding))

        self.validate_abspath(unicode_filename)

        err_kwargs = {
            ErrorAttrKey.REASON: ErrorReason.INVALID_LENGTH,
            ErrorAttrKey.PLATFORM: self.platform,
            ErrorAttrKey.FS_ENCODING: self._fs_encoding,
            ErrorAttrKey.BYTE_COUNT: byte_ct,
        }
        if byte_ct > self.max_len:
            raise ValidationError(
                [
                    f"filename is too long: expected<={self.max_len:d} bytes, actual={byte_ct:d} bytes"
                ],
                **err_kwargs,
            )
        if byte_ct < self.min_len:
            raise ValidationError(
                [
                    f"filename is too short: expected>={self.min_len:d} bytes, actual={byte_ct:d} bytes"
                ],
                **err_kwargs,
            )

        self._validate_reserved_keywords(unicode_filename)
        self.__validate_universal_filename(unicode_filename)

        if self._is_windows(include_universal=True):
            self.__validate_win_filename(unicode_filename)

    def validate_abspath(self, value: str) -> None:
        err = ValidationError(
            description=f"found an absolute path ({value}), expected a filename",
            platform=self.platform,
            reason=ErrorReason.FOUND_ABS_PATH,
        )

        if self._is_windows(include_universal=True):
            if ntpath.isabs(value):
                raise err

        if posixpath.isabs(value):
            raise err

    def __validate_universal_filename(self, unicode_filename: str) -> None:
        match = _RE_INVALID_FILENAME.findall(unicode_filename)
        if match:
            raise InvalidCharError(
                INVALID_CHAR_ERR_MSG_TMPL.format(
                    invalid=findall_to_str(match), value=repr(unicode_filename)
                ),
                platform=Platform.UNIVERSAL,
            )

    def __validate_win_filename(self, unicode_filename: str) -> None:
        match = _RE_INVALID_WIN_FILENAME.findall(unicode_filename)
        if match:
            raise InvalidCharError(
                INVALID_CHAR_ERR_MSG_TMPL.format(
                    invalid=findall_to_str(match), value=repr(unicode_filename)
                ),
                platform=Platform.WINDOWS,
            )

        if unicode_filename in (".", ".."):
            return

        KB2829981_err_tmpl = "{}. Refer: https://learn.microsoft.com/en-us/troubleshoot/windows-client/shell-experience/file-folder-name-whitespace-characters"  # noqa: E501

        if unicode_filename[-1] in (" ", "."):
            raise InvalidCharError(
                INVALID_CHAR_ERR_MSG_TMPL.format(
                    invalid=re.escape(unicode_filename[-1]), value=repr(unicode_filename)
                ),
                platform=Platform.WINDOWS,
                description=KB2829981_err_tmpl.format(
                    "Do not end a file or directory name with a space or a period"
                ),
            )

        if unicode_filename[0] in (" "):
            raise InvalidCharError(
                INVALID_CHAR_ERR_MSG_TMPL.format(
                    invalid=re.escape(unicode_filename[0]), value=repr(unicode_filename)
                ),
                platform=Platform.WINDOWS,
                description=KB2829981_err_tmpl.format(
                    "Do not start a file or directory name with a space"
                ),
            )


[docs] def validate_filename( filename: PathType, platform: Optional[PlatformType] = None, min_len: int = DEFAULT_MIN_LEN, max_len: int = _DEFAULT_MAX_FILENAME_LEN, fs_encoding: Optional[str] = None, check_reserved: bool = True, additional_reserved_names: Optional[Sequence[str]] = None, ) -> None: """Verifying whether the ``filename`` is a valid file name or not. Args: filename: Filename to validate. platform: Target platform name of the filename. .. include:: platform.txt min_len: Minimum byte length of the ``filename``. The value must be greater or equal to one. Defaults to ``1``. max_len: Maximum byte length of the ``filename``. The value must be lower than: - ``Linux``: 4096 - ``macOS``: 1024 - ``Windows``: 260 - ``universal``: 260 Defaults to ``255``. fs_encoding: Filesystem encoding that used to calculate the byte length of the filename. If |None|, get the value from the execution environment. check_reserved: If |True|, check reserved names of the ``platform``. additional_reserved_names: Additional reserved names to check. Case insensitive. Raises: ValidationError (ErrorReason.INVALID_LENGTH): If the ``filename`` is longer than ``max_len`` characters. ValidationError (ErrorReason.INVALID_CHARACTER): If the ``filename`` includes invalid character(s) for a filename: |invalid_filename_chars|. The following characters are also invalid for Windows platforms: |invalid_win_filename_chars|. ValidationError (ErrorReason.RESERVED_NAME): If the ``filename`` equals reserved name by OS. Windows reserved name is as follows: ``"CON"``, ``"PRN"``, ``"AUX"``, ``"NUL"``, ``"COM[1-9]"``, ``"LPT[1-9]"``. Example: :ref:`example-validate-filename` See Also: `Naming Files, Paths, and Namespaces - Win32 apps | Microsoft Docs <https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file>`__ """ FileNameValidator( platform=platform, min_len=min_len, max_len=max_len, fs_encoding=fs_encoding, check_reserved=check_reserved, additional_reserved_names=additional_reserved_names, ).validate(filename)
[docs] def is_valid_filename( filename: PathType, platform: Optional[PlatformType] = None, min_len: int = DEFAULT_MIN_LEN, max_len: Optional[int] = None, fs_encoding: Optional[str] = None, check_reserved: bool = True, additional_reserved_names: Optional[Sequence[str]] = None, ) -> bool: """Check whether the ``filename`` is a valid name or not. Args: filename: A filename to be checked. platform: Target platform name of the filename. Example: :ref:`example-is-valid-filename` See Also: :py:func:`.validate_filename()` """ return FileNameValidator( platform=platform, min_len=min_len, max_len=-1 if max_len is None else max_len, fs_encoding=fs_encoding, check_reserved=check_reserved, additional_reserved_names=additional_reserved_names, ).is_valid(filename)
[docs] def sanitize_filename( filename: PathType, replacement_text: str = "", platform: Optional[PlatformType] = None, max_len: Optional[int] = _DEFAULT_MAX_FILENAME_LEN, fs_encoding: Optional[str] = None, check_reserved: Optional[bool] = None, null_value_handler: Optional[ValidationErrorHandler] = None, reserved_name_handler: Optional[ValidationErrorHandler] = None, additional_reserved_names: Optional[Sequence[str]] = None, validate_after_sanitize: bool = False, ) -> PathType: """Make a valid filename from a string. To make a valid filename, the function does the following: - Replace invalid characters as file names included in the ``filename`` with the ``replacement_text``. Invalid characters are: - unprintable characters - |invalid_filename_chars| - for Windows (or universal) only: |invalid_win_filename_chars| - Replace a value if a sanitized value is a reserved name by operating systems with a specified handler by ``reserved_name_handler``. Args: filename: Filename to sanitize. replacement_text: Replacement text for invalid characters. Defaults to ``""``. platform: Target platform name of the filename. .. include:: platform.txt max_len: Maximum byte length of the ``filename``. Truncate the name length if the ``filename`` length exceeds this value. Defaults to ``255``. fs_encoding: Filesystem encoding that used to calculate the byte length of the filename. If |None|, get the value from the execution environment. check_reserved: [Deprecated] Use 'reserved_name_handler' instead. null_value_handler: Function called when a value after sanitization is an empty string. You can specify predefined handlers: - :py:func:`~.handler.NullValueHandler.return_null_string` - :py:func:`~.handler.NullValueHandler.return_timestamp` - :py:func:`~.handler.raise_error` Defaults to :py:func:`.handler.NullValueHandler.return_null_string` that just return ``""``. reserved_name_handler: Function called when a value after sanitization is a reserved name. You can specify predefined handlers: - :py:meth:`~.handler.ReservedNameHandler.add_leading_underscore` - :py:meth:`~.handler.ReservedNameHandler.add_trailing_underscore` - :py:meth:`~.handler.ReservedNameHandler.as_is` - :py:func:`~.handler.raise_error` Defaults to :py:func:`.handler.add_trailing_underscore`. additional_reserved_names: Additional reserved names to sanitize. Case insensitive. validate_after_sanitize: Execute validation after sanitization to the file name. Returns: Same type as the ``filename`` (str or PathLike object): Sanitized filename. Raises: ValueError: If the ``filename`` is an invalid filename. Example: :ref:`example-sanitize-filename` """ if check_reserved is not None: warnings.warn( "'check_reserved' is deprecated. Use 'reserved_name_handler' instead.", DeprecationWarning, ) if check_reserved is False: reserved_name_handler = ReservedNameHandler.as_is return FileNameSanitizer( platform=platform, max_len=-1 if max_len is None else max_len, fs_encoding=fs_encoding, null_value_handler=null_value_handler, reserved_name_handler=reserved_name_handler, additional_reserved_names=additional_reserved_names, validate_after_sanitize=validate_after_sanitize, ).sanitize(filename, replacement_text)