Module: char_util

Expand source code
# Copyright (C) 2023-present The Project Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from cl.runtime.primitive.string_util import StringUtil

_DESCRIPTION_MAP = {
    "x00": "Null Byte",
    "n": "Newline",
    "t": "Tab",
    "r": "Carriage Return",
    "ufeff": "UTF-8 BOM",
    "uFFFD": "Unicode replacement character",
    "u201C": "Left Double Quotation Mark",
    "u201D": "Right Double Quotation Mark",
}
"""Map of special unprintable characters to their descriptions."""

_FLAGGED_CHARS = [
    "x00",  # Null Byte
    "uFFFD",  # Unicode replacement character
]
"""List of characters that will trigger an error during normalization."""

_REMOVED_CHARS = [
    "r",  # Carriage Return
    "ufeff",  # UTF-8 BOM
    "u201C",  # Left Double Quotation Mark
    "u201D",  # Right Double Quotation Mark
]
"""List of characters that will be removed during normalization."""

_REPLACED_CHARS = {
    "t": "    ",  # Tab
}
"""List of characters that will be replaced during normalization and their replacements."""

_FLAGGED_CHARS_REGEX = f"[{''.join(_FLAGGED_CHARS)}]"


class CharUtil:
    """Utilities for working with single characters."""

    @classmethod
    def normalize_chars(cls, value: str) -> str:
        """Flag _FLAGGED_CHARS, remove _REMOVED_CHARS and _REPLACED_CHARS."""

        # Do not normalize chars in None or an empty string
        if StringUtil.is_empty(value):
            return value

        # Search for flagged characters
        flagged_chars = list(set(re.findall(_FLAGGED_CHARS_REGEX, value)))
        if flagged_chars:
            flagged_char_names = ", ".join(CharUtil.describe_char(char) for char in flagged_chars)
            raise RuntimeError(f"The following characters are not allowed in input text: " f"{flagged_char_names}")

        # Create a translation table for replacement
        translation_table = str.maketrans(_REPLACED_CHARS)

        # Apply the translation table to replace characters
        value = value.translate(translation_table)

        # Remove characters from _REMOVED_CHARS by translating them to None
        removal_table = str.maketrans("", "", "".join(_REMOVED_CHARS))

        # Apply the removal translation
        return value.translate(removal_table)

    @classmethod
    def describe_char(cls, char: str) -> str:
        """If the character is in the special map, use its name, otherwise use repr()."""
        return _DESCRIPTION_MAP.get(char, repr(char))

Classes

class CharUtil

Utilities for working with single characters.

Expand source code
class CharUtil:
    """Utilities for working with single characters."""

    @classmethod
    def normalize_chars(cls, value: str) -> str:
        """Flag _FLAGGED_CHARS, remove _REMOVED_CHARS and _REPLACED_CHARS."""

        # Do not normalize chars in None or an empty string
        if StringUtil.is_empty(value):
            return value

        # Search for flagged characters
        flagged_chars = list(set(re.findall(_FLAGGED_CHARS_REGEX, value)))
        if flagged_chars:
            flagged_char_names = ", ".join(CharUtil.describe_char(char) for char in flagged_chars)
            raise RuntimeError(f"The following characters are not allowed in input text: " f"{flagged_char_names}")

        # Create a translation table for replacement
        translation_table = str.maketrans(_REPLACED_CHARS)

        # Apply the translation table to replace characters
        value = value.translate(translation_table)

        # Remove characters from _REMOVED_CHARS by translating them to None
        removal_table = str.maketrans("", "", "".join(_REMOVED_CHARS))

        # Apply the removal translation
        return value.translate(removal_table)

    @classmethod
    def describe_char(cls, char: str) -> str:
        """If the character is in the special map, use its name, otherwise use repr()."""
        return _DESCRIPTION_MAP.get(char, repr(char))

Static methods

def describe_char(char: str) -> str

If the character is in the special map, use its name, otherwise use repr().

def normalize_chars(value: str) -> str

Flag _FLAGGED_CHARS, remove _REMOVED_CHARS and _REPLACED_CHARS.