Module: dataset_util

Expand source code

# Copyright (C) 2023-present The Project Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime as dt
from typing import Iterable
from typing import List
from urllib.parse import unquote
from cl.runtime.primitive.date_util import DateUtil
from cl.runtime.primitive.datetime_util import DatetimeUtil
from cl.runtime.records.protocols import TPrimitive


class DatasetUtil:
    """
    Utility class for dataset validation and transformation.

    Dataset can be a list of levels, a backslash-delimited string starting from backslash, or None.
    """

    # Level separator: one backslash character (escaped in the Python literal)
    _sep = "\\"
    # Two separators in a row, which would denote an empty level between them
    _two_sep = "\\\\"

    @classmethod
    def root(cls) -> str:
        """Root dataset consists of a single separator."""
        return cls._sep

    @classmethod
    def to_levels(cls, dataset: str) -> List[str]:
        """
        Convert the dataset from any input format to a list of levels and perform validation.

        Args:
            dataset: Delimited string (the leading separator is optional), an iterable of levels, or None.

        Returns:
            A new list with one normalized string per level; empty list for None or the root dataset.

        Raises:
            RuntimeError: If the dataset is not a string, an iterable, or None.
            Exception: If any individual level fails validation in _normalize_level.
        """

        if dataset is None or dataset == cls._sep:
            return []  # Root dataset has no levels

        if isinstance(dataset, str):
            # Decode URL quoted characters first so that an encoded separator also splits levels,
            # then drop the optional leading separator and split into levels
            dataset = unquote(dataset).removeprefix(cls._sep).split(cls._sep)
        elif not hasattr(dataset, "__iter__"):
            raise RuntimeError(f"Dataset {dataset} is not a delimited string, iterable of strings, or None.")

        # Return the normalized levels rather than the raw input so that non-string levels
        # are converted to strings and one-shot iterables are not returned exhausted
        return [cls._normalize_level(level) for level in dataset]

    @classmethod
    def to_lookup_list(cls, dataset: str) -> List[str]:
        """
        Convert the dataset in any format to a list of datasets in string format.
        Each element of the returned list represents one step in a hierarchical lookup
        starting from the argument dataset and ending with the root dataset.
        """

        # Convert to levels
        levels = cls.to_levels(dataset)

        # Drop one trailing level per step, from all levels down to none (the root dataset)
        return [cls.combine(*levels[:count]) for count in range(len(levels), -1, -1)]

    @classmethod
    def combine(cls, *datasets: TPrimitive | Iterable[TPrimitive] | None) -> str:
        """
        Combine one or more datasets with validation, where each argument may contain more than one level.

        Notes:
            - The arguments may optionally begin from dataset separator (backslash)
            - Arguments that are None are disregarded

        Returns:
            Dataset in string format with a leading separator; the root dataset when there are no levels.
        """

        # Return root dataset if no parameters are passed
        if not datasets:
            return cls._sep

        # Convert each non-None argument to validated levels and merge them in order
        all_levels = [level for arg in datasets if arg is not None for level in cls.to_levels(arg)]

        # Convert to string
        return cls._sep + cls._sep.join(all_levels)

    @classmethod
    def _normalize_str(cls, dataset: str) -> str:
        """
        Normalize a dataset provided in string format by converting URL quoted unicode characters.
        Validates that the dataset consists of backslash delimited levels with leading backslash.

        Raises:
            RuntimeError: If the argument is not a string.
            Exception: If the decoded string violates one of the format rules checked below.
        """

        if not isinstance(dataset, str):
            raise RuntimeError(f"Method DatasetUtil.normalize(str) is applied to non-string dataset {dataset}.")

        # Convert URL quoted unicode characters
        dataset = unquote(dataset)

        if not dataset.startswith(cls._sep):
            raise Exception(f"Dataset '{dataset}' does not start with a backslash separator.")
        if dataset.endswith(cls._sep):
            raise Exception(f"Dataset '{dataset}' must not end with a backslash separator.")
        if cls._two_sep in dataset:
            raise Exception(f"Dataset '{dataset}' contains two backslash separators in a row.")
        # A leading space is impossible here because the string starts with the separator,
        # so only the trailing space needs to be checked
        if dataset.endswith(" "):
            raise Exception(f"Dataset '{dataset}' has a trailing space.")

        return dataset

    @classmethod
    def _normalize_level(cls, dataset_level: TPrimitive | None) -> str:
        """
        Validate and convert input to a single dataset level.

        Raises:
            Exception: If a string level is empty, contains the separator, or has a leading or
                trailing space, or if the level has a type that is not handled here.
        """

        if isinstance(dataset_level, str):
            # Convert URL quoted unicode characters
            dataset_level = unquote(dataset_level)

            # Validate string level format
            if dataset_level == "":
                raise Exception("A dataset level is an empty string.")
            if cls._sep in dataset_level:
                raise Exception(
                    f"Dataset level '{dataset_level}' includes backslash. This is not allowed "
                    f"because backslash also serves as a level separator."
                )
            if dataset_level.startswith(" "):
                raise Exception(f"Dataset level '{dataset_level}' has a leading space.")
            if dataset_level.endswith(" "):
                raise Exception(f"Dataset level '{dataset_level}' has a trailing space.")

            return dataset_level

        elif isinstance(dataset_level, dt.datetime):
            # Check datetime before date: datetime is a subclass of date, so testing date first
            # would send every datetime to the date branch
            # Datetime must be rounded to milliseconds and in UTC timezone
            return DatetimeUtil.to_str(dataset_level)
        elif isinstance(dataset_level, dt.date):
            # Convert to ISO-8601 format for date (yyyy-mm-dd)
            return DateUtil.to_str(dataset_level)
        else:
            # TODO: Add other primitive types
            raise Exception(
                f"Dataset level '{str(dataset_level)}' has type {type(dataset_level)} which is not "
                f"one of the permitted dataset token types or their iterable."
            )

Classes

class DatasetUtil

Utility class for dataset validation and transformation.

Dataset can be a list of levels, a backslash-delimited string starting from backslash, or None.

Expand source code

class DatasetUtil:
    """
    Utility class for dataset validation and transformation.

    Dataset can be a list of levels, a backslash-delimited string starting from backslash, or None.
    """

    # Level separator: one backslash character (escaped in the Python literal)
    _sep = "\\"
    # Two separators in a row, which would denote an empty level between them
    _two_sep = "\\\\"

    @classmethod
    def root(cls) -> str:
        """Root dataset consists of a single separator."""
        return cls._sep

    @classmethod
    def to_levels(cls, dataset: str) -> List[str]:
        """
        Convert the dataset from any input format to a list of levels and perform validation.

        Args:
            dataset: Delimited string (the leading separator is optional), an iterable of levels, or None.

        Returns:
            A new list with one normalized string per level; empty list for None or the root dataset.

        Raises:
            RuntimeError: If the dataset is not a string, an iterable, or None.
            Exception: If any individual level fails validation in _normalize_level.
        """

        if dataset is None or dataset == cls._sep:
            return []  # Root dataset has no levels

        if isinstance(dataset, str):
            # Decode URL quoted characters first so that an encoded separator also splits levels,
            # then drop the optional leading separator and split into levels
            dataset = unquote(dataset).removeprefix(cls._sep).split(cls._sep)
        elif not hasattr(dataset, "__iter__"):
            raise RuntimeError(f"Dataset {dataset} is not a delimited string, iterable of strings, or None.")

        # Return the normalized levels rather than the raw input so that non-string levels
        # are converted to strings and one-shot iterables are not returned exhausted
        return [cls._normalize_level(level) for level in dataset]

    @classmethod
    def to_lookup_list(cls, dataset: str) -> List[str]:
        """
        Convert the dataset in any format to a list of datasets in string format.
        Each element of the returned list represents one step in a hierarchical lookup
        starting from the argument dataset and ending with the root dataset.
        """

        # Convert to levels
        levels = cls.to_levels(dataset)

        # Drop one trailing level per step, from all levels down to none (the root dataset)
        return [cls.combine(*levels[:count]) for count in range(len(levels), -1, -1)]

    @classmethod
    def combine(cls, *datasets: TPrimitive | Iterable[TPrimitive] | None) -> str:
        """
        Combine one or more datasets with validation, where each argument may contain more than one level.

        Notes:
            - The arguments may optionally begin from dataset separator (backslash)
            - Arguments that are None are disregarded

        Returns:
            Dataset in string format with a leading separator; the root dataset when there are no levels.
        """

        # Return root dataset if no parameters are passed
        if not datasets:
            return cls._sep

        # Convert each non-None argument to validated levels and merge them in order
        all_levels = [level for arg in datasets if arg is not None for level in cls.to_levels(arg)]

        # Convert to string
        return cls._sep + cls._sep.join(all_levels)

    @classmethod
    def _normalize_str(cls, dataset: str) -> str:
        """
        Normalize a dataset provided in string format by converting URL quoted unicode characters.
        Validates that the dataset consists of backslash delimited levels with leading backslash.

        Raises:
            RuntimeError: If the argument is not a string.
            Exception: If the decoded string violates one of the format rules checked below.
        """

        if not isinstance(dataset, str):
            raise RuntimeError(f"Method DatasetUtil.normalize(str) is applied to non-string dataset {dataset}.")

        # Convert URL quoted unicode characters
        dataset = unquote(dataset)

        if not dataset.startswith(cls._sep):
            raise Exception(f"Dataset '{dataset}' does not start with a backslash separator.")
        if dataset.endswith(cls._sep):
            raise Exception(f"Dataset '{dataset}' must not end with a backslash separator.")
        if cls._two_sep in dataset:
            raise Exception(f"Dataset '{dataset}' contains two backslash separators in a row.")
        # A leading space is impossible here because the string starts with the separator,
        # so only the trailing space needs to be checked
        if dataset.endswith(" "):
            raise Exception(f"Dataset '{dataset}' has a trailing space.")

        return dataset

    @classmethod
    def _normalize_level(cls, dataset_level: TPrimitive | None) -> str:
        """
        Validate and convert input to a single dataset level.

        Raises:
            Exception: If a string level is empty, contains the separator, or has a leading or
                trailing space, or if the level has a type that is not handled here.
        """

        if isinstance(dataset_level, str):
            # Convert URL quoted unicode characters
            dataset_level = unquote(dataset_level)

            # Validate string level format
            if dataset_level == "":
                raise Exception("A dataset level is an empty string.")
            if cls._sep in dataset_level:
                raise Exception(
                    f"Dataset level '{dataset_level}' includes backslash. This is not allowed "
                    f"because backslash also serves as a level separator."
                )
            if dataset_level.startswith(" "):
                raise Exception(f"Dataset level '{dataset_level}' has a leading space.")
            if dataset_level.endswith(" "):
                raise Exception(f"Dataset level '{dataset_level}' has a trailing space.")

            return dataset_level

        elif isinstance(dataset_level, dt.datetime):
            # Check datetime before date: datetime is a subclass of date, so testing date first
            # would send every datetime to the date branch
            # Datetime must be rounded to milliseconds and in UTC timezone
            return DatetimeUtil.to_str(dataset_level)
        elif isinstance(dataset_level, dt.date):
            # Convert to ISO-8601 format for date (yyyy-mm-dd)
            return DateUtil.to_str(dataset_level)
        else:
            # TODO: Add other primitive types
            raise Exception(
                f"Dataset level '{str(dataset_level)}' has type {type(dataset_level)} which is not "
                f"one of the permitted dataset token types or their iterable."
            )

Static methods

def combine(*datasets: TPrimitive | Iterable[TPrimitive] | None) -> str

Combine one or more datasets with validation, where each argument may contain more than one level.

Notes

- The arguments may optionally begin with the dataset separator (backslash).
- Arguments that are None are disregarded.

def root() -> str

Root dataset consists of a single separator.

def to_levels(dataset: str) -> List[str]

Convert the dataset from any input format to a list of levels and perform validation.

def to_lookup_list(dataset: str) -> List[str]

Convert the dataset in any format to a list of datasets in string format. Each element of the returned list represents one step in a hierarchical lookup starting from the argument dataset and ending with the root dataset.