Module: json_response_util

Expand source code

# Copyright (C) 2023-present The Project Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import regex as re


class JsonResponseUtil:
    """Helper methods for parsing JSON response from LLMs."""

    @classmethod
    def normalize_unescaped_quotes_and_load_json_str(cls, s: str, strict: bool = False) -> str:
        """Escape stray double quotes until the string parses as JSON.

        The string is parsed repeatedly. After each failure, the closest double
        quote before the reported error position is escaped with a backslash and
        parsing is retried. The number of attempts is bounded by the number of
        double quotes in the input.

        Args:
            s: Candidate JSON text.
            strict: When True, raise instead of returning ``"{}"`` if the
                string cannot be repaired.

        Returns:
            The (possibly repaired) string that ``json.loads`` accepts, or
            ``"{}"`` when repair fails and ``strict`` is False.

        Raises:
            json.JSONDecodeError: If ``s`` contains no double quotes at all and
                is not valid JSON.
            ValueError: If repair fails and ``strict`` is True.
        """
        js_str = s
        max_replaces = s.count('"')

        if not max_replaces:
            # Nothing to escape: either it parses as-is or the decode error propagates.
            json.loads(js_str)
            return js_str

        prev_pos = -1
        curr_pos = 0

        while curr_pos > prev_pos and max_replaces > 0:
            max_replaces -= 1
            # Remember the last error position before it is overwritten below.
            prev_pos = curr_pos
            try:
                json.loads(js_str)
                return js_str
            except json.JSONDecodeError as err:
                curr_pos = err.pos
                if curr_pos <= prev_pos:
                    # The error position did not advance, so escaping is not helping.
                    break

                # Find the closest double quote before the error position.
                prev_quote_index = js_str.rfind('"', 0, curr_pos)
                if prev_quote_index == -1:
                    # No quote precedes the error, so there is nothing to escape.
                    break
                if prev_quote_index > 0 and js_str[prev_quote_index - 1] == "\\":
                    # That quote is already escaped; give up on further repair.
                    break

                # Escape the offending quote and try again.
                js_str = js_str[:prev_quote_index] + "\\" + js_str[prev_quote_index:]

        if strict:
            raise ValueError("Unable to normalize unescaped quotes in the provided string to a valid JSON!")
        return "{}"

    @classmethod
    def extract_json_content(cls, json_string: str) -> str:
        """Strip any text surrounding the JSON object or array in a string.

        The span from the first ``{`` to the last ``}`` is treated as the JSON
        object. If a ``[`` precedes the first ``{`` and a ``]`` follows the last
        ``}``, the span between those square brackets is returned instead (an
        array). If only one of the two square brackets is present, the object
        span is returned.

        Args:
            json_string: Text that may wrap JSON in extra content, e.g. a
                Markdown code fence.

        Returns:
            The extracted substring, or ``json_string`` unchanged when it has no
            ``{`` ... ``}`` pair in that order.
        """
        first_curly_bracket = json_string.find("{")
        last_curly_bracket = json_string.rfind("}")

        # Without an opening brace followed by a closing brace there is nothing to extract.
        if first_curly_bracket == -1 or last_curly_bracket < first_curly_bracket:
            return json_string

        first_square_bracket = json_string.find("[", 0, first_curly_bracket)
        last_square_bracket = json_string.rfind("]", last_curly_bracket)

        if first_square_bracket != -1 and last_square_bracket != -1:
            # Both square brackets wrap the braces: it is an array.
            return json_string[first_square_bracket : last_square_bracket + 1]

        # Zero or one stray square bracket around the braces: treat it as an object.
        return json_string[first_curly_bracket : last_curly_bracket + 1]

    @classmethod
    def fix_json_format(cls, json_string: str) -> str:
        """Fix JSON quotes and values.

        Applies a fixed sequence of textual repairs: strips text around the
        JSON payload, converts single-quoted strings to double-quoted ones
        while keeping apostrophes, lowercases ``True``/``False``, quotes bare
        ``None`` values, drops a trailing comma before ``}``, and undoes a few
        spurious backslash escapes.

        Args:
            json_string: Raw text expected to contain JSON-like content.

        Returns:
            The repaired text. It is not validated here.
        """
        # Remove all the symbols before the first opened bracket and after the last closed,
        # in particular, '''json<actual_json>''' wrap.
        fixed_json_string = cls.extract_json_content(json_string)

        temp_str = "__TEMP__"

        # Protect apostrophes used in contractions or possessive forms with a placeholder,
        # so that the blanket single-to-double quote replacement below leaves them alone.
        #
        # sample_string = 'He's going to the doctor's office on his brothers' motorcycle.'
        # fixed_string  = 'He{temp_str}s going to the doctor{temp_str}s office on his brothers{temp_str} motorcycle.'
        fixed_json_string = re.sub(r"(?<=\w)'(?=\w)", temp_str, fixed_json_string)  # It's
        fixed_json_string = re.sub(r"(?<=\w)\\'(?=\w)", temp_str, fixed_json_string)  # It\'s
        fixed_json_string = re.sub(r"(?<=s)'(?=\s)", temp_str, fixed_json_string)  # Its'_  (_ is space)
        fixed_json_string = re.sub(r"(?<=s)\\'(?=\s)", temp_str, fixed_json_string)  # Its\'_  (_ is space)

        # Replace all remaining single quotes with double quotes
        fixed_json_string = fixed_json_string.replace("'", '"')

        # Replace the placeholder back to single quotes
        fixed_json_string = fixed_json_string.replace(temp_str, "'")

        # Fix boolean and None values.
        # NOTE: these are plain substring replacements, so they also affect text inside values.
        fixed_json_string = fixed_json_string.replace("True", "true")
        fixed_json_string = fixed_json_string.replace("False", "false")
        fixed_json_string = fixed_json_string.replace('"None"', "None")
        # Quote a bare None only when it directly follows a `"key":` prefix.
        # A plain `replace("None", '"None"')` won't work, because an LLM output value
        # may contain something like: "None of the above".
        # A capture group is used instead of a variable-width lookbehind so the
        # pattern is accepted by both the `regex` package and the standard `re` module.
        fixed_json_string = re.sub(r'("[^"]+":\s*)None', r'\1"None"', fixed_json_string)

        # Fix trailing comma before closing brace
        fixed_json_string = re.sub(r",\s*}", " }", fixed_json_string)

        # Fix unnecessary escaped underscores
        fixed_json_string = fixed_json_string.replace("\\_", "_")

        # Turn a literal backslash-n sequence into a real newline
        fixed_json_string = fixed_json_string.replace("\\n", "\n")

        # Turn escaped double quotes into single quotes
        fixed_json_string = fixed_json_string.replace('\\"', "'")

        return fixed_json_string

    @classmethod
    def try_to_load_json_string(cls, json_string: str) -> str:
        """Return a string that parses as JSON, repairing unescaped quotes if needed.

        Args:
            json_string: Candidate JSON text.

        Returns:
            ``json_string`` itself when it already parses, otherwise the variant
            with normalized quotes.

        Raises:
            json.JSONDecodeError: The error from parsing the original string,
                when quote normalization cannot repair it either.
        """
        try:
            # Try to load the provided json string firstly
            json.loads(json_string)
            return json_string
        except json.JSONDecodeError:
            # In case of exception, try to normalize quotes and load it once again
            try:
                normalized_string = cls.normalize_unescaped_quotes_and_load_json_str(json_string, strict=True)
                json.loads(normalized_string)
                return normalized_string
            except ValueError:
                # LLM might not fully understand an exception message that is raised in
                # normalize_unescaped_quotes_and_load_json_str function.
                # For this reason, we suppress this exception and raise on the original one
                pass
            # Bare raise re-raises the JSONDecodeError currently being handled.
            raise

Classes

class JsonResponseUtil

Helper methods for parsing JSON response from LLMs.

Expand source code

class JsonResponseUtil:
    """Helper methods for parsing JSON response from LLMs."""

    @classmethod
    def normalize_unescaped_quotes_and_load_json_str(cls, s: str, strict: bool = False) -> str:
        """Escape stray double quotes until the string parses as JSON.

        The string is parsed repeatedly. After each failure, the closest double
        quote before the reported error position is escaped with a backslash and
        parsing is retried. The number of attempts is bounded by the number of
        double quotes in the input.

        Args:
            s: Candidate JSON text.
            strict: When True, raise instead of returning ``"{}"`` if the
                string cannot be repaired.

        Returns:
            The (possibly repaired) string that ``json.loads`` accepts, or
            ``"{}"`` when repair fails and ``strict`` is False.

        Raises:
            json.JSONDecodeError: If ``s`` contains no double quotes at all and
                is not valid JSON.
            ValueError: If repair fails and ``strict`` is True.
        """
        js_str = s
        max_replaces = s.count('"')

        if not max_replaces:
            # Nothing to escape: either it parses as-is or the decode error propagates.
            json.loads(js_str)
            return js_str

        prev_pos = -1
        curr_pos = 0

        while curr_pos > prev_pos and max_replaces > 0:
            max_replaces -= 1
            # Remember the last error position before it is overwritten below.
            prev_pos = curr_pos
            try:
                json.loads(js_str)
                return js_str
            except json.JSONDecodeError as err:
                curr_pos = err.pos
                if curr_pos <= prev_pos:
                    # The error position did not advance, so escaping is not helping.
                    break

                # Find the closest double quote before the error position.
                prev_quote_index = js_str.rfind('"', 0, curr_pos)
                if prev_quote_index == -1:
                    # No quote precedes the error, so there is nothing to escape.
                    break
                if prev_quote_index > 0 and js_str[prev_quote_index - 1] == "\\":
                    # That quote is already escaped; give up on further repair.
                    break

                # Escape the offending quote and try again.
                js_str = js_str[:prev_quote_index] + "\\" + js_str[prev_quote_index:]

        if strict:
            raise ValueError("Unable to normalize unescaped quotes in the provided string to a valid JSON!")
        return "{}"

    @classmethod
    def extract_json_content(cls, json_string: str) -> str:
        """Strip any text surrounding the JSON object or array in a string.

        The span from the first ``{`` to the last ``}`` is treated as the JSON
        object. If a ``[`` precedes the first ``{`` and a ``]`` follows the last
        ``}``, the span between those square brackets is returned instead (an
        array). If only one of the two square brackets is present, the object
        span is returned.

        Args:
            json_string: Text that may wrap JSON in extra content, e.g. a
                Markdown code fence.

        Returns:
            The extracted substring, or ``json_string`` unchanged when it has no
            ``{`` ... ``}`` pair in that order.
        """
        first_curly_bracket = json_string.find("{")
        last_curly_bracket = json_string.rfind("}")

        # Without an opening brace followed by a closing brace there is nothing to extract.
        if first_curly_bracket == -1 or last_curly_bracket < first_curly_bracket:
            return json_string

        first_square_bracket = json_string.find("[", 0, first_curly_bracket)
        last_square_bracket = json_string.rfind("]", last_curly_bracket)

        if first_square_bracket != -1 and last_square_bracket != -1:
            # Both square brackets wrap the braces: it is an array.
            return json_string[first_square_bracket : last_square_bracket + 1]

        # Zero or one stray square bracket around the braces: treat it as an object.
        return json_string[first_curly_bracket : last_curly_bracket + 1]

    @classmethod
    def fix_json_format(cls, json_string: str) -> str:
        """Fix JSON quotes and values.

        Applies a fixed sequence of textual repairs: strips text around the
        JSON payload, converts single-quoted strings to double-quoted ones
        while keeping apostrophes, lowercases ``True``/``False``, quotes bare
        ``None`` values, drops a trailing comma before ``}``, and undoes a few
        spurious backslash escapes.

        Args:
            json_string: Raw text expected to contain JSON-like content.

        Returns:
            The repaired text. It is not validated here.
        """
        # Remove all the symbols before the first opened bracket and after the last closed,
        # in particular, '''json<actual_json>''' wrap.
        fixed_json_string = cls.extract_json_content(json_string)

        temp_str = "__TEMP__"

        # Protect apostrophes used in contractions or possessive forms with a placeholder,
        # so that the blanket single-to-double quote replacement below leaves them alone.
        #
        # sample_string = 'He's going to the doctor's office on his brothers' motorcycle.'
        # fixed_string  = 'He{temp_str}s going to the doctor{temp_str}s office on his brothers{temp_str} motorcycle.'
        fixed_json_string = re.sub(r"(?<=\w)'(?=\w)", temp_str, fixed_json_string)  # It's
        fixed_json_string = re.sub(r"(?<=\w)\\'(?=\w)", temp_str, fixed_json_string)  # It\'s
        fixed_json_string = re.sub(r"(?<=s)'(?=\s)", temp_str, fixed_json_string)  # Its'_  (_ is space)
        fixed_json_string = re.sub(r"(?<=s)\\'(?=\s)", temp_str, fixed_json_string)  # Its\'_  (_ is space)

        # Replace all remaining single quotes with double quotes
        fixed_json_string = fixed_json_string.replace("'", '"')

        # Replace the placeholder back to single quotes
        fixed_json_string = fixed_json_string.replace(temp_str, "'")

        # Fix boolean and None values.
        # NOTE: these are plain substring replacements, so they also affect text inside values.
        fixed_json_string = fixed_json_string.replace("True", "true")
        fixed_json_string = fixed_json_string.replace("False", "false")
        fixed_json_string = fixed_json_string.replace('"None"', "None")
        # Quote a bare None only when it directly follows a `"key":` prefix.
        # A plain `replace("None", '"None"')` won't work, because an LLM output value
        # may contain something like: "None of the above".
        # A capture group is used instead of a variable-width lookbehind so the
        # pattern is accepted by both the `regex` package and the standard `re` module.
        fixed_json_string = re.sub(r'("[^"]+":\s*)None', r'\1"None"', fixed_json_string)

        # Fix trailing comma before closing brace
        fixed_json_string = re.sub(r",\s*}", " }", fixed_json_string)

        # Fix unnecessary escaped underscores
        fixed_json_string = fixed_json_string.replace("\\_", "_")

        # Turn a literal backslash-n sequence into a real newline
        fixed_json_string = fixed_json_string.replace("\\n", "\n")

        # Turn escaped double quotes into single quotes
        fixed_json_string = fixed_json_string.replace('\\"', "'")

        return fixed_json_string

    @classmethod
    def try_to_load_json_string(cls, json_string: str) -> str:
        """Return a string that parses as JSON, repairing unescaped quotes if needed.

        Args:
            json_string: Candidate JSON text.

        Returns:
            ``json_string`` itself when it already parses, otherwise the variant
            with normalized quotes.

        Raises:
            json.JSONDecodeError: The error from parsing the original string,
                when quote normalization cannot repair it either.
        """
        try:
            # Try to load the provided json string firstly
            json.loads(json_string)
            return json_string
        except json.JSONDecodeError:
            # In case of exception, try to normalize quotes and load it once again
            try:
                normalized_string = cls.normalize_unescaped_quotes_and_load_json_str(json_string, strict=True)
                json.loads(normalized_string)
                return normalized_string
            except ValueError:
                # LLM might not fully understand an exception message that is raised in
                # normalize_unescaped_quotes_and_load_json_str function.
                # For this reason, we suppress this exception and raise on the original one
                pass
            # Bare raise re-raises the JSONDecodeError currently being handled.
            raise

Static methods

def extract_json_content(json_string: str) -> str: Remove all symbols up to the first { and all symbols after the last } from a JSON string.
def fix_json_format(json_string: str) -> str: Fix JSON quotes and values.
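def normalize_unescaped_quotes_and_load_json_str(s: str, strict: bool = False) -> str: Escape unescaped double quotes one at a time until the string parses as JSON and return the repaired string. If it cannot be repaired, return "{}", or raise ValueError when strict is True.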
def try_to_load_json_string(json_string: str) -> str: Try to load the provided json string. In case of success, return str, otherwise raise JSONDecodeError.