Source code for mcp_server_webcrawl.extras.regex

import re

from functools import lru_cache
from typing import Final
from logging import Logger

from mcp_server_webcrawl.utils.logger import get_logger

__REGEX_PATTERNS_REGEX_HAZARDS: Final[list[str]] = [
    r"\([^)]*\*[^)]*\+",                   # (.*)*+, (.+)*+, etc.
    r"\([^)]*\+[^)]*\*",                   # (.+)*., (.*)++, etc.
    r"\([^)]*\+[^)]*\+",                   # (.+)+, (.++)+ etc.
    r"\([^)]*\*[^)]*\*",                   # (.*)*, (.**) etc.
    r"\.\*.*\.\*",                         # .*.* patterns
    r"\.\+.*\.\+",                         # .+.+ patterns
    r"\([^)]*\?\)\*",                      # (a?)* patterns
    r"\([^)]*\?\)\+",                      # (a?)+ patterns
    r"\([^)]*[*+?][^)]*[*+?][^)]*\)[*+]",  # 2+ quantifiers inside, then quantifier outside
]

logger: Logger = get_logger()

@lru_cache(maxsize=None)
def __get_compiled_hazard_patterns():
    """
    Lazy load compiled patterns
    """
    compiled_patterns = []
    for hazard in __REGEX_PATTERNS_REGEX_HAZARDS:
        try:
            compiled_patterns.append(re.compile(hazard))
        except re.error as e:
            logger.warning(f"Invalid hazard pattern {hazard}: {e}")
            continue
    return compiled_patterns

def __regex_is_hazardous(pattern: str) -> bool:
    """
    Check if a regex pattern might cause catastrophic backtracking
    or otherwise unacceptable performance over up to 100 HTML files
    """

    compiled_hazards = __get_compiled_hazard_patterns()

    for hazard_pattern in compiled_hazards:
        try:
            if hazard_pattern.search(pattern):
                logger.error(f"hazardous regex discarded {pattern} matched {hazard_pattern.pattern}")
                return True
        except re.error as e:
            logger.warning(f"Error checking hazard pattern {hazard_pattern.pattern}: {e}")
            continue

    return False


[docs]
def get_regex(headers: str, content: str, patterns: list[str]) -> list[dict[str, str | int]]:
    """
    Takes headers and content and gets regex matches

    Arguments:
        headers: The headers to search
        content: The content to search
        patterns: The regex patterns

    Returns:
        A list of dicts, with selector, value, groups, position info, and source
    """

    if not isinstance(content, str):
        content = ""
    if not isinstance(headers, str):
        headers = ""

    if not isinstance(patterns, list) or not all(isinstance(item, str) for item in patterns):
        raise ValueError("patterns must be a list of strings")

    results = []

    if content == "" and headers == "":
        return results

    re_patterns = []
    for pattern in patterns:
        if __regex_is_hazardous(pattern):
            logger.warning(f"Hazardous regex pattern '{pattern}'")
            continue

        try:
            re_pattern = re.compile(pattern)
            re_patterns.append(re_pattern)
        except re.error as ex:
            logger.warning(f"Invalid regex pattern '{pattern}': {ex}")
            continue

    # search headers and content
    search_targets = [("headers", headers), ("content", content)]

    for re_pattern in re_patterns:
        for source_name, search_text in search_targets:
            if not search_text:
                continue

            for match in re_pattern.finditer(search_text):
                regex_hit: dict[str, str | int] = {
                    "selector": re_pattern.pattern,
                    "value": match.group(0),
                    "source": source_name  # headers or content
                }

                if match.groups():
                    for i, group in enumerate(match.groups(), 1):
                        if group is not None:
                            regex_hit[f"group_{i}"] = group

                regex_hit["start"] = match.start()
                regex_hit["end"] = match.end()
                results.append(regex_hit)

    return results