Source code for mcp_server_webcrawl.crawlers.interrobot.crawler


from pathlib import Path

from mcp.types import Tool

from mcp_server_webcrawl.models.sites import SiteResult
from mcp_server_webcrawl.models.resources import (
    RESOURCES_FIELDS_DEFAULT,
    RESOURCES_FIELDS_REQUIRED,
    RESOURCES_DEFAULT_SORT_MAPPING,
)
from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler
from mcp_server_webcrawl.crawlers.interrobot.adapter import (
    get_sites,
    get_resources,
    INTERROBOT_RESOURCE_FIELD_MAPPING,
    INTERROBOT_SITE_FIELD_MAPPING,
    INTERROBOT_SITE_FIELD_REQUIRED,
)
from mcp_server_webcrawl.utils.tools import get_crawler_tools
from mcp_server_webcrawl.utils.logger import get_logger

logger = get_logger()


class InterroBotCrawler(BaseCrawler):
    """
    A crawler implementation for InterroBot data sources.
    Provides functionality for accessing and searching web content from InterroBot.
    """

    def __init__(self, datasrc: Path) -> None:
        """
        Initialize the InterroBotCrawler with a data source path.

        Args:
            datasrc: path to the InterroBot SQLite database (.db) file
        """
        super().__init__(datasrc, get_sites, get_resources,
                resource_field_mapping=INTERROBOT_RESOURCE_FIELD_MAPPING)
        assert datasrc.is_file() and datasrc.suffix == ".db", \
                f"{self.__class__.__name__} datasrc must be a db file"

    async def mcp_list_tools(self) -> list[Tool]:
        """
        List available tools for this crawler.

        Returns:
            List of Tool objects
        """
        # get the default crawler tools, then override necessary fields
        all_sites: list[SiteResult] = self._adapter_get_sites(self._datasrc)
        default_tools: list[Tool] = get_crawler_tools(sites=all_sites)
        assert len(default_tools) == 2, "expected exactly 2 Tools: sites and resources"

        # get_crawler_tools could be replaced or extended; here the defaults
        # are modified in place, which is simpler than parameterizing them
        default_sites_tool: Tool
        default_resources_tool: Tool
        default_sites_tool, default_resources_tool = default_tools

        # this adds the InterroBot-specific robots field
        sites_field_options: list[str] = list(set(INTERROBOT_SITE_FIELD_MAPPING.keys()) -
                set(INTERROBOT_SITE_FIELD_REQUIRED))
        dst_props: dict = default_sites_tool.inputSchema["properties"]
        dst_props["fields"]["items"]["enum"] = sites_field_options

        resources_field_options: list[str] = list(set(RESOURCES_FIELDS_DEFAULT) -
                set(RESOURCES_FIELDS_REQUIRED))
        resources_type_options: list[str] = list(set(INTERROBOT_RESOURCE_FIELD_MAPPING.keys()) -
                set(RESOURCES_FIELDS_REQUIRED))
        resources_sort_options: list[str] = list(RESOURCES_DEFAULT_SORT_MAPPING.keys())
        all_sites_display: str = ", ".join([f"{s.url} (site: {s.id})" for s in all_sites])

        drt_props: dict = default_resources_tool.inputSchema["properties"]
        drt_props["fields"]["items"]["enum"] = resources_field_options
        drt_props["sort"]["enum"] = resources_sort_options
        # the sites filter accepts site (project) IDs, not site field names
        drt_props["sites"]["enum"] = [site.id for site in all_sites]
        drt_props["sites"]["description"] = ("Optional "
                "list of project IDs to filter search results to specific sites. In 95% "
                "of scenarios, you'd filter to only one site, but multi-site filtering "
                f"is offered for advanced search scenarios. Available sites include {all_sites_display}.")

        return [default_sites_tool, default_resources_tool]
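

# A minimal usage sketch (editor's addition, not part of the original
# module), assuming an InterroBot database export exists at the
# hypothetical path "./interrobot.db". Construction validates the .db
# file, and mcp_list_tools is a coroutine, so asyncio.run drives it here.
if __name__ == "__main__":
    import asyncio

    crawler = InterroBotCrawler(Path("./interrobot.db"))
    tools = asyncio.run(crawler.mcp_list_tools())
    for tool in tools:
        # each Tool exposes a name and a JSON Schema describing its inputs,
        # including the field enums narrowed in mcp_list_tools above
        print(tool.name, sorted(tool.inputSchema["properties"].keys()))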