Source code for mcp_server_webcrawl.utils.tools

from mcp.types import Tool

from mcp_server_webcrawl.models.resources import (
    ResourceResultType,
    RESOURCES_FIELDS_DEFAULT,
    RESOURCES_FIELDS_REQUIRED,
    RESOURCES_SORT_OPTIONS_DEFAULT,
    RESOURCES_TOOL_NAME,
)
from mcp_server_webcrawl.models.sites import (
    SiteResult,
    SITES_FIELDS_DEFAULT,
    SITES_FIELDS_REQUIRED,
    SITES_TOOL_NAME,
)

def _optional_fields(defaults, required) -> list:
    """Return the entries of *defaults* not in *required*, de-duplicated, in declared order."""
    excluded = set(required)
    # dict.fromkeys de-duplicates while keeping first-seen order, unlike a
    # set difference whose ordering for strings varies between interpreter runs
    return [field for field in dict.fromkeys(defaults) if field not in excluded]


def get_crawler_tools(sites: list[SiteResult] | None = None) -> list[Tool]:
    """
    Generate crawler tools based on available sites.

    Builds two Tool definitions: one named SITES_TOOL_NAME for listing sites,
    and one named RESOURCES_TOOL_NAME for searching resources. The selectable
    field enums are the default fields minus the required fields, kept in the
    order the defaults declare them so the generated schema is stable.

    Args:
        sites: optional list of site results to include in tool descriptions;
            each entry's url and id are listed in the resources tool's "sites"
            description. When None or empty, that sentence is omitted.

    Returns:
        List of Tool objects for sites and resources
    """

    # you'd think maybe pass these in, but no, descriptions will also require tweaking
    # each crawler having its own peculiarities -- just let the subclass hack this
    # into whatever misshapen ball of clay it needs to be

    sites_field_options = _optional_fields(SITES_FIELDS_DEFAULT, SITES_FIELDS_REQUIRED)
    resources_field_options = _optional_fields(RESOURCES_FIELDS_DEFAULT, RESOURCES_FIELDS_REQUIRED)
    resources_type_options = list(ResourceResultType.values())
    resources_sort_options = RESOURCES_SORT_OPTIONS_DEFAULT

    # only advertise available sites when there are any; otherwise the
    # description would end with a dangling "Available sites include ."
    if sites:
        sites_display = ", ".join([f"{s.url} (site: {s.id})" for s in sites])
        sites_hint = f" Available sites include {sites_display}."
    else:
        sites_hint = ""

    tools = [
        Tool(
            name=SITES_TOOL_NAME,
            description="Retrieves a list of sites (project websites or crawl directories).",
            inputSchema={
                "type": "object",
                "properties": {
                    "ids": {
                        "type": "array",
                        "items": {"type": "integer"},
                        "description": "List of project IDs to retrieve. Leave empty for all projects."
                    },
                    "fields": {
                        "type": "array",
                        "items": {
                            "type": "string",
                            "enum": sites_field_options
                        },
                        "description": ("List of additional fields to include in the response beyond the defaults "
                            "(id, url). Empty list means default fields only. Options include created (ISO 8601), "
                            "modified (ISO 8601), and norobots (str).")
                    }
                },
                "required": []
            },
        ),
        Tool(
            name=RESOURCES_TOOL_NAME,
            description=
                ("Searches for resources (webpages, images, CSS, JS, etc.) across web crawler projects and "
                "retrieves specified fields. "
                "Supports boolean queries and field searching, along with site filtering to "
                "filter with fine control. "
                "To find a site homepage or index of a site, query type: html with sort='+modified' and a limit of 1. "
                "Most sites indexed by this tool will be small to moderately sized websites. "
                "Don't assume most keywords will generate results; start broader on first search (until you have a feel for results). "
                "A vital aspect of this API is field control; you can open up the limit wide when dealing with lightweight "
                "fields and dial way back when using larger fields, like content. Adjust dynamically. The best strategy "
                "balances preserving the user's context window while minimizing number of queries necessary to answer their question."
                ),
            inputSchema={
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": ("The query field is the workhorse of the API and supports fulltext boolean queries "
                            "along with field searching using the name: value pattern. "
                            "Fields supported include page/resource id as id: <resource_id|int> (OR together for multiple docs), "
                            "HTTP status as status: <code|int>, URL as url: <url|str>, and content type as type: <type|str>. "
                            f"Valid types include ({', '.join(resources_type_options)}). "
                            "Additionally, headers as headers: <term|str> and content as content: <term|str> can be "
                            "searched specifically. You would only search content when fulltext search is diluted by other field hits. "
                            "For the status field, numerical operators are supported, e.g. status: >=400. "
                            "For the url and type fields, along with fulltext search terms (fieldless), FTS5 stem* suffix "
                            "wildcarding is enabled. An empty query returns all results. "
                            "A query MUST use one of these formats: (1) empty query for unfiltered results, (2) single keyword, "
                            "(3) quoted phrase: \"keyword1 keyword2\", (4) "
                            "explicit AND: keyword1 AND type: html, (5) explicit OR: keyword1 OR keyword2, or (6) advanced boolean: "
                            "(keyword1 OR keyword2) AND (status: 200 AND type: html). "
                            "The search index does not support stemming, use wildcards (keyword*), or the boolean OR and your "
                            "imagination instead."
                        )
                    },
                    "sites": {
                        "type": "array",
                        "items": {"type": "integer"},
                        "description": ("Optional list of project ID to filter search results to a specific site. In 95% "
                            "of scenarios, you'd filter to only one site, but multiple site filtering is offered for "
                            f"advanced search scenarios.{sites_hint}")
                    },
                    "fields": {
                        "type": "array",
                        "items": {
                            "type": "string",
                            "enum": resources_field_options
                        },
                        "description": ("List of additional fields to include in the response beyond the defaults "
                            f"({', '.join(resources_field_options)}). Empty list means default fields only. "
                            "The content field can lead to large results and should be used judiciously with LIMIT.")
                    },
                    "sort": {
                        "type": "string",
                        "enum": resources_sort_options,
                        "description": ("Sort order for results. Prefixed with + for ascending, - for descending. "
                            "? is a special option for random sort, useful in statistical sampling.")
                    },
                    "limit": {
                        "type": "integer",
                        "description": "Maximum number of results to return. Default is 20, max is 100."
                    },
                    "offset": {
                        "type": "integer",
                        "description": "Number of results to skip for pagination. Default is 0."
                    },
                    "extras": {
                        "type": "array",
                        "items": {
                            "type": "string",
                            "enum": ["thumbnails", "markdown", "snippets"]
                        },
                        "description": ("Optional array of extra features to include in results. Available options include:\n"
                            "- 'thumbnails': generates base64 encoded thumbnails for image resources that can be viewed and "
                            "analyzed by AI models. Enables image description, content analysis, and visual understanding while "
                            "keeping token output minimal. Only works for image "
                            "(img) types, which can be filtered using `type: img` in queries. SVG is not supported.\n"
                            "- 'markdown': transforms the HTML content field into concise markdown, "
                            "reducing token usage and improving readability for LLMs.\n"
                            "- 'snippets': matches fulltext queries to contextual keyword usage within the content. When "
                            "used without requesting the content field (or markdown extra), it can provide an efficient means "
                            "of refining a search without pulling down the complete page contents. Also great for rendering "
                            "old school hit-highlighted results as a list, like Google search in 1999. Works with HTML, CSS, JS, "
                            "or any text-based, crawled file.")
                    },
                },
                "required": []
            },
        ),
    ]

    return tools