Source code for mcp_server_webcrawl.utils.tools

from mcp.types import Tool

from mcp_server_webcrawl.models.resources import (
    ResourceResultType,
    RESOURCES_FIELDS_BASE,
    RESOURCES_FIELDS_OPTIONS,
    RESOURCES_DEFAULT_SORT_MAPPING,
    RESOURCES_TOOL_NAME,
)
from mcp_server_webcrawl.models.sites import (
    SiteResult,
    SITES_FIELDS_DEFAULT,
    SITES_FIELDS_BASE,
    SITES_TOOL_NAME,
)

[docs] def get_crawler_tools(sites: list[SiteResult] | None = None): """ Generate crawler tools based on available sites. Args: sites: optional list of site results to include in tool descriptions Returns: List of Tool objects for sites and resources """ # you'd think maybe pass these in, but no, descriptions will also require tweaking # each crawler having its own peculiarities -- just let the subclass hack this # into whatever misshapen ball of clay it needs to be sites_field_options = list(set(SITES_FIELDS_DEFAULT) - set(SITES_FIELDS_BASE)) resources_type_options = list(ResourceResultType.values()) resources_sort_options = list(RESOURCES_DEFAULT_SORT_MAPPING.keys()) sites_display = ", ".join([f"{s.url} (site: {s.id})" for s in sites]) if sites is not None else "" tools = [ Tool( name=SITES_TOOL_NAME, description="Retrieves a list of sites (project websites or crawl directories).", inputSchema={ "type": "object", "properties": { "ids": { "type": "array", "items": {"type": "integer"}, "description": "List of project IDs to retrieve. Leave empty for all projects." }, "fields": { "type": "array", "items": { "enum": sites_field_options }, "description": ("List of additional fields to include in the response beyond the defaults " "(id, url) Empty list means default fields only. Options include created (ISO 8601), " "modified (ISO 8601), and norobots (str).") } }, "required": [] }, ), Tool( name=RESOURCES_TOOL_NAME, description= ("Searches for resources (webpages, images, CSS, JS, etc.) across web crawler projects and " "retrieves specified fields. " "Supports boolean queries and field searching, along with site filtering to " "filter with fine control. " "To find a site homepage reliably, query type: html AND url: example.com (crawled domain) with sort='+url' and a LIMIT of 1. " "This pattern works consistently across all crawlers." "Most sites indexed by this tool will be small to moderately sized websites. " "Don't assume most keywords will generate results; start broader on first search (until you have a feel for results). " "A vital aspect of this API is field control; you can open up the limit wide when dealing with lightweight " "fields and dial way back when using larger fields, like content. Adjust dynamically. The best strategy " "balances preserving the user's context window while minimizing number of queries necessary to answer their question." ), inputSchema={ "type": "object", "properties": { "query": { "type": "string", "description": ("The query field is the workhorse of the API and supports fulltext boolean queries " "along with field searching using the name: value pattern. " "Fields supported include page/resource id as id: <resource_id|int> (OR together for multiple docs), " "HTTP status as status: <code|int>, URL as url: <url|str>, and content type as type: <type|str>. " f"Valid types include ({', '.join(resources_type_options)}). " "Additionally, headers as headers: <term|str> and content as content: <term|str> can be " "searched specifically. You would only search content when fulltext search is diluted by other field hits. " "For the status field, numerical operators are supported, e.g. status: >=400. " "For the url and type fields, along with fulltext search terms (fieldless), FTS5 stem* suffix " "wildcarding is enabled. An empty query returns all results. " "A query MUST use one of these formats: (1) empty query for unfiltered results, (2) single keyword, " "(3) quoted phrase: \"keyword1 keyword2\", (4) " "explicit AND: keyword1 AND type: html, (5) explicit OR: keyword1 OR keyword2, or (6) advanced boolean: " "(keyword1 OR keyword2) AND (status: 200 AND type: html). " "The search index does not support stemming, use wildcards (keyword*), or the boolean OR and your " "imagination instead." ) }, "sites": { "type": "array", "items": {"type": "integer"}, "description": ("List of crawl site IDs to filter search results to a specific site. In most " "scenarios, you should filter to only one site, but multiple site filtering is offered for " f"advanced search scenarios. Available sites include {sites_display}.") }, "fields": { "type": "array", "items": { "enum": RESOURCES_FIELDS_OPTIONS }, "description": ("List of additional fields to include in the response beyond the base fields " f"({', '.join(RESOURCES_FIELDS_BASE)}) returned for all results. " "Empty list means base fields only. Use headers and content to retrieve raw HTTP contents, " "and size to collect file size in bytes. " "The content field can lead to large results and should be used judiciously with LIMIT. " "Fields must be explicitly requested, even when used with sort. ") }, "sort": { "enum": resources_sort_options, "default": "+url", "description": ("Sort order for results. Prefixed with + for ascending, - for descending " f"({', '.join(resources_sort_options)}). " "? is a special option for random sort, useful in statistical sampling. The API expects exactly " "one of the enum values above, not a quoted string.") }, "limit": { "type": "integer", "description": "Maximum number of results to return. Default is 20, max is 100." }, "offset": { "type": "integer", "description": "Number of results to skip for pagination. Default is 0." }, "extras": { "type": "array", "items": { "enum": ["thumbnails", "markdown", "snippets", "regex", "xpath"] }, "description": ("Optional array of extra features to include in results. Available options include:\n" "- 'thumbnails': generates base64 encoded thumbnails for image resources that can be viewed and " "analyzed by AI models. Enables image description, content analysis, and visual understanding while" "keeping token output minimal. Only works for image " "(img) types, which can be filtered using `type: img` in queries. SVG is not supported.\n" "- 'markdown': transforms the HTML content field into concise markdown, " "reducing token usage and improving readability for LLMs.\n" "- 'snippets': matches fulltext queries to contextual keyword usage within the content. When " "used without requesting the content field (or markdown extra), it can provide an efficient means " "of refining a search without pulling down the complete page contents. Also great for rendering " "old school hit-highlighted results as a list, like Google search in 1999. Works with HTML, CSS, JS, " "or any text-based, crawled file.\n" "- 'regex': extracts regular expression matches from crawled files such as HTML, CSS, JavaScript, " "etc.. Not as precise a tool as xpath for HTML, but supports any text file as a data source. " "- 'xpath': extracts xpath selector data. Supports count(). Use xpath's text() for " "text only, element selectors for HTML data. Only supported for HTML, other " "types will be ignored. Sometimes referred to as scraping." "") }, "extrasRegex": { "type": "array", "items": { "type": "string" }, "description": ("Array of regular expression patterns to extract content. " "Examples: `\\d{3}-\\d{3}-\\d{4}` (phone numbers), `https?://[^\\s]+` (URLs). " "Use capture groups `(pattern)` to extract specific parts. " "Only used when 'regex' is included in the extras array. " "Results include matches, capture groups, and position information.") }, "extrasXpath": { "type": "array", "items": { "type": "string" }, "description": ("Array of XPath expressions to extract specific content from HTML resources. " "Each XPath should be a valid selector expression like `/html/body/h1`, `//h1/text()`, " "//a, //a/@href, or count(//a). If you need many values (such as connected a/text() " "and a/@href), request elements to preserve the relationship. " "Use text() or @name when targeting text, elements will return outer HTML. " "Only used when 'xpath' is included in the extras array. Many xpath expressions can be " "passed at once to extract multiple selectors. Results are grouped by document within results. ") } }, "required": [] }, ), ] return tools