Source code for mcp_server_webcrawl.utils.tools
from mcp.types import Tool
from mcp_server_webcrawl.models.resources import (
ResourceResultType,
RESOURCES_FIELDS_DEFAULT,
RESOURCES_FIELDS_REQUIRED,
RESOURCES_SORT_OPTIONS_DEFAULT,
RESOURCES_TOOL_NAME,
)
from mcp_server_webcrawl.models.sites import (
SiteResult,
SITES_FIELDS_DEFAULT,
SITES_FIELDS_REQUIRED,
SITES_TOOL_NAME,
)
[docs]
def get_crawler_tools(sites: list[SiteResult] | None = None):
"""
Generate crawler tools based on available sites.
Args:
sites: optional list of site results to include in tool descriptions
Returns:
List of Tool objects for sites and resources
"""
# you'd think maybe pass these in, but no, descriptions will also require tweaking
# each crawler having its own peculiarities -- just let the subclass hack this
# into whatever misshapen ball of clay it needs to be
sites_field_options = list(set(SITES_FIELDS_DEFAULT) - set(SITES_FIELDS_REQUIRED))
resources_field_options = list(set(RESOURCES_FIELDS_DEFAULT) - set(RESOURCES_FIELDS_REQUIRED))
resources_type_options = list(ResourceResultType.values())
resources_sort_options = RESOURCES_SORT_OPTIONS_DEFAULT
sites_display = ", ".join([f"{s.url} (site: {s.id})" for s in sites]) if sites is not None else ""
tools = [
Tool(
name=SITES_TOOL_NAME,
description="Retrieves a list of sites (project websites or crawl directories).",
inputSchema={
"type": "object",
"properties": {
"ids": {
"type": "array",
"items": {"type": "integer"},
"description": "List of project IDs to retrieve. Leave empty for all projects."
},
"fields": {
"type": "array",
"items": {
"type": "string",
"enum": sites_field_options
},
"description": ("List of additional fields to include in the response beyond the defaults "
"(id, url) Empty list means default fields only. Options include created (ISO 8601), "
"modified (ISO 8601), and norobots (str).")
}
},
"required": []
},
),
Tool(
name=RESOURCES_TOOL_NAME,
description= ("Searches for resources (webpages, images, CSS, JS, etc.) across web crawler projects and "
"retrieves specified fields. "
"Supports boolean queries and field searching, along with site filtering to "
"filter with fine control. "
"To find a site homepage or index of a site, query type: html with sort='+modified' and a limit of 1. "
"Most sites indexed by this tool will be small to moderately sized websites. "
"Don't assume most keywords will generate results; start broader on first search (until you have a feel for results). "
"A vital aspect of this API is field control; you can open up the limit wide when dealing with lightweight "
"fields and dial way back when using larger fields, like content. Adjust dynamically. The best strategy "
"balances preserving the user's context window while minimizing number of queries necessary to answer their question."
),
inputSchema={
"type": "object",
"properties": {
"query": {
"type": "string",
"description": ("The query field is the workhorse of the API and supports fulltext boolean queries "
"along with field searching using the name: value pattern. "
"Fields supported include page/resource id as id: <resource_id|int> (OR together for multiple docs), "
"HTTP status as status: <code|int>, URL as url: <url|str>, and content type as type: <type|str>. "
f"Valid types include ({', '.join(resources_type_options)}). "
"Additionally, headers as headers: <term|str> and content as content: <term|str> can be "
"searched specifically. You would only search content when fulltext search is diluted by other field hits. "
"For the status field, numerical operators are supported, e.g. status: >=400. "
"For the url and type fields, along with fulltext search terms (fieldless), FTS5 stem* suffix "
"wildcarding is enabled. An empty query returns all results. "
"A query MUST use one of these formats: (1) empty query for unfiltered results, (2) single keyword, "
"(3) quoted phrase: \"keyword1 keyword2\", (4) "
"explicit AND: keyword1 AND type: html, (5) explicit OR: keyword1 OR keyword2, or (6) advanced boolean: "
"(keyword1 OR keyword2) AND (status: 200 AND type: html). "
"The search index does not support stemming, use wildcards (keyword*), or the boolean OR and your "
"imagination instead."
)
},
"sites": {
"type": "array",
"items": {"type": "integer"},
"description": ("Optional list of project ID to filter search results to a specific site. In 95% "
"of scenarios, you'd filter to only one site, but multiple site filtering is offered for "
f"advanced search scenarios. Available sites include {sites_display}.")
},
"fields": {
"type": "array",
"items": {
"type": "string",
"enum": resources_field_options
},
"description": ("List of additional fields to include in the response beyond the defaults "
f"({', '.join(resources_field_options)}). Empty list means default fields only. "
"The content field can lead to large results and should be used judiously with LIMIT.")
},
"sort": {
"type": "string",
"enum": resources_sort_options,
"description": ("Sort order for results. Prefixed with + for ascending, - for descending. "
"? is a special option for random sort, useful in statistical sampling.")
},
"limit": {
"type": "integer",
"description": "Maximum number of results to return. Default is 20, max is 100."
},
"offset": {
"type": "integer",
"description": "Number of results to skip for pagination. Default is 0."
},
"extras": {
"type": "array",
"items": {
"type": "string",
"enum": ["thumbnails", "markdown", "snippets"]
},
"description": ("Optional array of extra features to include in results. Available options include:\n"
"- 'thumbnails': generates base64 encoded thumbnails for image resources that can be viewed and "
"analyzed by AI models. Enables image description, content analysis, and visual understanding while"
"keeping token output minimal. Only works for image "
"(img) types, which can be filtered using `type: img` in queries. SVG is not supported.\n"
"- 'markdown': transforms the HTML content field into concise markdown, "
"reducing token usage and improving readability for LLMs.\n"
"- 'snippets': matches fulltext queries to contextual keyword usage within the content. When "
"used without requesting the content field (or markdown extra), it can provide an efficient means "
"of refining a search without pulling down the complete page contents. Also great for rendering "
"old school hit-highlighted results as a list, like Google search in 1999. Works with HTML, CSS, JS, "
"or any text-based, crawled file."
"")
},
},
"required": []
},
),
]
return tools