import unittest
import asyncio
import sys
from datetime import datetime
from logging import Logger
from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler
from mcp_server_webcrawl.models.resources import ResourceResultType
from mcp_server_webcrawl.utils.logger import get_logger
logger: Logger = get_logger()
class BaseCrawlerTests(unittest.TestCase):
def setUp(self):
        # quiet asyncio error on tests, occurring after successful completion
if sys.platform == "win32":
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
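            # note (assumed rationale for this override): the default ProactorEventLoop
            # on Windows can emit spurious "Event loop is closed" errors from transport
            # finalizers at teardown; the selector policy sidesteps that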
def run_pragmar_search_tests(self, crawler: BaseCrawler, site_id: int):
"""
Run a battery of database checks on the crawler and Boolean validation
"""
resources_json = crawler.get_resources_api()
self.assertTrue(resources_json.total > 0, "Should have some resources in database")
site_resources = crawler.get_resources_api(sites=[site_id])
self.assertTrue(site_resources.total > 0, "Pragmar site should have resources")
primary_keyword = "crawler"
secondary_keyword = "privacy"
hyphenated_keyword = "one-click"
primary_resources = crawler.get_resources_api(
sites=[site_id],
query=primary_keyword,
fields=["content", "headers"],
limit=1,
)
self.assertTrue(primary_resources.total > 0, f"Keyword '{primary_keyword}' should return results")
for resource in primary_resources._results:
resource_dict = resource.to_dict()
found = False
for field, value in resource_dict.items():
if isinstance(value, str) and primary_keyword.rstrip("*") in value.lower():
found = True
break
self.assertTrue(found, f"Primary keyword not found in any field of resource {resource.id}")
secondary_resources = crawler.get_resources_api(
sites=[site_id],
query=secondary_keyword,
limit=1,
)
self.assertTrue(secondary_resources.total > 0, f"Keyword '{secondary_keyword}' should return results")
hyphenated_resources = crawler.get_resources_api(
sites=[site_id],
query=hyphenated_keyword,
limit=1,
)
self.assertTrue(hyphenated_resources.total > 0, f"Keyword '{hyphenated_keyword}' should return results")
primary_not_secondary = crawler.get_resources_api(
sites=[site_id],
query=f"{primary_keyword} NOT {secondary_keyword}"
)
secondary_not_primary = crawler.get_resources_api(
sites=[site_id],
query=f"{secondary_keyword} NOT {primary_keyword}"
)
primary_or_secondary = crawler.get_resources_api(
sites=[site_id],
query=f"{primary_keyword} OR {secondary_keyword}"
)
self.assertTrue(primary_not_secondary.total <= primary_resources.total,
"'crawler NOT privacy' should be subset of 'crawler'")
self.assertTrue(secondary_not_primary.total <= secondary_resources.total,
"'privacy NOT crawler' should be subset of 'privacy'")
self.assertTrue(primary_or_secondary.total >= primary_resources.total,
"OR should include all primary term results")
self.assertTrue(primary_or_secondary.total >= secondary_resources.total,
"OR should include all secondary term results")
calculated_overlap = primary_resources.total + secondary_resources.total - primary_or_secondary.total
self.assertTrue(calculated_overlap >= 0, "Overlap cannot be negative")
reconstructed_total = primary_not_secondary.total + secondary_not_primary.total + calculated_overlap
self.assertEqual(reconstructed_total, primary_or_secondary.total,
"Sum of exclusive sets plus overlap should equal OR total")
complex_and = crawler.get_resources_api(
sites=[site_id],
query=f"{primary_keyword} AND type:html AND status:200"
)
self.assertTrue(complex_and.total <= primary_resources.total,
"Adding AND conditions should not increase results")
grouped_or = crawler.get_resources_api(
sites=[site_id],
query=f"({primary_keyword} OR {secondary_keyword}) AND type:html AND status:200"
)
self.assertTrue(grouped_or.total <= primary_or_secondary.total,
"Adding AND conditions to OR should not increase results")
snippet_resources = crawler.get_resources_api(
sites=[site_id],
query=f"{primary_keyword} AND type: html",
extras=["snippets"],
limit=1,
)
self.assertIn("snippets", snippet_resources._results[0].to_dict()["extras"],
"First result should have snippets in extras")
markdown_resources = crawler.get_resources_api(
sites=[site_id],
query=primary_keyword,
extras=["markdown"],
limit=1,
)
self.assertIn("markdown", markdown_resources._results[0].to_dict()["extras"],
"First result should have markdown in extras")
combined_resources = crawler.get_resources_api(
sites=[site_id],
query=primary_keyword,
extras=["snippets", "markdown"],
limit=1,
)
        self.assertTrue(combined_resources.total > 0, "Combined extras query should return results")
        first_result = combined_resources._results[0].to_dict()
self.assertIn("extras", first_result, "First result should have extras field")
self.assertIn("snippets", first_result["extras"], "First result should have snippets in extras")
self.assertIn("markdown", first_result["extras"], "First result should have markdown in extras")
        self.assertTrue(primary_resources.total <= site_resources.total,
                "Search should return no more results than the site total")
        self.assertTrue(secondary_resources.total <= site_resources.total,
                "Search should return no more results than the site total")
def run_pragmar_image_tests(self, crawler: BaseCrawler, pragmar_site_id: int):
"""
Test InterroBot-specific image handling and thumbnails.
"""
img_results = crawler.get_resources_api(sites=[pragmar_site_id], query="type: img", limit=5)
self.assertTrue(img_results.total > 0, "Image type filter should return results")
self.assertTrue(
all(r.type.value == "img" for r in img_results._results),
"All filtered resources should have type 'img'"
)
    def run_sites_resources_tests(self, crawler: BaseCrawler, pragmar_site_id: int, example_site_id: int):
        """
        Run resource retrieval, filtering, sorting, and pagination checks
        across the pragmar and example sites.
        """
        # basic resource retrieval
        resources_json = crawler.get_resources_api()
        self.assertTrue(resources_json.total > 0, "Should have some resources in database")
        site_resources = crawler.get_resources_api(sites=[pragmar_site_id])
        self.assertTrue(site_resources.total > 0, "Pragmar site should have resources")
        # fulltext keyword search, with timestamp fields requested
query_keyword1 = "privacy"
timestamp_resources = crawler.get_resources_api(
sites=[pragmar_site_id],
query=query_keyword1,
fields=["created", "modified", "time"],
limit=5,
)
self.assertTrue(timestamp_resources.total > 0, "Search query should return results")
for resource in timestamp_resources._results:
resource_dict = resource.to_dict()
self.assertIsNotNone(resource_dict["created"], "Created timestamp should not be None")
self.assertIsNotNone(resource_dict["modified"], "Modified timestamp should not be None")
self.assertIsNotNone(resource_dict["time"], "Modified timestamp should not be None")
# resource ID filtering
if resources_json.total > 0:
first_resource = resources_json._results[0]
id_resources = crawler.get_resources_api(
sites=[first_resource.site],
query=f"id: {first_resource.id}",
limit=1,
)
self.assertEqual(id_resources.total, 1)
self.assertEqual(id_resources._results[0].id, first_resource.id)
# site filtering
site_resources = crawler.get_resources_api(sites=[pragmar_site_id])
self.assertTrue(site_resources.total > 0, "Site filtering should return results")
for resource in site_resources._results:
self.assertEqual(resource.site, pragmar_site_id)
# type filtering for HTML pages
html_resources = crawler.get_resources_api(
sites=[pragmar_site_id],
query= f"type: {ResourceResultType.PAGE.value}",
)
self.assertTrue(html_resources.total > 0, "HTML filtering should return results")
for resource in html_resources._results:
self.assertEqual(resource.type, ResourceResultType.PAGE)
# type filtering for multiple resource types
mixed_resources = crawler.get_resources_api(
sites=[pragmar_site_id],
query= f"type: {ResourceResultType.PAGE.value} OR type: {ResourceResultType.SCRIPT.value}",
)
if mixed_resources.total > 0:
types_found = {r.type for r in mixed_resources._results}
self.assertTrue(
len(types_found) > 0,
"Should find at least one of the requested resource types"
)
for resource_type in types_found:
self.assertIn(
resource_type,
[ResourceResultType.PAGE, ResourceResultType.SCRIPT]
)
# custom fields in response
custom_fields = ["content", "headers", "time"]
field_resources = crawler.get_resources_api(
query="type: html",
sites=[pragmar_site_id],
fields=custom_fields,
limit=1,
)
self.assertTrue(field_resources.total > 0)
resource_dict = field_resources._results[0].to_dict()
for field in custom_fields:
self.assertIn(field, resource_dict, f"Field '{field}' should be in response")
asc_resources = crawler.get_resources_api(sites=[pragmar_site_id], sort="+url")
if asc_resources.total > 1:
self.assertTrue(asc_resources._results[0].url <= asc_resources._results[1].url)
desc_resources = crawler.get_resources_api(sites=[pragmar_site_id], sort="-url")
if desc_resources.total > 1:
self.assertTrue(desc_resources._results[0].url >= desc_resources._results[1].url)
limit_resources = crawler.get_resources_api(sites=[pragmar_site_id], limit=3)
self.assertTrue(len(limit_resources._results) <= 3)
offset_resources = crawler.get_resources_api(sites=[pragmar_site_id], offset=2, limit=2)
self.assertTrue(len(offset_resources._results) <= 2)
if resources_json.total > 4:
self.assertNotEqual(
resources_json._results[0].id,
offset_resources._results[0].id,
"Offset results should differ from first page"
)
# status code filtering
status_resources = crawler.get_resources_api(
sites=[pragmar_site_id],
query=f"status: 200",
limit=5,
)
self.assertTrue(status_resources.total > 0, "Status filtering should return results")
for resource in status_resources._results:
self.assertEqual(resource.status, 200)
        # status code and url wildcard filtering
        appstat_resources = crawler.get_resources_api(
            sites=[pragmar_site_id],
            query="status: 200 AND url: https://pragmar.com/appstat*",
            limit=5,
        )
        self.assertTrue(appstat_resources.total > 0, "Status and URL filtering should return results")
        self.assertGreaterEqual(len(appstat_resources._results), 3,
                f"Expected at least 3 appstat pages, got {len(appstat_resources._results)}")
# multiple status codes
multi_status_resources = crawler.get_resources_api(
query=f"status: 200 OR status: 404",
)
if multi_status_resources.total > 0:
found_statuses = {r.status for r in multi_status_resources._results}
for status in found_statuses:
self.assertIn(status, [200, 404])
# combined filtering
combined_resources = crawler.get_resources_api(
sites=[pragmar_site_id],
query= f"style AND type: {ResourceResultType.PAGE.value}",
fields=["content", "headers"],
sort="+url",
limit=3,
)
if combined_resources.total > 0:
for resource in combined_resources._results:
self.assertEqual(resource.site, pragmar_site_id)
self.assertEqual(resource.type, ResourceResultType.PAGE)
resource_dict = resource.to_dict()
self.assertIn("content", resource_dict)
self.assertIn("headers", resource_dict)
# multi-site search, verify we got results from both sites
multisite_resources = crawler.get_resources_api(
sites=[example_site_id, pragmar_site_id],
query= f"type: {ResourceResultType.PAGE.value}",
sort="+url",
limit=10,
)
found_sites = set()
for resource in multisite_resources._results:
found_sites.add(resource.site)
self.assertEqual(len(found_sites), 2, "Should have results from both sites")
        # Boolean workout: result counts are fragile across crawlers, but the
        # set intersections should not be; the counts are worth the fragility, for now
        claude_resources = crawler.get_resources_api(
            sites=[pragmar_site_id],
            query="type: html AND (claude)",
            limit=4,
        )
        # varies by crawler, katana doesn't crawl /help/ depth by default
        self.assertTrue(claude_resources.total in [1, 2, 3, 4, 5, 6],
                f"Claude search returned {claude_resources.total}, expected 1-6 results")
        mcp_resources = crawler.get_resources_api(
            sites=[pragmar_site_id],
            query="type: html AND (mcp)",
            limit=12,
        )
        # varies by crawler, katana doesn't crawl /help/ depth by default
        self.assertTrue(mcp_resources.total in [2, 3, 4, 5, 6, 7, 8, 12, 13, 16, 17],
                f"MCP returned {mcp_resources.total}, expected a known per-crawler count")
# AND
        claude_and_mcp_resources = crawler.get_resources_api(
            sites=[pragmar_site_id],
            query="type: html AND (claude AND mcp)",
            limit=1,
        )
        self.assertTrue(claude_and_mcp_resources.total in [1, 2, 3, 4, 5, 6],
                f"Claude AND MCP returned {claude_and_mcp_resources.total}, expected 1-6 results")
# OR
        claude_or_mcp_resources = crawler.get_resources_api(
            sites=[pragmar_site_id],
            query="type: html AND (claude OR mcp)",
            limit=1,
        )
        self.assertTrue(claude_or_mcp_resources.total in [2, 3, 4, 5, 6, 12, 13, 15, 16, 17],
                f"Claude OR MCP returned {claude_or_mcp_resources.total}, expected a union-sized count")
# NOT
claude_not_mcp_resources = crawler.get_resources_api(
sites=[pragmar_site_id],
query=f"type: html AND (claude NOT mcp)",
limit=1,
)
self.assertEqual(claude_not_mcp_resources.total, 0, "Claude NOT MCP should return 0 results")
        mcp_not_claude_resources = crawler.get_resources_api(
            sites=[pragmar_site_id],
            query="type: html AND (mcp NOT claude)",
            limit=1,
        )
        self.assertTrue(mcp_not_claude_resources.total in [1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13],
                f"MCP NOT Claude returned {mcp_not_claude_resources.total}, expected a known per-crawler count")
# logical relationships
self.assertEqual(
claude_and_mcp_resources.total,
claude_resources.total + mcp_resources.total - claude_or_mcp_resources.total,
"Intersection should equal A + B - Union (inclusion-exclusion principle)"
)
self.assertEqual(
claude_not_mcp_resources.total + claude_and_mcp_resources.total,
claude_resources.total,
"Claude NOT MCP + Claude AND MCP should equal total Claude results"
)
self.assertEqual(
mcp_not_claude_resources.total + claude_and_mcp_resources.total,
mcp_resources.total,
"MCP NOT Claude + Claude AND MCP should equal total MCP results"
)
self.assertEqual(
claude_not_mcp_resources.total + mcp_not_claude_resources.total + claude_and_mcp_resources.total,
claude_or_mcp_resources.total,
"Sum of exclusive sets plus intersection should equal union"
)
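        # the four identities above, in set notation, with A = claude, B = mcp:
        #   |A ∩ B| = |A| + |B| - |A ∪ B|
        #   |A \ B| + |A ∩ B| = |A|
        #   |B \ A| + |A ∩ B| = |B|
        #   |A \ B| + |B \ A| + |A ∩ B| = |A ∪ B|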
        # complex boolean with field constraints
        claude_and_html_resources = crawler.get_resources_api(
            sites=[pragmar_site_id],
            query="type: html AND (claude)",
            limit=1,
        )
        self.assertTrue(claude_and_html_resources.total in [1, 2, 3, 4, 5, 6],
                f"Claude AND type:html returned {claude_and_html_resources.total}, expected 1-6 results")
self.assertTrue(
claude_and_html_resources.total <= claude_resources.total,
"Adding AND constraints should not increase result count"
)
        # parentheses grouping
        grouped_resources = crawler.get_resources_api(
            sites=[pragmar_site_id],
            query="type: html AND (claude OR mcp)",
            limit=1,
        )
        self.assertTrue(grouped_resources.total in [2, 3, 4, 5, 6, 11, 12, 13],
                f"Grouped OR with HTML filter returned {grouped_resources.total}, expected a known per-crawler count")
    def run_pragmar_tokenizer_tests(self, crawler: BaseCrawler, site_id: int):
        """
        FTS hyphen and underscore handling is particularly challenging, and thus
        has a dedicated test. These characters must be configured in multiple
        places, including the CREATE TABLE ... tokenizer declaration, and must
        also be handled by the query parser.
        """
        mcp_resources_keyword = crawler.get_resources_api(
            sites=[site_id],
            query='mcp-server-webcrawl',
            fields=[],
            limit=1,
        )
mcp_resources_quoted = crawler.get_resources_api(
sites=[site_id],
query='"mcp-server-webcrawl"',
fields=[],
limit=1,
)
self.assertTrue(mcp_resources_keyword.total > 0, "Should find mcp-server-webcrawl in HTML")
self.assertTrue(mcp_resources_quoted.total > 0, "Should find \"mcp-server-webcrawl\" (phrase) in HTML")
self.assertTrue(mcp_resources_quoted.total == mcp_resources_keyword.total, "Quoted and unquoted equivalence expected")
mcp_resources_wildcarded = crawler.get_resources_api(
sites=[site_id],
query='mcp*',
fields=[],
limit=1,
)
        self.assertTrue(mcp_resources_wildcarded.total > 0, "Should find mcp* (wildcard) in HTML")
        combo_and_resources_keyword = crawler.get_resources_api(
            sites=[site_id],
            query='mcp-server-webcrawl AND one-click',
            fields=[],
            limit=1,
        )
        combo_and_resources_quoted = crawler.get_resources_api(
            sites=[site_id],
            query='"mcp-server-webcrawl" AND "one-click"',
            fields=[],
            limit=1,
        )
self.assertTrue(combo_and_resources_keyword.total > 0, "Should find mcp-server-webcrawl in HTML")
self.assertTrue(combo_and_resources_quoted.total > 0, "Should find \"mcp-server-webcrawl\" (phrase) in HTML")
self.assertTrue(combo_and_resources_keyword.total == combo_and_resources_quoted.total, "Quoted and unquoted equivalence expected")
        combo_or_resources_keyword = crawler.get_resources_api(
            sites=[site_id],
            query='mcp-server-webcrawl OR one-click',
            fields=[],
            limit=1,
        )
        combo_or_resources_quoted = crawler.get_resources_api(
            sites=[site_id],
            query='"mcp-server-webcrawl" OR "one-click"',
            fields=[],
            limit=1,
        )
self.assertTrue(combo_or_resources_keyword.total > 0, "Should find mcp-server-webcrawl in HTML")
self.assertTrue(combo_or_resources_quoted.total > 0, "Should find \"mcp-server-webcrawl\" (phrase) in HTML")
self.assertTrue(combo_or_resources_keyword.total == combo_or_resources_quoted.total, "Quoted and unquoted equivalence expected")
        combo_not_resources_keyword = crawler.get_resources_api(
            sites=[site_id],
            query='mcp-server-webcrawl NOT one-click',
            fields=[],
            limit=1,
        )
        combo_not_resources_quoted = crawler.get_resources_api(
            sites=[site_id],
            query='"mcp-server-webcrawl" NOT "one-click"',
            fields=[],
            limit=1,
        )
        combo_and_not_resources_quoted = crawler.get_resources_api(
            sites=[site_id],
            query='"mcp-server-webcrawl" AND NOT "one-click"',
            fields=[],
            limit=1,
        )
self.assertTrue(combo_not_resources_keyword.total > 0, "Should find mcp-server-webcrawl in HTML")
self.assertTrue(combo_not_resources_quoted.total > 0, "Should find \"mcp-server-webcrawl\" (phrase) in HTML")
self.assertTrue(combo_not_resources_keyword.total == combo_not_resources_quoted.total, "Quoted and unquoted equivalence expected")
self.assertTrue(combo_not_resources_keyword.total == combo_and_not_resources_quoted.total, f"NOT ({combo_not_resources_keyword.total}) and AND NOT ({combo_and_not_resources_quoted.total}) equivalence expected")
        self.assertTrue(mcp_resources_keyword.total >= combo_and_resources_keyword.total, "Total records should be greater than or equal to ANDs.")
        self.assertTrue(mcp_resources_keyword.total <= combo_or_resources_keyword.total, "Total records should be less than or equal to ORs.")
        self.assertTrue(mcp_resources_keyword.total > combo_not_resources_keyword.total, "Total records should be greater than NOTs.")
    def run_pragmar_site_tests(self, crawler: BaseCrawler, site_id: int):
        """
        Run site listing and site field retrieval checks.
        """
# all sites
sites_json = crawler.get_sites_api()
self.assertTrue(sites_json.total >= 2)
# single site
site_one_json = crawler.get_sites_api(ids=[site_id])
self.assertTrue(site_one_json.total == 1)
# site with fields
site_field_json = crawler.get_sites_api(ids=[site_id], fields=["created", "modified"])
site_field_result = site_field_json._results[0].to_dict()
self.assertTrue("created" in site_field_result)
self.assertTrue("modified" in site_field_result)
    def run_pragmar_sort_tests(self, crawler: BaseCrawler, site_id: int):
        """
        Verify random (?) sort returns results and produces varied orderings.
        """
random1_resources = crawler.get_resources_api(sites=[site_id], sort="?", limit=20)
self.assertTrue(random1_resources.total > 0, "Database should contain resources")
random1_ids = [r.id for r in random1_resources._results]
random2_resources = crawler.get_resources_api(sites=[site_id], sort="?", limit=20)
self.assertTrue(random2_resources.total > 0, "Random sort should return results")
random2_ids = [r.id for r in random2_resources._results]
if random2_resources.total >= 10:
            self.assertNotEqual(
                random1_ids,
                random2_ids,
                "Consecutive random sorts should produce different orderings.\nFirst: "
                f"{random1_ids}\nSecond: {random2_ids}"
            )
else:
logger.info(f"Skip randomness verification: Not enough resources ({random2_resources.total})")
    def run_pragmar_content_tests(self, crawler: BaseCrawler, site_id: int, html_leniency: bool):
        """
        Verify content and header fields for HTML, script, and CSS resources.
        """
html_resources = crawler.get_resources_api(
sites=[site_id],
query= f"type: {ResourceResultType.PAGE.value}",
fields=["content", "headers"]
)
self.assertTrue(html_resources.total > 0, "Should find HTML resources")
for resource in html_resources._results:
resource_dict = resource.to_dict()
if "content" in resource_dict:
content = resource_dict["content"].lower()
self.assertTrue(
"<!DOCTYPE html>" in content or
"<html" in content or
"<meta" in content or
html_leniency,
f"HTML content should contain HTML markup: {resource.url}\n\n{resource.content}"
)
if "headers" in resource_dict and resource_dict["headers"]:
self.assertTrue(
"Content-Type:" in resource_dict["headers"],
f"Headers should contain Content-Type: {resource.url}"
)
# script content detection
script_resources = crawler.get_resources_api(
sites=[site_id],
query= f"type: {ResourceResultType.SCRIPT.value}",
fields=["content", "headers"],
limit=1,
)
if script_resources.total > 0:
for resource in script_resources._results:
self.assertEqual(resource.type, ResourceResultType.SCRIPT)
# css content detection
css_resources = crawler.get_resources_api(
sites=[site_id],
query= f"type: {ResourceResultType.CSS.value}",
fields=["content", "headers"],
limit=1,
)
if css_resources.total > 0:
for resource in css_resources._results:
self.assertEqual(resource.type, ResourceResultType.CSS)
def run_pragmar_report(self, crawler: BaseCrawler, site_id: int, heading: str):
"""
Generate a comprehensive report of all resources for a site.
Returns a formatted string with counts and URLs by type.
"""
all_resources = crawler.get_resources_api(
sites=[site_id],
query="",
limit=100,
)
html_resources = crawler.get_resources_api(
sites=[site_id],
query=f"type: {ResourceResultType.PAGE.value}",
limit=100,
)
css_resources = crawler.get_resources_api(
sites=[site_id],
query=f"type: {ResourceResultType.CSS.value}",
limit=100,
)
js_resources = crawler.get_resources_api(
sites=[site_id],
query=f"type: {ResourceResultType.SCRIPT.value}",
limit=100,
)
image_resources = crawler.get_resources_api(
sites=[site_id],
query=f"type: {ResourceResultType.IMAGE.value}",
limit=100,
)
mcp_resources = crawler.get_resources_api(
sites=[site_id],
query=f"type: html AND (mcp)",
limit=100,
)
report_lines = []
sections = [
("Total pages", all_resources),
("Total HTML", html_resources),
("Total MCP search hits", mcp_resources),
("Total CSS", css_resources),
("Total JS", js_resources),
("Total Images", image_resources)
]
for i, (section_name, resource_obj) in enumerate(sections):
report_lines.append(f"{section_name}: {resource_obj.total}")
for resource in resource_obj._results:
report_lines.append(resource.url)
if i < len(sections) - 1:
report_lines.append("")
now = datetime.now()
lines_together = "\n".join(report_lines)
return f"""
**********************************************************************************
* {heading} {now.isoformat()} *
**********************************************************************************
{lines_together}
"""