from mcp_server_webcrawl.crawlers.archivebox.crawler import ArchiveBoxCrawler
from mcp_server_webcrawl.crawlers.archivebox.adapter import ArchiveBoxManager
from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
from mcp_server_webcrawl.crawlers import get_fixture_directory
from mcp_server_webcrawl.utils.logger import get_logger
# calculate ids for ArchiveBox working directories using the same hash function as adapter
EXAMPLE_SITE_ID = ArchiveBoxManager.string_to_id("example")
PRAGMAR_SITE_ID = ArchiveBoxManager.string_to_id("pragmar")
logger = get_logger()
class ArchiveBoxTests(BaseCrawlerTests):
"""
Test suite for the ArchiveBox crawler implementation.
Uses wrapped test methods from BaseCrawlerTests adapted for ArchiveBox's multi-instance structure.
"""
def setUp(self):
"""
Set up the test environment with fixture data.
"""
super().setUp()
self._datasrc = get_fixture_directory() / "archivebox"
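# the archivebox fixture holds the example and pragmar working directories, each exposed as its own site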
def test_archivebox_pulse(self):
"""
Test basic crawler initialization.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.assertIsNotNone(crawler)
self.assertTrue(self._datasrc.is_dir())
def test_archivebox_sites(self):
"""
Test site retrieval API functionality.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# should have multiple sites (example and pragmar working directories)
sites_json = crawler.get_sites_api()
self.assertGreaterEqual(sites_json.total, 2, "ArchiveBox should have multiple working directories as sites")
# test pragmar site specifically
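# run_pragmar_* helpers are the shared suites inherited from BaseCrawlerTests, run here against the ArchiveBox backend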
self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)
def test_archivebox_search(self):
"""
Test boolean search functionality.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)
def test_pragmar_tokenizer(self):
"""
Test tokenizer search functionality.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.run_pragmar_tokenizer_tests(crawler, PRAGMAR_SITE_ID)
def test_archivebox_resources(self):
"""
Test resource retrieval API functionality with various parameters.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)
def test_archivebox_images(self):
"""
Test ArchiveBox image handling and thumbnails.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)
def test_archivebox_sorts(self):
"""
Test random sort functionality using the '?' sort parameter.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)
def test_archivebox_content_parsing(self):
"""
Test content type detection and parsing for ArchiveBox resources.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)
def test_archivebox_url_reconstruction(self):
"""
Test URL reconstruction from ArchiveBox metadata.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
url_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], limit=20)
self.assertGreater(url_resources.total, 0, "Should have resources with reconstructed URLs")
for resource in url_resources._results:
# URLs should be valid HTTP/HTTPS (except for archivebox:// fallbacks)
self.assertTrue(
resource.url.startswith(('http://', 'https://', 'archivebox://')),
f"URL should have valid scheme: {resource.url}"
)
# should not end with index.html (stripped during reconstruction)
self.assertFalse(
resource.url.endswith('/index.html'),
f"URL should not end with index.html: {resource.url}"
)
def test_archivebox_deduplication(self):
"""
Test resource deduplication across timestamped entries.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# get all resources from pragmar site
all_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], limit=100)
self.assertGreater(all_resources.total, 0, "Should have resources")
# check for URL uniqueness (deduplication should ensure unique URLs)
urls_found = [r.url for r in all_resources._results]
unique_urls = set(urls_found)
# the adapter deduplicates resources across timestamped captures; a URL may still
# legitimately appear more than once when distinct snapshots of the same page are
# indexed, so this is a sanity check rather than a strict uniqueness assertion
self.assertLessEqual(len(unique_urls), len(urls_found),
"URL deduplication should work properly")
def test_archivebox_timestamped_structure(self):
"""
Test handling of ArchiveBox's timestamped entry structure.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# get resources with timestamps from pragmar site
timestamp_resources = crawler.get_resources_api(
sites=[PRAGMAR_SITE_ID],
fields=["created", "modified"],
limit=10
)
self.assertGreater(timestamp_resources.total, 0, "Should have timestamped resources")
for resource in timestamp_resources._results:
resource_dict = resource.to_dict()
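# created/modified are expected to be populated because they were requested via fields above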
# should have timestamp information
self.assertIsNotNone(resource_dict.get("created"),
"Should have created timestamp from entry directory")
self.assertIsNotNone(resource_dict.get("modified"),
"Should have modified timestamp from entry directory")
def test_archivebox_error_resilience(self):
"""
Test resilience to malformed JSON and missing files.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# should continue processing despite any JSON parsing errors
all_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID])
# verify we got some resources despite potential errors
self.assertGreater(all_resources.total, 0,
"Should process entries even with JSON parsing errors")
# verify resources have reasonable defaults
for resource in all_resources._results:
self.assertIsNotNone(resource.url, "URL should always be set")
self.assertIsInstance(resource.status, int, "Status should be integer")
self.assertGreaterEqual(resource.status, 0, "Status should be non-negative")
self.assertLessEqual(resource.status, 599, "Status should be valid HTTP status")
def test_archivebox_multi_site(self):
"""
Test that multiple ArchiveBox working directories are treated as separate sites.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# get resources from each site separately
example_resources = crawler.get_resources_api(sites=[EXAMPLE_SITE_ID], limit=10)
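# the pragmar request is narrowed with a field-scoped query (url: pragmar.com) so the domain check below is meaningful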
pragmar_resources = crawler.get_resources_api(
query="url: pragmar.com",
sites=[PRAGMAR_SITE_ID],
limit=10)
# both sites should have resources
self.assertGreater(example_resources.total, 0, "Example site should have resources")
self.assertGreater(pragmar_resources.total, 0, "Pragmar site should have resources")
# URLs should reflect the appropriate domains
example_urls = [r.url for r in example_resources._results]
pragmar_urls = [r.url for r in pragmar_resources._results]
# verify site separation (pragmar resources should be about pragmar.com)
pragmar_domain_urls = [url for url in pragmar_urls if "pragmar.com" in url]
self.assertGreater(len(pragmar_domain_urls), 0,
"Pragmar site should contain pragmar.com URLs")
def test_report(self):
"""
Run test report for ArchiveBox archive.
"""
crawler = ArchiveBoxCrawler(self._datasrc)
# generate report using pragmar site ID
report = self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "ArchiveBox")
logger.info(report)
# basic validation that report contains expected content
self.assertIn("ArchiveBox", report, "Report should mention ArchiveBox")
self.assertIn("Total pages:", report, "Report should show page counts")