Source code for mcp_server_webcrawl.crawlers.archivebox.tests

from mcp_server_webcrawl.crawlers.archivebox.crawler import ArchiveBoxCrawler
from mcp_server_webcrawl.crawlers.archivebox.adapter import ArchiveBoxManager
from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
from mcp_server_webcrawl.crawlers import get_fixture_directory
from mcp_server_webcrawl.utils.logger import get_logger

# calculate ids for ArchiveBox working directories using the same hash function as the adapter
EXAMPLE_SITE_ID = ArchiveBoxManager.string_to_id("example")
PRAGMAR_SITE_ID = ArchiveBoxManager.string_to_id("pragmar")

logger = get_logger()

class ArchiveBoxTests(BaseCrawlerTests):
    """
    Test suite for the ArchiveBox crawler implementation.

    Uses wrapped test methods from BaseCrawlerTests adapted for ArchiveBox's
    multi-instance structure.
    """

    def setUp(self):
        """
        Set up the test environment with fixture data.
        """
        super().setUp()
        self._datasrc = get_fixture_directory() / "archivebox"

    def test_archivebox_pulse(self):
        """
        Test basic crawler initialization.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)
        self.assertIsNotNone(crawler)
        self.assertTrue(self._datasrc.is_dir())

    def test_archivebox_sites(self):
        """
        Test site retrieval API functionality.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)

        # should have multiple sites (example and pragmar working directories)
        sites_json = crawler.get_sites_api()
        self.assertGreaterEqual(sites_json.total, 2,
                "ArchiveBox should have multiple working directories as sites")

        # test pragmar site specifically
        self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)

    def test_pragmar_tokenizer(self):
        """
        Test tokenizer search functionality.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)
        self.run_pragmar_tokenizer_tests(crawler, PRAGMAR_SITE_ID)

    def test_archivebox_resources(self):
        """
        Test resource retrieval API functionality with various parameters.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)
        self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)

    def test_archivebox_images(self):
        """
        Test image handling and thumbnails for ArchiveBox resources.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)
        self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)

    def test_archivebox_sorts(self):
        """
        Test random sort functionality using the '?' sort parameter.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)
        self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)

    def test_archivebox_content_parsing(self):
        """
        Test content type detection and parsing for ArchiveBox resources.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)
        self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)

    def test_archivebox_url_reconstruction(self):
        """
        Test URL reconstruction from ArchiveBox metadata.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)
        url_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], limit=20)
        self.assertGreater(url_resources.total, 0, "Should have resources with reconstructed URLs")

        for resource in url_resources._results:
            # URLs should be valid HTTP/HTTPS (except for archivebox:// fallbacks)
            self.assertTrue(
                resource.url.startswith(('http://', 'https://', 'archivebox://')),
                f"URL should have valid scheme: {resource.url}"
            )
            # should not end with index.html (stripped during reconstruction)
            self.assertFalse(
                resource.url.endswith('/index.html'),
                f"URL should not end with index.html: {resource.url}"
            )

    def test_archivebox_deduplication(self):
        """
        Test resource deduplication across timestamped entries.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)

        # get all resources from pragmar site
        all_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], limit=100)
        self.assertGreater(all_resources.total, 0, "Should have resources")

        # check for URL uniqueness (deduplication should ensure unique URLs)
        urls_found = [r.url for r in all_resources._results]
        unique_urls = set(urls_found)

        # deduplication should be working (though some URLs might legitimately appear
        # multiple times if they're different resources, like different timestamps of
        # the same page)
        self.assertLessEqual(len(unique_urls), len(urls_found), "URL deduplication should work properly")

    def test_archivebox_metadata_parsing(self):
        """
        Test JSON metadata parsing from ArchiveBox files.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)

        # get resources with headers from pragmar site
        header_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            fields=["headers"],
            limit=10
        )

        if header_resources.total > 0:
            headers_found = 0
            for resource in header_resources._results:
                resource_dict = resource.to_dict()
                if "headers" in resource_dict and resource_dict["headers"]:
                    headers_found += 1
                    self.assertIn("HTTP/1.0", resource_dict["headers"],
                            "Headers should contain HTTP status line")

            # at least some resources should have parsed headers
            self.assertGreater(headers_found, 0, "Should find resources with parsed headers")

    def test_archivebox_timestamped_structure(self):
        """
        Test handling of ArchiveBox's timestamped entry structure.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)

        # get resources with timestamps from pragmar site
        timestamp_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            fields=["created", "modified"],
            limit=10
        )
        self.assertGreater(timestamp_resources.total, 0, "Should have timestamped resources")

        for resource in timestamp_resources._results:
            resource_dict = resource.to_dict()
            # should have timestamp information
            self.assertIsNotNone(resource_dict.get("created"),
                    "Should have created timestamp from entry directory")
            self.assertIsNotNone(resource_dict.get("modified"),
                    "Should have modified timestamp from entry directory")

    def test_archivebox_error_resilience(self):
        """
        Test resilience to malformed JSON and missing files.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)

        # should continue processing despite any JSON parsing errors
        all_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID])

        # verify we got some resources despite potential errors
        self.assertGreater(all_resources.total, 0, "Should process entries even with JSON parsing errors")

        # verify resources have reasonable defaults
        for resource in all_resources._results:
            self.assertIsNotNone(resource.url, "URL should always be set")
            self.assertIsInstance(resource.status, int, "Status should be integer")
            self.assertGreaterEqual(resource.status, 0, "Status should be non-negative")
            self.assertLessEqual(resource.status, 599, "Status should be valid HTTP status")

    def test_archivebox_multi_site(self):
        """
        Test that multiple ArchiveBox working directories are treated as separate sites.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)

        # get resources from each site separately
        example_resources = crawler.get_resources_api(sites=[EXAMPLE_SITE_ID], limit=10)
        pragmar_resources = crawler.get_resources_api(
            query="url: pragmar.com",
            sites=[PRAGMAR_SITE_ID],
            limit=10)

        # both sites should have resources
        self.assertGreater(example_resources.total, 0, "Example site should have resources")
        self.assertGreater(pragmar_resources.total, 0, "Pragmar site should have resources")

        # URLs should reflect the appropriate domains
        example_urls = [r.url for r in example_resources._results]
        pragmar_urls = [r.url for r in pragmar_resources._results]

        # verify site separation (pragmar resources should be about pragmar.com)
        pragmar_domain_urls = [url for url in pragmar_urls if "pragmar.com" in url]
        self.assertGreater(len(pragmar_domain_urls), 0, "Pragmar site should contain pragmar.com URLs")

    def test_report(self):
        """
        Run test report for ArchiveBox archive.
        """
        crawler = ArchiveBoxCrawler(self._datasrc)

        # generate report using pragmar site ID
        report = self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "ArchiveBox")
        logger.info(report)

        # basic validation that report contains expected content
        self.assertIn("ArchiveBox", report, "Report should mention ArchiveBox")
        self.assertIn("Total pages:", report, "Report should show page counts")

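
# a minimal entry-point sketch, not part of the original module: it assumes
# BaseCrawlerTests ultimately derives from unittest.TestCase, so the suite can
# also be run directly (e.g. `python -m mcp_server_webcrawl.crawlers.archivebox.tests`)
# in addition to normal test discovery
if __name__ == "__main__":
    import unittest
    unittest.main()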