from mcp_server_webcrawl.crawlers.katana.crawler import KatanaCrawler
from mcp_server_webcrawl.crawlers.katana.adapter import KatanaManager
from mcp_server_webcrawl.models.resources import ResourceResultType
from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
from mcp_server_webcrawl.crawlers import get_fixture_directory
# calculate ids for test directories using the same hash function as adapter
EXAMPLE_SITE_ID = KatanaManager.string_to_id("example.com")
PRAGMAR_SITE_ID = KatanaManager.string_to_id("pragmar.com")
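# note: string_to_id is assumed to be a deterministic (pure) hash of the
# hostname, so the ids computed here match the ids the adapter assigned
# when it indexed the same fixture directories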

class KatanaTests(BaseCrawlerTests):
    """
    test suite for the HTTP text crawler implementation.
    tests parsing and retrieval of web content from HTTP text files.
    """

    def setUp(self):
        """
        set up the test environment with fixture data.
        """
        super().setUp()
        self._datasrc = get_fixture_directory() / "katana"
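        # the fixture layout is assumed to mirror Katana's stored-response
        # output: one subdirectory per crawled host under fixtures/katana/,
        # e.g. fixtures/katana/example.com/ and fixtures/katana/pragmar.com/,
        # each holding raw HTTP request/response text files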

    def test_katana_pulse(self):
        """
        basic crawler initialization.
        """
        crawler = KatanaCrawler(self._datasrc)
        self.assertIsNotNone(crawler)
        self.assertTrue(self._datasrc.is_dir())

    def test_katana_sites(self):
        """
        site retrieval API functionality.
        """
        crawler = KatanaCrawler(self._datasrc)
        # all sites
        sites_json = crawler.get_sites_api()
        self.assertTrue(sites_json.total >= 2)
        # single site
        site_one_json = crawler.get_sites_api(ids=[EXAMPLE_SITE_ID])
        self.assertTrue(site_one_json.total == 1)
        # site with fields
        site_field_json = crawler.get_sites_api(ids=[PRAGMAR_SITE_ID], fields=["created", "modified"])
        site_field_result = site_field_json._results[0].to_dict()
        self.assertTrue("created" in site_field_result)
        self.assertTrue("modified" in site_field_result)

    def test_katana_resources(self):
        """
        resource retrieval API functionality with various parameters.
        """
        crawler = KatanaCrawler(self._datasrc)
        # basic resource retrieval
        resources_json = crawler.get_resources_api()
        self.assertTrue(resources_json.total > 0)
        # query parameter for content search
        query_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            query="qbit",
            fields=["content", "headers"]
        )
        self.assertTrue(query_resources.total > 0, "Search query should return results")
        # verify search term exists in returned resources
        for resource in query_resources._results:
            resource_dict = resource.to_dict()
            found = False
            for field, value in resource_dict.items():
                if isinstance(value, str) and "qbit" in value.lower():
                    found = True
                    break
            self.assertTrue(found, f"Search term not found in any field of resource {resource.id}")
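        # the scan above checks every string field rather than just content,
        # since the fulltext index is assumed to cover url and headers as well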
        # resource ID filtering
        if resources_json.total > 0:
            first_resource = resources_json._results[0]
            id_resources = crawler.get_resources_api(
                sites=[first_resource.site],
                ids=[first_resource.id]
            )
            self.assertEqual(id_resources.total, 1)
            self.assertEqual(id_resources._results[0].id, first_resource.id)
        # site filtering
        site_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID])
        self.assertTrue(site_resources.total > 0, "Site filtering should return results")
        for resource in site_resources._results:
            self.assertEqual(resource.site, PRAGMAR_SITE_ID)
        # type filtering for HTML pages
        html_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            types=[ResourceResultType.PAGE.value]
        )
        self.assertTrue(html_resources.total > 0, "HTML filtering should return results")
        for resource in html_resources._results:
            self.assertEqual(resource.type, ResourceResultType.PAGE)
        # type filtering for multiple resource types
        mixed_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            types=[ResourceResultType.PAGE.value, ResourceResultType.SCRIPT.value]
        )
        if mixed_resources.total > 0:
            types_found = {r.type for r in mixed_resources._results}
            self.assertTrue(
                len(types_found) > 0,
                "Should find at least one of the requested resource types"
            )
            for resource_type in types_found:
                self.assertIn(
                    resource_type,
                    [ResourceResultType.PAGE, ResourceResultType.SCRIPT]
                )
        # custom fields in response
        custom_fields = ["content", "headers", "time"]
        field_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            fields=custom_fields
        )
        self.assertTrue(field_resources.total > 0)
        resource_dict = field_resources._results[0].to_dict()
        for field in custom_fields:
            self.assertIn(field, resource_dict, f"Field '{field}' should be in response")
        # URL sorting (ascending)
        asc_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], sort="+url")
        if asc_resources.total > 1:
            self.assertTrue(asc_resources._results[0].url <= asc_resources._results[1].url)
        # URL sorting (descending)
        desc_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], sort="-url")
        if desc_resources.total > 1:
            self.assertTrue(desc_resources._results[0].url >= desc_resources._results[1].url)
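        # sort syntax used throughout: "+field" ascending, "-field"
        # descending, and "?" for random order (see test_katana_random_sort)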
        # pagination (limit)
        limit_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], limit=3)
        self.assertTrue(len(limit_resources._results) <= 3)
        # pagination (offset): compare against the unoffset first page of the
        # same site-filtered query, guarding against an empty offset page
        offset_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], offset=2, limit=2)
        self.assertTrue(len(offset_resources._results) <= 2)
        if site_resources.total > 4 and offset_resources._results:
            self.assertNotEqual(
                site_resources._results[0].id,
                offset_resources._results[0].id,
                "Offset results should differ from first page"
            )
        # status code filtering
        status_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], statuses=[200])
        self.assertTrue(status_resources.total > 0, "Status filtering should return results")
        for resource in status_resources._results:
            self.assertEqual(resource.status, 200)
        # multiple status codes
        multi_status_resources = crawler.get_resources_api(statuses=[200, 404])
        if multi_status_resources.total > 0:
            found_statuses = {r.status for r in multi_status_resources._results}
            for status in found_statuses:
                self.assertIn(status, [200, 404])
        # combined filtering
        combined_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            query="style",
            types=[ResourceResultType.PAGE.value],
            fields=["content", "headers"],
            sort="+url",
            limit=3
        )
        if combined_resources.total > 0:
            for resource in combined_resources._results:
                self.assertEqual(resource.site, PRAGMAR_SITE_ID)
                self.assertEqual(resource.type, ResourceResultType.PAGE)
                resource_dict = resource.to_dict()
                self.assertIn("content", resource_dict)
                self.assertIn("headers", resource_dict)
        # multi-site search
        multisite_resources = crawler.get_resources_api(
            sites=[EXAMPLE_SITE_ID, PRAGMAR_SITE_ID],
            types=[ResourceResultType.PAGE.value],
            sort="+url",
            limit=100
        )
        self.assertTrue(multisite_resources.total > 0, "Multi-site search should return results")
        # verify we got results from both sites
        found_sites = {resource.site for resource in multisite_resources._results}
        self.assertEqual(
            len(found_sites),
            2,
            "Should have results from both sites"
        )
        self.assertIn(
            EXAMPLE_SITE_ID,
            found_sites,
            "Should have results from example.com"
        )
        self.assertIn(
            PRAGMAR_SITE_ID,
            found_sites,
            "Should have results from pragmar.com"
        )

    def test_katana_random_sort(self):
        """
        random sort functionality using the '?' sort parameter.
        """
        crawler = KatanaCrawler(self._datasrc)
        random1_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], sort="?", limit=20)
        self.assertTrue(random1_resources.total > 0, "Database should contain resources")
        random1_ids = [r.id for r in random1_resources._results]
        random2_resources = crawler.get_resources_api(sites=[PRAGMAR_SITE_ID], sort="?", limit=20)
        self.assertTrue(random2_resources.total > 0, "Random sort should return results")
        random2_ids = [r.id for r in random2_resources._results]
        if random2_resources.total >= 10:
            self.assertNotEqual(
                random1_ids,
                random2_ids,
                f"Two random sorts should produce different orderings.\nFirst: {random1_ids}\nSecond: {random2_ids}"
            )
        else:
            print(f"Skip randomness verification: Not enough resources ({random2_resources.total})")

    def test_katana_content_parsing(self):
        """
        content type detection and parsing for HTTP text files.
        """
        crawler = KatanaCrawler(self._datasrc)
        # HTML content detection
        html_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            types=[ResourceResultType.PAGE.value],
            fields=["content", "headers"]
        )
        self.assertTrue(html_resources.total > 0, "Should find HTML resources")
        for resource in html_resources._results:
            resource_dict = resource.to_dict()
            if "content" in resource_dict and resource_dict["content"]:
                self.assertTrue(
                    "<!DOCTYPE html>" in resource_dict["content"] or
                    "<html" in resource_dict["content"],
                    f"HTML content should contain HTML markup: {resource.url}"
                )
            if "headers" in resource_dict and resource_dict["headers"]:
                self.assertTrue(
                    "Content-Type:" in resource_dict["headers"],
                    f"Headers should contain Content-Type: {resource.url}"
                )
        # script content detection
        script_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            types=[ResourceResultType.SCRIPT.value],
            fields=["content", "headers"]
        )
        if script_resources.total > 0:
            for resource in script_resources._results:
                self.assertEqual(resource.type, ResourceResultType.SCRIPT)
        # css content detection
        css_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            types=[ResourceResultType.CSS.value],
            fields=["content", "headers"]
        )
        if css_resources.total > 0:
            for resource in css_resources._results:
                self.assertEqual(resource.type, ResourceResultType.CSS)
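

if __name__ == "__main__":
    # convenience hook for running this module directly; assumes
    # BaseCrawlerTests ultimately derives from unittest.TestCase
    import unittest
    unittest.main()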