from mcp_server_webcrawl.crawlers.httrack.crawler import HtTrackCrawler
from mcp_server_webcrawl.crawlers.httrack.adapter import HtTrackManager
from mcp_server_webcrawl.crawlers.base.tests import BaseCrawlerTests
from mcp_server_webcrawl.crawlers import get_fixture_directory
from mcp_server_webcrawl.utils.logger import get_logger

logger = get_logger()

# Calculate using same hash function as adapter
EXAMPLE_SITE_ID = HtTrackManager.string_to_id("example")
PRAGMAR_SITE_ID = HtTrackManager.string_to_id("pragmar")
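# (string_to_id is a deterministic hash, so these constants remain stable across runs)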


class HtTrackTests(BaseCrawlerTests):
    """
    Test suite for the HTTrack crawler implementation.
    Uses all wrapped test methods from BaseCrawlerTests plus HTTrack-specific features.
    """

    def setUp(self):
        """
        Set up the test environment with fixture data.
        """
        super().setUp()
        self._datasrc = get_fixture_directory() / "httrack"

    def test_httrack_pulse(self):
        """
        Test basic crawler initialization.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.assertIsNotNone(crawler)
        self.assertTrue(self._datasrc.is_dir())

    def test_httrack_sites(self):
        """
        Test site retrieval API functionality.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_pragmar_site_tests(crawler, PRAGMAR_SITE_ID)

    def test_httrack_search(self):
        """
        Test boolean search functionality.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_pragmar_search_tests(crawler, PRAGMAR_SITE_ID)

    def test_httrack_resources(self):
        """
        Test resource retrieval API functionality with various arguments.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_sites_resources_tests(crawler, PRAGMAR_SITE_ID, EXAMPLE_SITE_ID)

    def test_httrack_images(self):
        """
        Test HTTrack image handling and thumbnails.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_pragmar_image_tests(crawler, PRAGMAR_SITE_ID)

    def test_httrack_sorts(self):
        """
        Test random sort functionality using the sort argument.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_pragmar_sort_tests(crawler, PRAGMAR_SITE_ID)

    def test_httrack_content_parsing(self):
        """
        Test content type detection and parsing.
        """
        crawler = HtTrackCrawler(self._datasrc)
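        # the bare boolean is forwarded to the shared content tests; its exact
        # meaning is defined by BaseCrawlerTests.run_pragmar_content_tests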
        self.run_pragmar_content_tests(crawler, PRAGMAR_SITE_ID, False)

    def test_httrack_tokenizer(self):
        """
        Test HTTrack-specific tokenizer functionality for hyphenated terms.
        """
        crawler = HtTrackCrawler(self._datasrc)
        self.run_pragmar_tokenizer_tests(crawler, PRAGMAR_SITE_ID)

    def test_httrack_log_parsing_features(self):
        """
        Test HTTrack-specific features related to hts-log.txt parsing.
        """
        crawler = HtTrackCrawler(self._datasrc)
        # Test that 404 errors from log are properly indexed
        error_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            query="status: 404"
        )
        if error_resources.total > 0:
            for resource in error_resources._results:
                self.assertEqual(resource.status, 404, "404 status should be preserved from log parsing")
        # Test that redirects are properly indexed
        redirect_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            query="status: 302"
        )
        if redirect_resources.total > 0:
            for resource in redirect_resources._results:
                self.assertEqual(resource.status, 302, "Redirect status should be detected from log")
        # Test successful resources default to 200
        success_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            query="status: 200",
            limit=5
        )
        self.assertTrue(success_resources.total > 0, "Should have successful resources with status 200")
        for resource in success_resources._results:
            self.assertEqual(resource.status, 200)

    def test_httrack_url_reconstruction(self):
        """
        Test HTTrack URL reconstruction from project and domain structure.
        """
        crawler = HtTrackCrawler(self._datasrc)
        # Get all resources to test URL patterns
        all_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            limit=10
        )
        self.assertTrue(all_resources.total > 0, "Should have resources with reconstructed URLs")
        for resource in all_resources._results:
            # URLs should be properly formatted
            self.assertTrue(resource.url.startswith("https://"),
                f"URL should start with https://: {resource.url}")
            # URLs should not contain file system artifacts
            self.assertNotIn("\\", resource.url, "URLs should not contain backslashes")
            self.assertNotIn("hts-", resource.url, "URLs should not contain HTTrack artifacts")

    def test_httrack_domain_detection(self):
        """
        Test HTTrack domain directory detection and multi-domain handling.
        """
        crawler = HtTrackCrawler(self._datasrc)
        sites_result = crawler.get_sites_api()
        self.assertTrue(sites_result.total > 0, "Should detect HTTrack project directories as sites")
        specific_site = crawler.get_sites_api(ids=[PRAGMAR_SITE_ID])
        if specific_site.total > 0:
            site_data = specific_site._results[0].to_dict()
            self.assertIn("url", site_data, "Site should have reconstructed URL")
            self.assertTrue(site_data["url"].startswith("https://"), "Site URL should be properly formatted")

    def test_httrack_file_exclusion(self):
        """
        Test that HTTrack-generated files are properly excluded.
        """
        crawler = HtTrackCrawler(self._datasrc)
        # Search for any resources that might be HTTrack artifacts
        all_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            query="",
            limit=50
        )
        for resource in all_resources._results:
            # Should not find project-level index.html (HTTrack-generated)
            if resource.url.endswith("/index.html"):
                # This should be domain-level index.html, not project-level
                self.assertNotEqual(resource.url, "https://pragmar/index.html",
                    "Should not index project-level HTTrack-generated index.html")
            # Should not find hts-log.txt as a resource
            self.assertNotIn("hts-log.txt", resource.url, "Should not index hts-log.txt as resource")
            self.assertNotIn("hts-cache", resource.url, "Should not index hts-cache contents as resources")

    def test_httrack_advanced_features(self):
        """
        Test HTTrack-specific advanced features not covered by base tests.
        """
        crawler = HtTrackCrawler(self._datasrc)
        # Test field retrieval with HTTrack-specific metadata
        field_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            query="type: html",
            fields=["content", "headers", "created", "modified"],
            limit=3
        )
        if field_resources.total > 0:
            resource_dict = field_resources._results[0].to_dict()
            # Test timestamps from file system
            self.assertIn("created", resource_dict, "Should have created timestamp from file stat")
            self.assertIn("modified", resource_dict, "Should have modified timestamp from file stat")
            # Test headers generation
            if "headers" in resource_dict and resource_dict["headers"]:
                headers = resource_dict["headers"]
                self.assertIn("Content-Type:", headers, "Should have generated Content-Type header")
                self.assertIn("Content-Length:", headers, "Should have generated Content-Length header")
        # Test that resources have proper size information
        size_resources = crawler.get_resources_api(
            sites=[PRAGMAR_SITE_ID],
            fields=["size"],
            limit=5
        )
        if size_resources.total > 0:
            for resource in size_resources._results:
                resource_dict = resource.to_dict()
                self.assertIn("size", resource_dict, "Resource should have size field")
                self.assertGreaterEqual(resource_dict["size"], 0, "Size should be non-negative")

    def test_report(self):
        """
        Run the test report and save it to the data directory.
        """
        crawler = HtTrackCrawler(self._datasrc)
        logger.info(self.run_pragmar_report(crawler, PRAGMAR_SITE_ID, "HTTrack"))
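

if __name__ == "__main__":
    # convenience runner sketch: assumes BaseCrawlerTests ultimately derives
    # from unittest.TestCase, as the setUp/assert* usage above implies
    import unittest
    unittest.main()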