"""Source code for mcp_server_webcrawl.crawlers."""


import importlib
import sys
from pathlib import Path

from mcp_server_webcrawl.settings import FIXTURES_DIRECTORY

# canonical crawler names accepted by get_crawler(); matching is case-insensitive
VALID_CRAWLER_CHOICES: list[str] = ["archivebox", "httrack", "interrobot", "katana", "siteone", "warc", "wget"]

def get_fixture_directory() -> Path:
    """
    Return the directory containing test fixtures.

    Only to be used by developers on test runs; configured in
    settings_local.py (added as a sibling of settings.py if not present).
    Download fixtures from
    https://github.com/pragmar/mcp-server-webcrawl-fixtures

    Returns:
        Path to the configured fixtures directory.

    Raises:
        RuntimeError: if FIXTURES_DIRECTORY is unset or not a directory.
    """
    # raise explicitly rather than assert: asserts are stripped under
    # python -O, which would silently return an invalid (even None) path
    if FIXTURES_DIRECTORY is None or not FIXTURES_DIRECTORY.is_dir():
        raise RuntimeError(
            "Fixtures not configured in settings_local.py, or is not a valid directory.\n"
            f"FIXTURES_DIRECTORY: {FIXTURES_DIRECTORY}"
        )
    return FIXTURES_DIRECTORY
[docs] def get_crawler(crawler_name: str) -> str | None: """ lazy load crawler, some classes have additional package dependencies """ if crawler_name is None: return None crawler_name = crawler_name.lower() if crawler_name == "archivebox": from mcp_server_webcrawl.crawlers.archivebox.crawler import ArchiveBoxCrawler return ArchiveBoxCrawler elif crawler_name == "httrack": from mcp_server_webcrawl.crawlers.httrack.crawler import HtTrackCrawler return HtTrackCrawler elif crawler_name == "interrobot": from mcp_server_webcrawl.crawlers.interrobot.crawler import InterroBotCrawler return InterroBotCrawler elif crawler_name == "katana": from mcp_server_webcrawl.crawlers.katana.crawler import KatanaCrawler return KatanaCrawler elif crawler_name == "siteone": from mcp_server_webcrawl.crawlers.siteone.crawler import SiteOneCrawler return SiteOneCrawler elif crawler_name == "warc": from mcp_server_webcrawl.crawlers.warc.crawler import WarcCrawler return WarcCrawler elif crawler_name == "wget": from mcp_server_webcrawl.crawlers.wget.crawler import WgetCrawler return WgetCrawler else: valid_choices = ", ".join(VALID_CRAWLER_CHOICES) raise ValueError(f"unsupported crawler '{crawler_name}' ({valid_choices})")