Source code for mcp_server_webcrawl.crawlers.base.tests

import sys
import unittest
import asyncio

from typing import Final
from datetime import datetime
from logging import Logger

from mcp_server_webcrawl.crawlers.base.crawler import BaseCrawler
from mcp_server_webcrawl.models.resources import ResourceResultType
from mcp_server_webcrawl.crawlers.base.api import BaseJsonApi
from mcp_server_webcrawl.utils.logger import get_logger

logger: Logger = get_logger()


class BaseCrawlerTests(unittest.TestCase):

    __PRAGMAR_PRIMARY_KEYWORD: Final[str] = "crawler"
    __PRAGMAR_SECONDARY_KEYWORD: Final[str] = "privacy"
    __PRAGMAR_HYPHENATED_KEYWORD: Final[str] = "one-click"
    def setUp(self):
        # quiet asyncio error on tests, occurring after successful completion
        if sys.platform == "win32":
            asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    def run_pragmar_search_tests(self, crawler: BaseCrawler, site_id: int):
        """
        Run a battery of database checks on the crawler, including Boolean
        query validation.
        """
        resources_json = crawler.get_resources_api()
        self.assertTrue(resources_json.total > 0, "Should have some resources in database")
        site_resources = crawler.get_resources_api(sites=[site_id])
        self.assertTrue(site_resources.total > 0, "Pragmar site should have resources")
        primary_resources = crawler.get_resources_api(
            sites=[site_id],
            query=self.__PRAGMAR_PRIMARY_KEYWORD,
            fields=["content", "headers"],
            limit=1,
        )
        self.assertTrue(primary_resources.total > 0,
                f"Keyword '{self.__PRAGMAR_PRIMARY_KEYWORD}' should return results")
        secondary_resources = crawler.get_resources_api(
            sites=[site_id],
            query=self.__PRAGMAR_SECONDARY_KEYWORD,
            limit=1,
        )
        self.assertTrue(secondary_resources.total > 0,
                f"Keyword '{self.__PRAGMAR_SECONDARY_KEYWORD}' should return results")
        self.__run_pragmar_search_tests_fulltext(crawler, site_id, site_resources)
        self.__run_pragmar_search_tests_field_status(crawler, site_id)
        self.__run_pragmar_search_tests_field_headers(crawler, site_id)
        self.__run_pragmar_search_tests_field_content(crawler, site_id)
        self.__run_pragmar_search_tests_field_type(crawler, site_id, site_resources)
        self.__run_pragmar_search_tests_extras(crawler, site_id, site_resources,
                primary_resources, secondary_resources)
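    # Usage sketch (illustrative only): a concrete crawler's test case would
    # drive the battery above roughly like this. The crawler class and fixture
    # path are assumptions for illustration, not part of this module.
    #
    #   crawler = WarcCrawler(Path("fixtures/warc"))     # hypothetical subclass
    #   sites = crawler.get_sites_api()
    #   pragmar_site_id = sites._results[0].id
    #   self.run_pragmar_search_tests(crawler, pragmar_site_id)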
    def run_pragmar_image_tests(self, crawler: BaseCrawler, pragmar_site_id: int):
        """
        Test InterroBot-specific image handling and thumbnails.
        """
        img_results = crawler.get_resources_api(sites=[pragmar_site_id], query="type: img", limit=5)
        self.assertTrue(img_results.total > 0, "Image type filter should return results")
        self.assertTrue(
            all(r.type.value == "img" for r in img_results._results),
            "All filtered resources should have type 'img'"
        )
    def run_sites_resources_tests(self, crawler: BaseCrawler, pragmar_site_id: int, example_site_id: int):
        # basic resource retrieval
        resources_json = crawler.get_resources_api()
        self.assertTrue(resources_json.total > 0, "Should have some resources in database")
        site_resources = crawler.get_resources_api(sites=[pragmar_site_id])
        self.assertTrue(site_resources.total > 0, "Pragmar site should have resources")
        # fulltext keyword search
        query_keyword1 = "privacy"
        timestamp_resources = crawler.get_resources_api(
            sites=[pragmar_site_id],
            query=query_keyword1,
            fields=["created", "modified", "time"],
            limit=5,
        )
        self.assertTrue(timestamp_resources.total > 0, "Search query should return results")
        for resource in timestamp_resources._results:
            resource_dict = resource.to_dict()
            self.assertIsNotNone(resource_dict["created"], "Created timestamp should not be None")
            self.assertIsNotNone(resource_dict["modified"], "Modified timestamp should not be None")
            self.assertIsNotNone(resource_dict["time"], "Time value should not be None")
        # resource ID filtering
        if resources_json.total > 0:
            first_resource = resources_json._results[0]
            id_resources = crawler.get_resources_api(
                sites=[first_resource.site],
                query=f"id: {first_resource.id}",
                limit=1,
            )
            self.assertEqual(id_resources.total, 1)
            self.assertEqual(id_resources._results[0].id, first_resource.id)
        # site filtering
        site_resources = crawler.get_resources_api(sites=[pragmar_site_id])
        self.assertTrue(site_resources.total > 0, "Site filtering should return results")
        for resource in site_resources._results:
            self.assertEqual(resource.site, pragmar_site_id)
        # type filtering for HTML pages
        html_resources = crawler.get_resources_api(
            sites=[pragmar_site_id],
            query=f"type: {ResourceResultType.PAGE.value}",
        )
        self.assertTrue(html_resources.total > 0, "HTML filtering should return results")
        for resource in html_resources._results:
            self.assertEqual(resource.type, ResourceResultType.PAGE)
        # type filtering for multiple resource types
        mixed_resources = crawler.get_resources_api(
            sites=[pragmar_site_id],
            query=f"type: {ResourceResultType.PAGE.value} OR type: {ResourceResultType.SCRIPT.value}",
        )
        if mixed_resources.total > 0:
            types_found = {r.type for r in mixed_resources._results}
            self.assertTrue(
                len(types_found) > 0,
                "Should find at least one of the requested resource types"
            )
            for resource_type in types_found:
                self.assertIn(
                    resource_type,
                    [ResourceResultType.PAGE, ResourceResultType.SCRIPT]
                )
        # custom fields in response
        custom_fields = ["content", "headers", "time"]
        field_resources = crawler.get_resources_api(
            query="type: html",
            sites=[pragmar_site_id],
            fields=custom_fields,
            limit=1,
        )
        self.assertTrue(field_resources.total > 0)
        resource_dict = field_resources._results[0].to_dict()
        for field in custom_fields:
            self.assertIn(field, resource_dict, f"Field '{field}' should be in response")
        # sorting, limits, and offsets
        asc_resources = crawler.get_resources_api(sites=[pragmar_site_id], sort="+url")
        if asc_resources.total > 1:
            self.assertTrue(asc_resources._results[0].url <= asc_resources._results[1].url)
        desc_resources = crawler.get_resources_api(sites=[pragmar_site_id], sort="-url")
        if desc_resources.total > 1:
            self.assertTrue(desc_resources._results[0].url >= desc_resources._results[1].url)
        limit_resources = crawler.get_resources_api(sites=[pragmar_site_id], limit=3)
        self.assertTrue(len(limit_resources._results) <= 3)
        offset_resources = crawler.get_resources_api(sites=[pragmar_site_id], offset=2, limit=2)
        self.assertTrue(len(offset_resources._results) <= 2)
        if resources_json.total > 4:
            self.assertNotEqual(
                resources_json._results[0].id,
                offset_resources._results[0].id,
                "Offset results should differ from first page"
            )
        # multi-site search, verify we got results from both sites
        multisite_resources = crawler.get_resources_api(
            sites=[example_site_id, pragmar_site_id],
            query=f"type: {ResourceResultType.PAGE.value}",
            sort="+url",
            limit=10,
        )
        found_sites = {resource.site for resource in multisite_resources._results}
        self.assertEqual(len(found_sites), 2, "Should have results from both sites")
    def run_pragmar_tokenizer_tests(self, crawler: BaseCrawler, site_id: int):
        """
        FTS hyphens and underscores are particularly challenging, and thus have
        a dedicated test. These must be configured in multiple places, including
        the CREATE TABLE ... tokenizer, and also handled by the query parser.
        """
        mcp_resources_keyword = crawler.get_resources_api(
            sites=[site_id],
            query='mcp-server-webcrawl',
            fields=[],
            limit=1,
        )
        mcp_resources_quoted = crawler.get_resources_api(
            sites=[site_id],
            query='"mcp-server-webcrawl"',
            fields=[],
            limit=1,
        )
        self.assertTrue(mcp_resources_keyword.total > 0, "Should find mcp-server-webcrawl in HTML")
        self.assertTrue(mcp_resources_quoted.total > 0, "Should find \"mcp-server-webcrawl\" (phrase) in HTML")
        self.assertTrue(mcp_resources_quoted.total == mcp_resources_keyword.total,
                "Quoted and unquoted equivalence expected")
        mcp_resources_wildcarded = crawler.get_resources_api(
            sites=[site_id],
            query='mcp*',
            fields=[],
            limit=1,
        )
        self.assertTrue(mcp_resources_wildcarded.total > 0, "Should find mcp-server-* in HTML")
        combo_and_resources_keyword = crawler.get_resources_api(
            sites=[site_id],
            query='mcp-server-webcrawl AND one-click',
            fields=[],
            limit=1,
        )
        combo_and_resources_quoted = crawler.get_resources_api(
            sites=[site_id],
            query='"mcp-server-webcrawl" AND "one-click"',
            fields=[],
            limit=1,
        )
        self.assertTrue(combo_and_resources_keyword.total > 0, "Should find mcp-server-webcrawl in HTML")
        self.assertTrue(combo_and_resources_quoted.total > 0, "Should find \"mcp-server-webcrawl\" (phrase) in HTML")
        self.assertTrue(combo_and_resources_keyword.total == combo_and_resources_quoted.total,
                "Quoted and unquoted equivalence expected")
        combo_or_resources_keyword = crawler.get_resources_api(
            sites=[site_id],
            query='mcp-server-webcrawl OR one-click',
            fields=[],
            limit=1,
        )
        combo_or_resources_quoted = crawler.get_resources_api(
            sites=[site_id],
            query='"mcp-server-webcrawl" OR "one-click"',
            fields=[],
            limit=1,
        )
        self.assertTrue(combo_or_resources_keyword.total > 0, "Should find mcp-server-webcrawl in HTML")
        self.assertTrue(combo_or_resources_quoted.total > 0, "Should find \"mcp-server-webcrawl\" (phrase) in HTML")
        self.assertTrue(combo_or_resources_keyword.total == combo_or_resources_quoted.total,
                "Quoted and unquoted equivalence expected")
        combo_not_resources_keyword = crawler.get_resources_api(
            sites=[site_id],
            query='mcp-server-webcrawl NOT one-click',
            fields=[],
            limit=1,
        )
        combo_not_resources_quoted = crawler.get_resources_api(
            sites=[site_id],
            query='"mcp-server-webcrawl" NOT "one-click"',
            fields=[],
            limit=1,
        )
        combo_and_not_resources_keyword = crawler.get_resources_api(
            sites=[site_id],
            query='mcp-server-webcrawl AND NOT one-click',
            fields=[],
            limit=1,
        )
        self.assertTrue(combo_not_resources_keyword.total > 0, "Should find mcp-server-webcrawl in HTML")
        self.assertTrue(combo_not_resources_quoted.total > 0, "Should find \"mcp-server-webcrawl\" (phrase) in HTML")
        self.assertTrue(combo_not_resources_keyword.total == combo_not_resources_quoted.total,
                "Quoted and unquoted equivalence expected")
        self.assertTrue(combo_not_resources_keyword.total == combo_and_not_resources_keyword.total,
                f"NOT ({combo_not_resources_keyword.total}) and AND NOT ({combo_and_not_resources_keyword.total}) equivalence expected")
        self.assertTrue(mcp_resources_keyword.total >= combo_and_resources_keyword.total,
                "Total records should be greater than or equal to ANDs.")
        self.assertTrue(mcp_resources_keyword.total <= combo_or_resources_keyword.total,
                "Total records should be less than or equal to ORs.")
        self.assertTrue(mcp_resources_keyword.total > combo_not_resources_keyword.total,
                "Total records should be greater than NOTs.")
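    # Illustrative sketch (assumption, not necessarily this project's actual
    # schema): for hyphenated tokens like "mcp-server-webcrawl" to survive FTS5
    # tokenization intact, '-' and '_' must be declared as token characters in
    # the virtual table definition, e.g.:
    #
    #   CREATE VIRTUAL TABLE ResourcesFullText USING fts5(
    #       url, content,
    #       tokenize = "unicode61 tokenchars '-_'"
    #   );
    #
    # The query parser must then quote or escape hyphenated terms so that FTS5
    # does not misread the hyphen as query syntax; the tests above check that
    # quoted and unquoted forms behave equivalently once both layers agree.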
    def run_pragmar_site_tests(self, crawler: BaseCrawler, site_id: int):
        # all sites
        sites_json = crawler.get_sites_api()
        self.assertTrue(sites_json.total >= 2)
        # single site
        site_one_json = crawler.get_sites_api(ids=[site_id])
        self.assertTrue(site_one_json.total == 1)
        # site with fields
        site_field_json = crawler.get_sites_api(ids=[site_id], fields=["created", "modified"])
        site_field_result = site_field_json._results[0].to_dict()
        self.assertTrue("created" in site_field_result)
        self.assertTrue("modified" in site_field_result)
    def run_pragmar_sort_tests(self, crawler: BaseCrawler, site_id: int):
        """
        Test sorting functionality with performance optimizations.
        """
        sorted_default = crawler.get_resources_api(sites=[site_id], limit=3, fields=[])
        sorted_url_ascending = crawler.get_resources_api(sites=[site_id], sort="+url", limit=3, fields=[])
        sorted_url_descending = crawler.get_resources_api(sites=[site_id], sort="-url", limit=3, fields=[])
        self.assertTrue(sorted_url_ascending.total > 0, "Database should contain resources")
        self.assertTrue(sorted_url_descending.total > 0, "Database should contain resources")
        if len(sorted_default._results) > 0 and len(sorted_url_ascending._results) > 0:
            default_urls = [r.url for r in sorted_default._results]
            ascending_urls = [r.url for r in sorted_url_ascending._results]
            self.assertEqual(default_urls, ascending_urls, "Default sort should match +url sort")
        sorted_size_ascending = crawler.get_resources_api(sites=[site_id], sort="+size", limit=3, fields=["size"])
        sorted_size_descending = crawler.get_resources_api(sites=[site_id], sort="-size", limit=3, fields=["size"])
        if len(sorted_url_ascending._results) > 1:
            for i in range(len(sorted_url_ascending._results) - 1):
                self.assertLessEqual(
                    sorted_url_ascending._results[i].url,
                    sorted_url_ascending._results[i + 1].url,
                    "URLs should be ascending")
        if len(sorted_url_descending._results) > 1:
            for i in range(len(sorted_url_descending._results) - 1):
                self.assertGreaterEqual(
                    sorted_url_descending._results[i].url,
                    sorted_url_descending._results[i + 1].url,
                    "URLs should be descending")
        if len(sorted_size_ascending._results) > 1:
            for i in range(len(sorted_size_ascending._results) - 1):
                self.assertLessEqual(
                    sorted_size_ascending._results[i].to_dict()["size"],
                    sorted_size_ascending._results[i + 1].to_dict()["size"],
                    "Sizes should be ascending")
        if len(sorted_size_descending._results) > 1:
            for i in range(len(sorted_size_descending._results) - 1):
                self.assertGreaterEqual(
                    sorted_size_descending._results[i].to_dict()["size"],
                    sorted_size_descending._results[i + 1].to_dict()["size"],
                    "Sizes should be descending")
        random_1 = crawler.get_resources_api(sites=[site_id], sort="?", limit=20, fields=[])
        random_2 = crawler.get_resources_api(sites=[site_id], sort="?", limit=20, fields=[])
        self.assertTrue(random_1.total > 0, "Random sort should return results")
        if random_1.total >= 10:
            self.assertNotEqual(
                [r.id for r in random_1._results],
                [r.id for r in random_2._results],
                "Random sort should produce different orders")
        else:
            logger.info(f"Skip randomness verification: Not enough resources ({random_1.total})")
    def run_pragmar_content_tests(self, crawler: BaseCrawler, site_id: int, html_leniency: bool):
        html_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: {ResourceResultType.PAGE.value}",
            fields=["content", "headers"]
        )
        self.assertTrue(html_resources.total > 0, "Should find HTML resources")
        for resource in html_resources._results:
            resource_dict = resource.to_dict()
            if "content" in resource_dict:
                # content is lowercased, so markers must be compared in lowercase
                content = resource_dict["content"].lower()
                self.assertTrue(
                    "<!doctype html>" in content or
                    "<html" in content or
                    "<meta" in content or
                    html_leniency,
                    f"HTML content should contain HTML markup: {resource.url}\n\n{resource.content}"
                )
            if "headers" in resource_dict and resource_dict["headers"]:
                self.assertTrue(
                    "Content-Type:" in resource_dict["headers"],
                    f"Headers should contain Content-Type: {resource.url}"
                )
        # script content detection
        script_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: {ResourceResultType.SCRIPT.value}",
            fields=["content", "headers"],
            limit=1,
        )
        if script_resources.total > 0:
            for resource in script_resources._results:
                self.assertEqual(resource.type, ResourceResultType.SCRIPT)
        # css content detection
        css_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: {ResourceResultType.CSS.value}",
            fields=["content", "headers"],
            limit=1,
        )
        if css_resources.total > 0:
            for resource in css_resources._results:
                self.assertEqual(resource.type, ResourceResultType.CSS)
    def run_pragmar_report(self, crawler: BaseCrawler, site_id: int, heading: str):
        """
        Generate a comprehensive report of all resources for a site.
        Returns a formatted string with counts and URLs by type.
        """
        site_resources = crawler.get_resources_api(
            sites=[site_id],
            query="",
            limit=100,
        )
        html_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: {ResourceResultType.PAGE.value}",
            limit=100,
        )
        css_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: {ResourceResultType.CSS.value}",
            limit=100,
        )
        js_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: {ResourceResultType.SCRIPT.value}",
            limit=100,
        )
        image_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: {ResourceResultType.IMAGE.value}",
            limit=100,
        )
        mcp_resources = crawler.get_resources_api(
            sites=[site_id],
            query="type: html AND (mcp)",
            limit=100,
        )
        report_lines = []
        sections = [
            ("Total pages", site_resources),
            ("Total HTML", html_resources),
            ("Total MCP search hits", mcp_resources),
            ("Total CSS", css_resources),
            ("Total JS", js_resources),
            ("Total Images", image_resources),
        ]
        for i, (section_name, resource_obj) in enumerate(sections):
            report_lines.append(f"{section_name}: {resource_obj.total}")
            for resource in resource_obj._results:
                report_lines.append(resource.url)
            if i < len(sections) - 1:
                report_lines.append("")
        now = datetime.now()
        lines_together = "\n".join(report_lines)
        return f"""
**********************************************************************************
* {heading} {now.isoformat()} *
**********************************************************************************
{lines_together}
"""
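    # Example report shape (illustrative; counts, dates, and URLs depend
    # entirely on the crawl fixture in use):
    #
    #   **********************************************************************************
    #   * WarcTests 2025-01-01T00:00:00 *
    #   **********************************************************************************
    #   Total pages: 82
    #   https://pragmar.com/
    #   ...
    #
    #   Total HTML: 34
    #   ...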
    def __run_pragmar_search_tests_field_status(self, crawler: BaseCrawler, site_id: int) -> None:
        # status code filtering
        status_resources = crawler.get_resources_api(
            sites=[site_id],
            query="status: 200",
            limit=5,
        )
        self.assertTrue(status_resources.total > 0, "Status filtering should return results")
        for resource in status_resources._results:
            self.assertEqual(resource.status, 200)
        # status code filtering combined with url
        appstat_resources = crawler.get_resources_api(
            sites=[site_id],
            query="status: 200 AND url: https://pragmar.com/appstat*",
            limit=5,
        )
        self.assertTrue(appstat_resources.total > 0, "Status filtering should return results")
        self.assertGreaterEqual(len(appstat_resources._results), 3,
                f"Unexpected page count\n{len(appstat_resources._results)}")
        # multiple status codes
        multi_status_resources = crawler.get_resources_api(
            query="status: 200 OR status: 404",
        )
        if multi_status_resources.total > 0:
            found_statuses = {r.status for r in multi_status_resources._results}
            for status in found_statuses:
                self.assertIn(status, [200, 404])

    def __run_pragmar_search_tests_field_headers(self, crawler: BaseCrawler, site_id: int) -> None:
        # supported crawls only (genuine headers data)
        if self.__class__.__name__ not in ("InterroBotTests", "KatanaTests", "WarcTests"):
            return
        appstat_any = crawler.get_resources_api(
            sites=[site_id],
            query="appstat",
            extras=[],
            limit=1,
        )
        appstat_headers_js = crawler.get_resources_api(
            sites=[site_id],
            query="appstat AND headers: javascript",
            extras=[],
            limit=1,
        )
        # https://pragmar.com/media/static/scripts/js/appstat.min.js
        self.assertEqual(appstat_headers_js.total, 1,
                "Should have exactly one resource in database (appstat.min.js)")
        appstat_headers_nojs = crawler.get_resources_api(
            sites=[site_id],
            query="appstat NOT headers: javascript",
            extras=[],
            limit=1,
        )
        self.assertGreater(appstat_headers_nojs.total, 1,
                "Should have many appstat non-js resources in database")
        appstat_sum: int = appstat_headers_js.total + appstat_headers_nojs.total
        self.assertEqual(appstat_sum, appstat_any.total,
                "appstat non-js + js resources should sum to all appstat")

    def __run_pragmar_search_tests_field_content(self, crawler: BaseCrawler, site_id: int) -> None:
        mcp_any = crawler.get_resources_api(
            sites=[site_id],
            query="mcp",
            extras=[],
            limit=1,
        )
        mcp_content_configuration = crawler.get_resources_api(
            sites=[site_id],
            query="mcp AND content: configuration",
            extras=[],
            limit=1,
        )
        # https://pragmar.com/mcp-server-webcrawl/
        self.assertGreaterEqual(mcp_content_configuration.total, 1,
                "Should have one, possibly more resources (mcp-server-webcrawl)")
        mcp_content_no_configuration = crawler.get_resources_api(
            sites=[site_id],
            query="mcp NOT content: configuration",
            extras=[],
            limit=1,
        )
        self.assertGreater(mcp_content_no_configuration.total, 1,
                "Should have many mcp non-configuration resources")
        mcp_sum: int = mcp_content_configuration.total + mcp_content_no_configuration.total
        self.assertEqual(mcp_sum, mcp_any.total,
                "mcp non-config + config resources should sum to all mcp")
        mcp_html_content_config = crawler.get_resources_api(
            sites=[site_id],
            query="type: html AND mcp AND content: configuration",
            extras=[],
            limit=1,
        )
        self.assertTrue(
            mcp_html_content_config.total <= mcp_content_configuration.total,
            "Adding type constraint should not increase results"
        )
        wildcard_content_search = crawler.get_resources_api(
            sites=[site_id],
            query='content: config*',
            extras=[],
            limit=1,
        )
        exact_config_search = crawler.get_resources_api(
            sites=[site_id],
            query='content: configuration',
            extras=[],
            limit=1,
        )
        self.assertTrue(
            wildcard_content_search.total >= exact_config_search.total,
            "Wildcard content search should return at least as many results as exact match"
        )

    def __run_pragmar_search_tests_field_type(self, crawler: BaseCrawler, site_id: int,
            site_resources: BaseJsonApi) -> None:
        html_resources = crawler.get_resources_api(
            sites=[site_id],
            query="type: html",
            extras=[],
            limit=1,
        )
        # page count varies by crawler, 10 is a conservative low end
        self.assertGreater(html_resources.total, 10, "Should have more than 10 HTML resources")
        not_html_resources = crawler.get_resources_api(
            sites=[site_id],
            query="NOT type: html",
            extras=[],
            limit=1,
        )
        self.assertGreater(not_html_resources.total, 10, "Should have more than 10 non-HTML resources")
        html_sum: int = html_resources.total + not_html_resources.total
        self.assertEqual(html_sum, site_resources.total, "HTML + non-HTML should sum to all resources")
        # keyword + type combination
        appstat_any = crawler.get_resources_api(
            sites=[site_id],
            query="appstat",
            extras=[],
            limit=1,
        )
        appstat_script = crawler.get_resources_api(
            sites=[site_id],
            query="appstat AND type: script",
            extras=[],
            limit=1,
        )
        # https://pragmar.com/media/static/scripts/js/appstat.min.js
        self.assertEqual(appstat_script.total, 1, "Should have exactly one appstat script (appstat.min.js)")
        appstat_not_script = crawler.get_resources_api(
            sites=[site_id],
            query="appstat NOT type: script",
            extras=[],
            limit=1,
        )
        self.assertGreater(appstat_not_script.total, 1, "Should have many appstat non-script resources")
        appstat_sum: int = appstat_script.total + appstat_not_script.total
        self.assertEqual(appstat_sum, appstat_any.total,
                "appstat script + non-script should sum to all appstat")
        # type OR combinations
        html_or_img = crawler.get_resources_api(
            sites=[site_id],
            query="type: html OR type: img",
            extras=[],
            limit=1,
        )
        self.assertGreater(html_or_img.total, 20, "HTML OR IMG should return more than 20 resources")
        img_resources = crawler.get_resources_api(
            sites=[site_id],
            query="type: img",
            extras=[],
            limit=1,
        )
        self.assertTrue(
            html_or_img.total >= html_resources.total,
            "OR should include all HTML resources"
        )
        self.assertTrue(
            html_or_img.total >= img_resources.total,
            "OR should include all IMG resources"
        )
        # combined filtering
        combined_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"style AND type: {ResourceResultType.PAGE.value}",
            fields=[],
            sort="+url",
            limit=3,
        )
        if combined_resources.total > 0:
            for resource in combined_resources._results:
                self.assertEqual(resource.site, site_id)
                self.assertEqual(resource.type, ResourceResultType.PAGE)
    def __run_pragmar_search_tests_fulltext(
        self,
        crawler: BaseCrawler,
        site_id: int,
        site_resources: BaseJsonApi,
    ) -> None:
        # Boolean workout: result counts are fragile, intersections should not be;
        # counts are worth the fragility, for now
        boolean_primary_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: html AND ({self.__PRAGMAR_PRIMARY_KEYWORD})",
            limit=4,
        )
        # varies by crawler, katana doesn't crawl /help/ depth by default
        self.assertTrue(boolean_primary_resources.total > 0,
                f"Primary search returned {boolean_primary_resources.total}, expected results")
        boolean_secondary_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: html AND ({self.__PRAGMAR_SECONDARY_KEYWORD})",
            limit=12,
        )
        # re: all these > 0 checks, result counts vary by crawler;
        # all have default crawl behaviors/depths/externals
        self.assertTrue(boolean_secondary_resources.total > 0,
                f"Secondary returned {boolean_secondary_resources.total}, expected results")
        # AND
        primary_and_secondary_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: html AND ({self.__PRAGMAR_PRIMARY_KEYWORD} AND {self.__PRAGMAR_SECONDARY_KEYWORD})",
            limit=1,
        )
        self.assertTrue(primary_and_secondary_resources.total >= 0,
                f"Primary AND Secondary returned {primary_and_secondary_resources.total}, expected results")
        # OR
        primary_or_secondary_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: html AND ({self.__PRAGMAR_PRIMARY_KEYWORD} OR {self.__PRAGMAR_SECONDARY_KEYWORD})",
            limit=1,
        )
        self.assertTrue(primary_or_secondary_resources.total > 0,
                f"Primary OR Secondary returned {primary_or_secondary_resources.total}, expected results (union)")
        # NOT
        primary_not_secondary_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: html AND ({self.__PRAGMAR_PRIMARY_KEYWORD} NOT {self.__PRAGMAR_SECONDARY_KEYWORD})",
            limit=1,
        )
        secondary_not_primary_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: html AND ({self.__PRAGMAR_SECONDARY_KEYWORD} NOT {self.__PRAGMAR_PRIMARY_KEYWORD})",
            limit=1,
        )
        self.assertTrue(secondary_not_primary_resources.total >= 0,
                f"Secondary NOT Primary returned {secondary_not_primary_resources.total}, expected results")
        # logical relationships
        self.assertEqual(
            primary_and_secondary_resources.total,
            boolean_primary_resources.total + boolean_secondary_resources.total - primary_or_secondary_resources.total,
            "Intersection should equal A + B - Union (inclusion-exclusion principle)"
        )
        self.assertEqual(
            primary_not_secondary_resources.total + primary_and_secondary_resources.total,
            boolean_primary_resources.total,
            "Primary NOT Secondary + Primary AND Secondary should equal total Primary results"
        )
        self.assertEqual(
            secondary_not_primary_resources.total + primary_and_secondary_resources.total,
            boolean_secondary_resources.total,
            "Secondary NOT Primary + Primary AND Secondary should equal total Secondary results"
        )
        self.assertEqual(
            primary_not_secondary_resources.total + secondary_not_primary_resources.total + primary_and_secondary_resources.total,
            primary_or_secondary_resources.total,
            "Sum of exclusive sets plus intersection should equal union"
        )
        # complex boolean with field constraints
        primary_and_html_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: html AND ({self.__PRAGMAR_PRIMARY_KEYWORD})",
            limit=1,
        )
        self.assertTrue(primary_and_html_resources.total > 0,
                f"Primary AND type:html returned {primary_and_html_resources.total}, expected results")
        self.assertTrue(
            primary_and_html_resources.total <= boolean_primary_resources.total,
            "Adding AND constraints should not increase result count"
        )
        # parentheses grouping
        grouped_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"type: html AND ({self.__PRAGMAR_PRIMARY_KEYWORD} OR {self.__PRAGMAR_SECONDARY_KEYWORD})",
            limit=1,
        )
        self.assertTrue(grouped_resources.total > 0,
                f"Grouped OR with HTML filter returned {grouped_resources.total}, expected results")
        hyphenated_resources = crawler.get_resources_api(
            sites=[site_id],
            query=self.__PRAGMAR_HYPHENATED_KEYWORD,
            limit=1,
        )
        self.assertTrue(hyphenated_resources.total > 0,
                f"Keyword '{self.__PRAGMAR_HYPHENATED_KEYWORD}' should return results")
        double_or_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"({self.__PRAGMAR_PRIMARY_KEYWORD} OR {self.__PRAGMAR_SECONDARY_KEYWORD} OR moffitor)"
        )
        self.assertGreater(
            double_or_resources.total, 0,
            "OR query should return some results"
        )
        self.assertLess(
            double_or_resources.total, site_resources.total,
            "OR query should return fewer than all results"
        )
        parens_or_and_resources = crawler.get_resources_api(
            sites=[site_id],
            query=f"({self.__PRAGMAR_PRIMARY_KEYWORD} OR {self.__PRAGMAR_SECONDARY_KEYWORD}) AND collaborations"
        )
        # respect the AND, there should be only one result
        # (A OR B) AND C vs. A OR B AND C
        self.assertEqual(
            parens_or_and_resources.total, 1,
            "(A OR B) AND C should be 1 result (AND collaborations, unless fixture changed)"
        )
        parens_or_and_resources_reverse = crawler.get_resources_api(
            sites=[site_id],
            query=f"collaborations AND ({self.__PRAGMAR_PRIMARY_KEYWORD} OR {self.__PRAGMAR_SECONDARY_KEYWORD})"
        )
        # respect the AND, there should be only one result
        # (A OR B) AND C vs. A OR B AND C
        self.assertEqual(
            parens_or_and_resources_reverse.total, 1,
            "A AND (B OR C) should be 1 result (collaborations AND, unless fixture changed)"
        )
        wide_type_resources = crawler.get_resources_api(
            sites=[site_id],
            query="type: script OR type: style OR type: iframe OR type: font OR type: text OR type: rss OR type: other"
        )
        self.assertLess(
            wide_type_resources.total, site_resources.total,
            "A long chained OR should not return all results"
        )
        self.assertGreater(
            wide_type_resources.total, 0,
            "A long chained OR should return some results"
        )
        complex_and = crawler.get_resources_api(
            sites=[site_id],
            query=f"{self.__PRAGMAR_PRIMARY_KEYWORD} AND type:html AND status:200"
        )
        self.assertTrue(complex_and.total <= boolean_primary_resources.total,
                "Adding AND conditions should not increase results")
        grouped_or = crawler.get_resources_api(
            sites=[site_id],
            query=f"({self.__PRAGMAR_PRIMARY_KEYWORD} OR {self.__PRAGMAR_SECONDARY_KEYWORD}) AND type:html AND status:200"
        )
        self.assertTrue(grouped_or.total <= primary_or_secondary_resources.total,
                "Adding AND conditions to OR should not increase results")
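    # Worked example of the inclusion-exclusion identity asserted above
    # (illustrative numbers only):
    #
    #   A = {1, 2, 3}       # pages matching the primary keyword
    #   B = {2, 3, 4, 5}    # pages matching the secondary keyword
    #   |A AND B| = |A| + |B| - |A OR B| = 3 + 4 - 5 = 2
    #
    # The same sets also partition the union, which the subsequent assertions
    # check: |A NOT B| + |B NOT A| + |A AND B| = 1 + 2 + 2 = 5 = |A OR B|.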
self.assertTrue("pragmar" in first_xpath_result, f"Should have pragmar in fixture homepage h1 ({first_xpath_result})") combined_resources = crawler.get_resources_api( sites=[site_id], query=self.__PRAGMAR_PRIMARY_KEYWORD, extras=["snippets", "markdown"], limit=1, ) first_result = combined_resources._results[0].to_dict() self.assertIn("extras", first_result, "First result should have extras field") self.assertIn("snippets", first_result["extras"], "First result should have snippets in extras") self.assertIn("markdown", first_result["extras"], "First result should have markdown in extras") self.assertTrue(primary_resources.total <= site_resources.total, "Search should return less than or equivalent results to site total") self.assertTrue(secondary_resources.total <= site_resources.total, "Search should return less than or equivalent results to site total")