# Source code for mcp_server_webcrawl.templates.tests

import re
import unittest

from importlib import resources
from urllib.request import urlopen
from mcp_server_webcrawl.utils.logger import get_logger
from mcp_server_webcrawl.extras.markdown import get_markdown

logger = get_logger()

[docs] class TemplateTests(unittest.TestCase): """ Test suite for the custom HTML to markdown converter. Why custom? It's a bit faster, that is the only reason. Maximum load is 100 transforms (1 per result for a max result of 100), so speed matters. A default set is 20. This converter does a few things differently to tailor to LLM interaction. * aggressively removes images (html2text selectively renders) * links with block decendents will render like a <p> (html2text treats as <a><br>) """
[docs] def setUp(self): """ Set up the test environment with fixture data. """ super().setUp()
[docs] def test_core_html(self): core_html: str = resources.read_text("mcp_server_webcrawl.templates", "tests_core.html") markdown = get_markdown(core_html) # h1-6 self.assertIn("# Lorem Ipsum Dolor Sit Amet", markdown) self.assertIn("## Consectetur Adipiscing Elit", markdown) self.assertIn("### Nemo Enim Ipsam Voluptatem", markdown) self.assertIn("#### Sed Quia Non Numquam", markdown) self.assertIn("##### Nisi Ut Aliquid Ex Ea", markdown) self.assertIn("###### At Vero Eos Et Accusamus", markdown) # no content loss - key phrases should be preserved self.assertIn("Lorem ipsum dolor sit amet", markdown) self.assertIn("Definition List Example", markdown) self.assertIn("More Text Elements", markdown) # inline formatting (proper spacing) self.assertIn("amet, **consectetur adipiscing elit**. Sed", markdown) self.assertIn("laborum. **Sed ut perspiciatis** unde", markdown) self.assertIn("consequat. *Duis aute irure dolor* in", markdown) self.assertIn("laudantium. *Totam rem aperiam*, eaque", markdown) # link formatting (proper spacing) self.assertIn("veniam, quis nostrud exercitation ullamco", markdown) # Fragment links as plain text self.assertIn("and a link back to top. Nam", markdown) # list formatting self.assertIn("* Similique sunt in culpa", markdown) self.assertIn("1. 
Temporibus autem quibusdam", markdown) # dl/dt self.assertIn("**Lorem Ipsum**", markdown) self.assertIn(" Dolor sit amet, consectetur adipiscing elit", markdown) self.assertIn("**Ut Enim**", markdown) self.assertIn(" Ad minim veniam, quis nostrud exercitation", markdown) self.assertIn("**Duis Aute**", markdown) self.assertIn(" Irure dolor in reprehenderit in voluptate", markdown) # table structure self.assertIn("| Lorem | Ipsum | Dolor | Sit |", markdown) self.assertIn("|---|---|---|---|", markdown) self.assertIn("| Consectetur | Adipiscing | Elit | Sed |", markdown) # code formatting self.assertIn("Here we have some `inline code` and", markdown) self.assertIn("```\nfunction lorem() {\n return \"ipsum dolor sit amet\";\n}\n```", markdown) # blockquotes self.assertIn("> \"Sed ut perspiciatis unde omnis iste natus", markdown) # horizontal rule self.assertIn("---", markdown) # no double spacing for inline elements self.assertNotIn("** ", markdown) # No double spaces after bold self.assertNotIn(" **", markdown) # No double spaces before bold self.assertNotIn("* ", markdown) # No double spaces after emphasis self.assertNotIn(" *", markdown) # No double spaces before emphasis # structural integrity - count major elements heading_count = len(re.findall(r"^#{1,6} ", markdown, re.MULTILINE)) self.assertEqual(heading_count, 11, "Should have exactly 6 headings") table_count = len(re.findall(r"^\|.*\|$", markdown, re.MULTILINE)) self.assertGreater(table_count, 5, "Should have multiple table rows")