scripts/compare.py

#!/usr/bin/env python3
"""Compare Astro preview site content against original scraped HTML."""
import os, re, sys, urllib.request, html

PREVIEW_BASE = "https://componentowl-preview.pages.dev"
ORIGINAL_DIR = "/tmp/cowl-site/www.componentowl.com"

def strip_html(text):
    """Reduce an HTML document to its visible text.

    Removes ``<script>`` and ``<style>`` elements together with their
    contents, then HTML comments, then every remaining tag. Entities are
    decoded afterwards and runs of whitespace are collapsed to one space.

    Args:
        text: Raw HTML markup.

    Returns:
        The remaining text, without leading or trailing whitespace.
    """
    # HTML tag names are case-insensitive (e.g. <SCRIPT>), and the closing
    # tag may carry whitespace before '>'. Match both so that code and CSS
    # are never counted as page text.
    block_flags = re.DOTALL | re.IGNORECASE
    text = re.sub(r'<script[^>]*>.*?</script\s*>', '', text, flags=block_flags)
    text = re.sub(r'<style[^>]*>.*?</style\s*>', '', text, flags=block_flags)
    # Comments go after script/style (so a "<!--" inside a script cannot
    # swallow real content) and before the generic tag pass (a '>' inside a
    # comment would otherwise end the tag match early and leak its tail).
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Replace tags with a space so adjacent words in separate elements do
    # not get glued together.
    text = re.sub(r'<[^>]+>', ' ', text)
    text = html.unescape(text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def extract_links(text):
    """Collect the distinct ``href`` attribute values found in HTML markup.

    Args:
        text: Raw HTML markup.

    Returns:
        A set of the non-empty href values exactly as written in the markup;
        no entity decoding or URL normalisation is applied.
    """
    # HTML permits either quote style, whitespace around '=', and any letter
    # case for the attribute name; accept all of them so hand-written legacy
    # markup and generated markup are compared on equal terms.
    pattern = r'''href\s*=\s*(?:"([^"]+)"|'([^']+)')'''
    # Exactly one of the two capture groups is non-empty for each match.
    return {
        double_quoted or single_quoted
        for double_quoted, single_quoted in re.findall(pattern, text, flags=re.IGNORECASE)
    }

def fetch_url(url):
    """Download *url* and return its body as text, or ``None`` on failure.

    The request is sent with a browser-like ``User-Agent`` header and a
    10-second timeout. The body is decoded as UTF-8; undecodable bytes are
    dropped.

    Args:
        url: The URL to fetch.

    Returns:
        The decoded response body, or ``None`` if the request could not be
        built or completed (malformed URL, HTTP error status, network
        error, timeout, ...).
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=10) as r:
            return r.read().decode('utf-8', errors='ignore')
    except Exception:
        # Deliberately best-effort: callers treat None as "page not
        # available" and may retry with another URL. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so the script can still be interrupted.
        return None

# Site-relative paths of the pages to compare; '' stands for the site root.
pages = [
    # Home and product pages
    '',
    'better-listview',
    'better-thumbnail-browser',
    'better-splitbutton',
    'better-listview-express',
    # Content sections
    'blog',
    'articles',
    'sitemap',
    # Company and legal pages
    'about-us',
    'support',
    'eula',
    'privacy-policy',
    'disclaimer',
    # Pricing pages
    'pricing-licensing/better-listview',
    'pricing-licensing/better-thumbnail-browser',
    'pricing-licensing/better-splitbutton',
    # Miscellaneous
    'comics',
]

# Report header; one row per page is printed by the loop below.
print(f"{'Page':<45} | {'Orig':>6} | {'Preview':>7} | {'Diff':>6} | {'Links ±':>8}")
print("-" * 85)

# Human-readable problems collected across all pages, listed after the table.
issues = []
for page in pages:
    # The site root ('') is stored as "index" in the scraped copy; the same
    # name doubles as the row label.
    orig_file = page if page else 'index'

    # The scrape may hold a page either as <name>.html or <name>/index.html.
    orig_path = os.path.join(ORIGINAL_DIR, orig_file + '.html')
    if not os.path.exists(orig_path):
        orig_path = os.path.join(ORIGINAL_DIR, orig_file, 'index.html')

    if not os.path.exists(orig_path):
        print(f"{orig_file:<45} | {'NORIG':>6} |")
        continue

    # Decode the original exactly like fetch_url() decodes the preview
    # (UTF-8, undecodable bytes dropped) so the result does not depend on the
    # platform's default encoding and a stray byte cannot abort the run.
    with open(orig_path, encoding='utf-8', errors='ignore') as fh:
        orig_html = fh.read()
    ot = strip_html(orig_html)
    ol = len(ot)

    # Try the clean URL first, then the explicit .html file. For the site
    # root the fallback is /index.html rather than the meaningless "/.html".
    preview_html = fetch_url(f"{PREVIEW_BASE}/{page}") or fetch_url(f"{PREVIEW_BASE}/{orig_file}.html")
    if not preview_html:
        print(f"{orig_file:<45} | {ol:>6} | {'MISS':>7} |")
        issues.append(f"MISSING: {orig_file}")
        continue

    pt = strip_html(preview_html)
    pl = len(pt)
    # Relative change in visible-text length, in percent; max() guards
    # against division by zero for an empty original.
    diff = ((pl - ol) / max(ol, 1)) * 100

    # Number of hrefs present in the original but absent from the preview.
    oh = extract_links(orig_html)
    ph = extract_links(preview_html)
    ml = len(oh - ph)

    flag = ""
    if diff < -20:
        # More than 20% of the visible text is gone.
        flag = " ⚠️"
        issues.append(f"LOSS {diff:.0f}%: {orig_file}")
    if ml > 5:
        # More than five original links are missing.
        flag += " 🔗"
        issues.append(f"LINKS -{ml}: {orig_file}")

    print(f"{orig_file:<45} | {ol:>6} | {pl:>7} | {diff:>+5.0f}% | {-ml:>+7}{flag}")

print()
if issues:
    print(f"ISSUES ({len(issues)}):")
    for i in issues:
        print(f"  {i}")
else:
    print("All pages look good!")
Fix content loss on critical/moderate pages 2026-03-24 04:03:11 +00:00			`#!/usr/bin/env python3`
			`"""Compare Astro preview site content against original scraped HTML."""`
			`import os, re, sys, urllib.request, html`

			`PREVIEW_BASE = "https://componentowl-preview.pages.dev"`
			`ORIGINAL_DIR = "/tmp/cowl-site/www.componentowl.com"`

			`def strip_html(text):`
			`text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL)`
			`text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL)`
			`text = re.sub(r'<[^>]+>', ' ', text)`
			`text = html.unescape(text)`
			`text = re.sub(r'\s+', ' ', text).strip()`
			`return text`

			`def extract_links(text):`
			`hrefs = set(re.findall(r'href="([^"]+)"', text))`
			`return hrefs`

			`def fetch_url(url):`
			`try:`
			`req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})`
			`with urllib.request.urlopen(req, timeout=10) as r:`
			`return r.read().decode('utf-8', errors='ignore')`
			`except:`
			`return None`

			`pages = [`
			`'', 'better-listview', 'better-thumbnail-browser', 'better-splitbutton',`
			`'better-listview-express', 'blog', 'articles', 'sitemap',`
			`'about-us', 'support', 'eula', 'privacy-policy', 'disclaimer',`
			`'pricing-licensing/better-listview', 'pricing-licensing/better-thumbnail-browser',`
			`'pricing-licensing/better-splitbutton', 'comics'`
			`]`

			`print(f"{'Page':<45} \| {'Orig':>6} \| {'Preview':>7} \| {'Diff':>6} \| {'Links ±':>8}")`
			`print("-" * 85)`

			`issues = []`
			`for page in pages:`
			`orig_file = page if page else 'index'`
			`orig_path = os.path.join(ORIGINAL_DIR, orig_file + '.html')`
			`if not os.path.exists(orig_path):`
			`orig_path = os.path.join(ORIGINAL_DIR, orig_file, 'index.html')`

			`if os.path.exists(orig_path):`
			`orig_html = open(orig_path).read()`
			`else:`
			`print(f"{page or 'index':<45} \| {'NORIG':>6} \|")`
			`continue`

			`preview_html = fetch_url(f"{PREVIEW_BASE}/{page}") or fetch_url(f"{PREVIEW_BASE}/{page}.html")`
			`if not preview_html:`
			`print(f"{page or 'index':<45} \| {len(strip_html(orig_html)):>6} \| {'MISS':>7} \|")`
			`issues.append(f"MISSING: {page or 'index'}")`
			`continue`

			`ot = strip_html(orig_html)`
			`pt = strip_html(preview_html)`
			`ol, pl = len(ot), len(pt)`
			`diff = ((pl - ol) / max(ol, 1)) * 100`

			`oh = extract_links(orig_html)`
			`ph = extract_links(preview_html)`
			`ml = len(oh - ph)`

			`flag = ""`
			`if diff < -20:`
			`flag = " ⚠️"`
			`issues.append(f"LOSS {diff:.0f}%: {page or 'index'}")`
			`if ml > 5:`
			`flag += " 🔗"`
			`issues.append(f"LINKS -{ml}: {page or 'index'}")`

			`print(f"{page or 'index':<45} \| {ol:>6} \| {pl:>7} \| {diff:>+5.0f}% \| {-ml:>+7}{flag}")`

			`print()`
			`if issues:`
			`print(f"ISSUES ({len(issues)}):")`
			`for i in issues: print(f"  {i}")`
			`else:`
			`print("All pages look good!")`