#!/usr/bin/env python3
"""Compare Astro preview site content against original scraped HTML.

For every page listed in ``pages`` the script loads the scraped original from
``ORIGINAL_DIR``, fetches the same page from ``PREVIEW_BASE``, and prints one
table row with the visible-text length of both versions, the relative length
difference, and the number of ``href`` targets present in the original but
absent from the preview.  Pages whose preview lost more than 20% of the text
or more than 5 links are collected and listed as issues at the end.
"""

import html
import http.client
import os
import re
import sys  # noqa: F401  (kept from the original import list)
import urllib.request

PREVIEW_BASE = "https://componentowl-preview.pages.dev"
ORIGINAL_DIR = "/tmp/cowl-site/www.componentowl.com"

# A page is flagged when its preview text is more than 20% shorter than the
# original, or when more than 5 original links are absent from the preview.
_LOSS_THRESHOLD_PCT = -20
_MISSING_LINKS_THRESHOLD = 5

# Compiled once; every pattern is applied to both versions of every page.
_SCRIPT_RE = re.compile(r'<script\b[^>]*>.*?</script\s*>', re.DOTALL | re.IGNORECASE)
_STYLE_RE = re.compile(r'<style\b[^>]*>.*?</style\s*>', re.DOTALL | re.IGNORECASE)
_TAG_RE = re.compile(r'<[^>]+>')
_WHITESPACE_RE = re.compile(r'\s+')
_HREF_RE = re.compile(r'href="([^"]+)"')


def strip_html(text):
    """Return the visible text of an HTML document.

    ``<script>`` and ``<style>`` elements are removed together with their
    contents, every remaining tag is replaced by a space, character
    references are unescaped, and runs of whitespace are collapsed to a
    single space with leading/trailing whitespace trimmed.

    Args:
        text: HTML markup as a string.

    Returns:
        The plain-text content as a single-line string.
    """
    # Drop script/style bodies first so JS and CSS source is not counted as
    # page text once the tags themselves are stripped.
    text = _SCRIPT_RE.sub('', text)
    text = _STYLE_RE.sub('', text)
    # Replace tags with a space so adjacent words in neighbouring elements do
    # not get glued together.
    text = _TAG_RE.sub(' ', text)
    text = html.unescape(text)
    return _WHITESPACE_RE.sub(' ', text).strip()


def extract_links(text):
    """Return the set of double-quoted ``href`` attribute values in *text*."""
    return set(_HREF_RE.findall(text))


def fetch_url(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Absolute URL to request.

    Returns:
        The decoded body (undecodable bytes are dropped), or ``None`` when
        the request fails for any network, HTTP, or URL-parsing reason.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=10) as r:
            return r.read().decode('utf-8', errors='ignore')
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError, timeouts and socket failures;
        # ValueError covers malformed URLs; HTTPException covers protocol
        # errors such as truncated responses.  Interrupts (Ctrl-C) propagate.
        return None


pages = [
    '',
    'better-listview',
    'better-thumbnail-browser',
    'better-splitbutton',
    'better-listview-express',
    'blog',
    'articles',
    'sitemap',
    'about-us',
    'support',
    'eula',
    'privacy-policy',
    'disclaimer',
    'pricing-licensing/better-listview',
    'pricing-licensing/better-thumbnail-browser',
    'pricing-licensing/better-splitbutton',
    'comics',
]


def _read_original(page):
    """Return the scraped HTML for *page*, or ``None`` if no file exists.

    Looks for ``<page>.html`` first, then ``<page>/index.html``, both under
    ``ORIGINAL_DIR``.  The empty page name maps to ``index``.
    """
    name = page if page else 'index'
    candidates = (
        os.path.join(ORIGINAL_DIR, name + '.html'),
        os.path.join(ORIGINAL_DIR, name, 'index.html'),
    )
    for path in candidates:
        if os.path.isfile(path):
            # Decode the same way as the fetched preview so both sides are
            # measured consistently regardless of the platform locale.
            with open(path, encoding='utf-8', errors='ignore') as f:
                return f.read()
    return None


def _fetch_preview(page):
    """Return the preview HTML for *page*, trying the bare and ``.html`` URL."""
    return fetch_url(f"{PREVIEW_BASE}/{page}") or fetch_url(f"{PREVIEW_BASE}/{page}.html")


def main():
    """Print the per-page comparison table followed by an issue summary."""
    print(f"{'Page':<45} | {'Orig':>6} | {'Preview':>7} | {'Diff':>6} | {'Links ±':>8}")
    print("-" * 85)

    issues = []
    for page in pages:
        label = page or 'index'

        orig_html = _read_original(page)
        if orig_html is None:
            print(f"{label:<45} | {'NORIG':>6} |")
            continue

        orig_text = strip_html(orig_html)
        preview_html = _fetch_preview(page)
        if not preview_html:
            print(f"{label:<45} | {len(orig_text):>6} | {'MISS':>7} |")
            issues.append(f"MISSING: {label}")
            continue

        preview_text = strip_html(preview_html)
        ol, pl = len(orig_text), len(preview_text)
        # Relative change in visible-text length; max() guards an empty original.
        diff = ((pl - ol) / max(ol, 1)) * 100
        # Links present in the original but absent from the preview.
        ml = len(extract_links(orig_html) - extract_links(preview_html))

        flag = ""
        if diff < _LOSS_THRESHOLD_PCT:
            flag = " ⚠️"
            issues.append(f"LOSS {diff:.0f}%: {label}")
        if ml > _MISSING_LINKS_THRESHOLD:
            flag += " 🔗"
            issues.append(f"LINKS -{ml}: {label}")

        print(f"{label:<45} | {ol:>6} | {pl:>7} | {diff:>+5.0f}% | {-ml:>+7}{flag}")

    print()
    if issues:
        print(f"ISSUES ({len(issues)}):")
        for i in issues:
            print(f" {i}")
    else:
        print("All pages look good!")


if __name__ == "__main__":
    main()