Fix content loss on critical/moderate pages

2026-03-24 04:03:11 +00:00
parent d9a8593a29
commit 30aad962fc
9 changed files with 707 additions and 612 deletions
--- a/scripts/compare.py
+++ b/scripts/compare.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""Compare Astro preview site content against original scraped HTML."""
+import os, re, sys, urllib.request, html
+
+# Root URL of the preview site; each page path is appended to it when fetching.
+PREVIEW_BASE = "https://componentowl-preview.pages.dev"
+# Local directory holding the original scraped HTML. A page is looked up
+# there as <page>.html first, then as <page>/index.html.
+ORIGINAL_DIR = "/tmp/cowl-site/www.componentowl.com"
+
+def strip_html(text):
+    """Reduce an HTML document to its visible text.
+
+    Args:
+        text: Raw HTML markup.
+
+    Returns:
+        The text content with comments, ``<script>``/``<style>`` bodies and
+        all tags removed, entities decoded, and every whitespace run
+        collapsed to a single space (leading/trailing whitespace stripped).
+    """
+    flags = re.DOTALL | re.IGNORECASE
+    # Comments go first: commented-out markup contains '>' and would
+    # otherwise leak its text (and a stray '-->') through the tag regex.
+    text = re.sub(r'<!--.*?-->', '', text, flags=flags)
+    # Tag names are case-insensitive in HTML, so <SCRIPT>/<STYLE> must match.
+    text = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', text, flags=flags)
+    text = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', text, flags=flags)
+    # Replace tags with a space so words in adjacent elements don't fuse.
+    text = re.sub(r'<[^>]+>', ' ', text)
+    text = html.unescape(text)
+    return re.sub(r'\s+', ' ', text).strip()
+
+def extract_links(text):
+    """Collect the distinct ``href`` targets found in an HTML string.
+
+    Args:
+        text: Raw HTML markup.
+
+    Returns:
+        A set of the non-empty ``href`` attribute values, exactly as written
+        in the markup (no entity decoding or URL normalisation).
+    """
+    # Accept either quote style and any attribute-name case; matching only
+    # lower-case href="..." silently dropped links written as href='...'
+    # or HREF="...".
+    pattern = r'''href\s*=\s*(?:"([^"]+)"|'([^']+)')'''
+    matches = re.findall(pattern, text, flags=re.IGNORECASE)
+    # Exactly one of the two groups is non-empty for each match.
+    return {double_quoted or single_quoted
+            for double_quoted, single_quoted in matches}
+
+def fetch_url(url):
+    """Download a URL and return its body as text, or ``None`` on failure.
+
+    Args:
+        url: Absolute URL to request.
+
+    Returns:
+        The response body decoded as UTF-8 (undecodable bytes are dropped),
+        or ``None`` if the request could not be completed for any reason
+        (malformed URL, HTTP error status, network error, timeout).
+    """
+    # A browser-like User-Agent is sent with every request.
+    req_headers = {"User-Agent": "Mozilla/5.0"}
+    try:
+        req = urllib.request.Request(url, headers=req_headers)
+        with urllib.request.urlopen(req, timeout=10) as r:
+            return r.read().decode('utf-8', errors='ignore')
+    except Exception:
+        # Best-effort fetch: callers treat None as "page missing". Catching
+        # Exception rather than using a bare except lets Ctrl-C
+        # (KeyboardInterrupt) and SystemExit still stop the script.
+        return None
+
+# Site-relative page paths to compare; '' stands for the home page.
+pages = [
+    '', 'better-listview', 'better-thumbnail-browser', 'better-splitbutton',
+    'better-listview-express', 'blog', 'articles', 'sitemap',
+    'about-us', 'support', 'eula', 'privacy-policy', 'disclaimer',
+    'pricing-licensing/better-listview', 'pricing-licensing/better-thumbnail-browser',
+    'pricing-licensing/better-splitbutton', 'comics'
+]
+
+print(f"{'Page':<45} | {'Orig':>6} | {'Preview':>7} | {'Diff':>6} | {'Links ±':>8}")
+print("-" * 85)
+
+# Human-readable problem descriptions, printed as a summary after the table.
+issues = []
+for page in pages:
+    # Name used both for the on-disk lookup and for display.
+    label = page or 'index'
+
+    # The scrape stores a page either as <name>.html or <name>/index.html.
+    orig_path = os.path.join(ORIGINAL_DIR, label + '.html')
+    if not os.path.exists(orig_path):
+        orig_path = os.path.join(ORIGINAL_DIR, label, 'index.html')
+
+    if not os.path.exists(orig_path):
+        print(f"{label:<45} | {'NORIG':>6} |")
+        continue
+
+    # Decode the same way fetch_url() does (UTF-8, undecodable bytes dropped)
+    # so both sides are measured alike and a stray byte cannot crash the run;
+    # the context manager closes the handle instead of leaking it.
+    with open(orig_path, encoding='utf-8', errors='ignore') as orig_file:
+        orig_html = orig_file.read()
+
+    # Try the extension-less URL first, then the explicit .html file. Using
+    # the label for the fallback makes the home page retry /index.html
+    # rather than the meaningless /.html.
+    preview_html = fetch_url(f"{PREVIEW_BASE}/{page}") or fetch_url(f"{PREVIEW_BASE}/{label}.html")
+    if not preview_html:
+        print(f"{label:<45} | {len(strip_html(orig_html)):>6} | {'MISS':>7} |")
+        issues.append(f"MISSING: {label}")
+        continue
+
+    orig_len = len(strip_html(orig_html))
+    preview_len = len(strip_html(preview_html))
+    # Relative change in visible-text length, in percent; max() guards
+    # against division by zero for an empty original.
+    diff = ((preview_len - orig_len) / max(orig_len, 1)) * 100
+
+    # Number of hrefs present in the original but absent from the preview.
+    missing_links = len(extract_links(orig_html) - extract_links(preview_html))
+
+    flag = ""
+    if diff < -20:
+        # Preview text is more than 20% shorter than the original.
+        flag = " ⚠️"
+        issues.append(f"LOSS {diff:.0f}%: {label}")
+    if missing_links > 5:
+        flag += " 🔗"
+        issues.append(f"LINKS -{missing_links}: {label}")
+
+    print(f"{label:<45} | {orig_len:>6} | {preview_len:>7} | {diff:>+5.0f}% | {-missing_links:>+7}{flag}")
+
+print()
+if issues:
+    print(f"ISSUES ({len(issues)}):")
+    for issue in issues:
+        print(f"  {issue}")
+else:
+    print("All pages look good!")