Fix content loss on critical/moderate pages

This commit is contained in:
Jarvis Prime
2026-03-24 04:03:11 +00:00
parent d9a8593a29
commit 30aad962fc
9 changed files with 707 additions and 612 deletions

82
scripts/compare.py Normal file
View File

@@ -0,0 +1,82 @@
#!/usr/bin/env python3
"""Compare Astro preview site content against original scraped HTML."""
import os, re, sys, urllib.request, html
PREVIEW_BASE = "https://componentowl-preview.pages.dev"
ORIGINAL_DIR = "/tmp/cowl-site/www.componentowl.com"
def strip_html(text):
    """Reduce an HTML document to its visible text with normalized whitespace.

    Args:
        text: Raw HTML markup as a string.

    Returns:
        The text with ``<script>``/``<style>`` elements, HTML comments and
        all remaining tags removed, character references decoded, and every
        run of whitespace collapsed to a single space (leading and trailing
        whitespace stripped).
    """
    # Tag names are case-insensitive in HTML, and a closing tag may carry
    # whitespace before ">", so match both forms; otherwise the JS/CSS source
    # of "<SCRIPT>"/"<STYLE>" blocks would be counted as page text.
    flags = re.DOTALL | re.IGNORECASE
    text = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', text, flags=flags)
    text = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', text, flags=flags)
    # Comments are removed as a unit: the generic tag pattern below stops at
    # the first ">" and would leak the rest of a comment that contains markup.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Replace tags with a space so adjacent words do not get glued together.
    text = re.sub(r'<[^>]+>', ' ', text)
    text = html.unescape(text)
    return re.sub(r'\s+', ' ', text).strip()
def extract_links(text):
    """Collect the distinct ``href`` attribute values found in HTML markup.

    Args:
        text: Raw HTML markup as a string.

    Returns:
        A set of link targets. Both double- and single-quoted attribute
        values are recognized, the attribute name is matched
        case-insensitively, whitespace around ``=`` is tolerated, and
        character references in the value are decoded so that
        ``a?x=1&amp;y=2`` and ``a?x=1&y=2`` count as the same link.
    """
    pattern = r'''href\s*=\s*(?:"([^"]+)"|'([^']+)')'''
    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    # Exactly one of the two capture groups is non-empty for each match.
    return {html.unescape(double or single) for double, single in matches}
def fetch_url(url):
    """Download *url* and return its body as text, or ``None`` on failure.

    The request is sent with a browser-like ``User-Agent`` header and a
    10-second timeout. The body is decoded as UTF-8; undecodable bytes are
    dropped rather than raising.

    Args:
        url: The URL to fetch.

    Returns:
        The decoded response body, or ``None`` if the URL is malformed, the
        request fails (network error, timeout, HTTP error status), or the
        response cannot be read.
    """
    # Local import keeps this function self-contained: the module-level
    # imports do not bind the ``http`` name.
    import http.client

    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=10) as r:
            return r.read().decode('utf-8', errors='ignore')
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError and socket timeouts; ValueError
        # covers malformed URLs; HTTPException covers protocol-level failures
        # such as truncated responses. Unlike a bare ``except``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
# Page slugs to compare, relative to the site root. The empty slug stands
# for the home page; the comparison loop maps it to "index".
pages = [
    '',
    'better-listview',
    'better-thumbnail-browser',
    'better-splitbutton',
    'better-listview-express',
    'blog',
    'articles',
    'sitemap',
    'about-us',
    'support',
    'eula',
    'privacy-policy',
    'disclaimer',
    'pricing-licensing/better-listview',
    'pricing-licensing/better-thumbnail-browser',
    'pricing-licensing/better-splitbutton',
    'comics',
]
# Report: for every page, compare the visible-text length and the set of
# href targets of the scraped original against the live preview, print one
# table row per page, and finish with a summary of flagged issues.
print(f"{'Page':<45} | {'Orig':>6} | {'Preview':>7} | {'Diff':>6} | {'Links ±':>8}")
print("-" * 85)
issues = []
for page in pages:
    # The empty slug is the site root; it is stored and reported as "index".
    label = page or 'index'

    # The scrape may hold a page as "<slug>.html" or as "<slug>/index.html".
    orig_path = os.path.join(ORIGINAL_DIR, label + '.html')
    if not os.path.exists(orig_path):
        orig_path = os.path.join(ORIGINAL_DIR, label, 'index.html')
    if not os.path.exists(orig_path):
        print(f"{label:<45} | {'NORIG':>6} |")
        continue

    # Decode exactly like fetch_url() does (UTF-8, undecodable bytes dropped)
    # so both sides are measured the same way and a stray byte in the scrape
    # cannot abort the run; the context manager closes the file promptly.
    with open(orig_path, encoding='utf-8', errors='ignore') as fh:
        orig_html = fh.read()

    # Try the extension-less URL first, then fall back to "<slug>.html".
    preview_html = fetch_url(f"{PREVIEW_BASE}/{page}") or fetch_url(f"{PREVIEW_BASE}/{page}.html")
    if not preview_html:
        print(f"{label:<45} | {len(strip_html(orig_html)):>6} | {'MISS':>7} |")
        issues.append(f"MISSING: {label}")
        continue

    orig_len = len(strip_html(orig_html))
    preview_len = len(strip_html(preview_html))
    # Relative change in visible-text length, in percent; max() guards
    # against division by zero for an empty original.
    diff = ((preview_len - orig_len) / max(orig_len, 1)) * 100

    # Links present in the original but absent from the preview.
    missing_links = len(extract_links(orig_html) - extract_links(preview_html))

    flag = ""
    if diff < -20:  # preview lost more than 20% of the original text
        flag = " ⚠️"
        issues.append(f"LOSS {diff:.0f}%: {label}")
    if missing_links > 5:  # more than five original links are gone
        flag += " 🔗"
        issues.append(f"LINKS -{missing_links}: {label}")
    print(f"{label:<45} | {orig_len:>6} | {preview_len:>7} | {diff:>+5.0f}% | {-missing_links:>+7}{flag}")

print()
if issues:
    print(f"ISSUES ({len(issues)}):")
    for issue in issues:
        print(f" {issue}")
else:
    print("All pages look good!")