Fix content loss on critical/moderate pages
This commit is contained in:
82
scripts/compare.py
Normal file
82
scripts/compare.py
Normal file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare Astro preview site content against original scraped HTML."""
|
||||
import os, re, sys, urllib.request, html
|
||||
|
||||
# Base URL of the preview site; each page path is appended to it when fetching.
PREVIEW_BASE = "https://componentowl-preview.pages.dev"
|
||||
# Local directory holding the original scraped HTML files used as the baseline.
ORIGINAL_DIR = "/tmp/cowl-site/www.componentowl.com"
|
||||
|
||||
def strip_html(text):
    """Reduce an HTML document to its visible text.

    Script and style elements are removed together with their contents,
    HTML comments are dropped, remaining tags are replaced by a space,
    character references are decoded and runs of whitespace are collapsed.

    Args:
        text: Raw HTML markup.

    Returns:
        The text content as a single line with no leading or trailing
        whitespace.
    """
    flags = re.DOTALL | re.IGNORECASE
    # Tag names are case-insensitive in HTML and the closing tag may carry
    # whitespace before '>', so match <SCRIPT> ... </script > as well.
    text = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', text, flags=flags)
    text = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', text, flags=flags)
    # Comments may contain '>' which the generic tag pattern below would stop
    # at, leaving the rest of the comment behind as text. Done after script
    # removal so a stray "<!--" inside a script cannot swallow page content.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Replace tags with a space so adjacent elements do not fuse into one word.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode entities only after tags are gone so "&lt;b&gt;" stays as text.
    text = html.unescape(text)
    return re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
def extract_links(text):
    """Collect the distinct ``href`` targets found in HTML markup.

    Both double- and single-quoted attribute values are recognised, the
    attribute name is matched case-insensitively, and whitespace around
    ``=`` is tolerated. Attributes that merely end in "href" (for example
    ``data-href``) are not treated as links. Character references in the
    value are decoded so that ``a?x=1&amp;y=2`` and ``a?x=1&y=2`` compare
    equal.

    Args:
        text: Raw HTML markup.

    Returns:
        A set of non-empty href values.
    """
    pattern = r'''(?<![\w-])href\s*=\s*(?:"([^"]+)"|'([^']+)')'''
    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    # Exactly one of the two groups is non-empty for every match.
    return {html.unescape(double or single) for double, single in matches}
|
||||
|
||||
def fetch_url(url, timeout=10):
    """Download *url* and return its body as text, or ``None`` on failure.

    The request is sent with a browser-like User-Agent header. The body is
    decoded as UTF-8 and undecodable bytes are dropped.

    Args:
        url: Absolute URL to fetch.
        timeout: Timeout in seconds passed to ``urlopen`` (default 10).

    Returns:
        The decoded response body, or ``None`` if the request could not be
        built or completed for any reason.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            return r.read().decode('utf-8', errors='ignore')
    except Exception:
        # Deliberately best-effort: any fetch problem means "page unavailable".
        # Catching Exception rather than using a bare except keeps Ctrl-C
        # (KeyboardInterrupt) and SystemExit working during a long run.
        return None
|
||||
|
||||
# Site-relative paths of the pages to compare; '' stands for the site root.
pages = [
    '', 'better-listview', 'better-thumbnail-browser', 'better-splitbutton',
    'better-listview-express', 'blog', 'articles', 'sitemap',
    'about-us', 'support', 'eula', 'privacy-policy', 'disclaimer',
    'pricing-licensing/better-listview', 'pricing-licensing/better-thumbnail-browser',
    'pricing-licensing/better-splitbutton', 'comics'
]

print(f"{'Page':<45} | {'Orig':>6} | {'Preview':>7} | {'Diff':>6} | {'Links ±':>8}")
print("-" * 85)

# Human-readable problem descriptions, printed as a summary at the end.
issues = []
for page in pages:
    label = page or 'index'

    # A page may be stored as "<page>.html" or as "<page>/index.html".
    orig_path = os.path.join(ORIGINAL_DIR, label + '.html')
    if not os.path.exists(orig_path):
        orig_path = os.path.join(ORIGINAL_DIR, label, 'index.html')

    if not os.path.exists(orig_path):
        print(f"{label:<45} | {'NORIG':>6} |")
        continue

    # Decode the same way fetch_url decodes the preview (UTF-8, bad bytes
    # dropped) so both sides are measured alike and a stray byte in the
    # scraped file cannot abort the run. The context manager closes the file.
    with open(orig_path, encoding='utf-8', errors='ignore') as f:
        orig_html = f.read()
    orig_text = strip_html(orig_html)
    orig_len = len(orig_text)

    # Try the clean URL first, then the ".html" variant. Using the label for
    # the fallback makes the root page fall back to "/index.html", not "/.html".
    preview_html = (
        fetch_url(f"{PREVIEW_BASE}/{page}")
        or fetch_url(f"{PREVIEW_BASE}/{label}.html")
    )
    if not preview_html:
        print(f"{label:<45} | {orig_len:>6} | {'MISS':>7} |")
        issues.append(f"MISSING: {label}")
        continue

    preview_len = len(strip_html(preview_html))
    # Relative change in visible text length, in percent of the original;
    # max(..., 1) guards against an empty original page.
    diff = ((preview_len - orig_len) / max(orig_len, 1)) * 100

    # Number of links present in the original but absent from the preview.
    missing_links = len(extract_links(orig_html) - extract_links(preview_html))

    flag = ""
    if diff < -20:
        # More than 20% of the text is gone: flag as content loss.
        flag = " ⚠️"
        issues.append(f"LOSS {diff:.0f}%: {label}")
    if missing_links > 5:
        flag += " 🔗"
        issues.append(f"LINKS -{missing_links}: {label}")

    print(f"{label:<45} | {orig_len:>6} | {preview_len:>7} | {diff:>+5.0f}% | {-missing_links:>+7}{flag}")

print()
if issues:
    print(f"ISSUES ({len(issues)}):")
    for issue in issues:
        print(f" {issue}")
else:
    print("All pages look good!")
|
||||
Reference in New Issue
Block a user