83 lines
2.7 KiB
Python
83 lines
2.7 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Compare Astro preview site content against original scraped HTML."""
|
||
|
|
import os, re, sys, urllib.request, html
|
||
|
|
|
||
|
|
# Base URL of the Astro preview site; page paths are appended to it when fetching.
PREVIEW_BASE = "https://componentowl-preview.pages.dev"
|
||
|
|
# Local directory holding the original scraped HTML files the preview is compared against.
ORIGINAL_DIR = "/tmp/cowl-site/www.componentowl.com"
|
||
|
|
|
||
|
|
def strip_html(text):
    """Return the visible text of an HTML document as a single normalized line.

    Script and style elements are dropped together with their contents,
    HTML comments and all remaining tags are replaced by a space, character
    references are decoded, and runs of whitespace are collapsed.

    Args:
        text: Raw HTML markup.

    Returns:
        The remaining text with single spaces between words and no leading
        or trailing whitespace.
    """
    flags = re.DOTALL | re.IGNORECASE
    # Tag names are case-insensitive in HTML and a closing tag may carry
    # whitespace before '>', so match <SCRIPT> ... </script > as well.
    text = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', text, flags=flags)
    text = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', text, flags=flags)
    # A comment may contain '>', which would cut the generic tag pattern
    # short and leak the rest of the comment into the text.
    text = re.sub(r'<!--.*?-->', ' ', text, flags=re.DOTALL)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = html.unescape(text)
    return re.sub(r'\s+', ' ', text).strip()
|
||
|
|
|
||
|
|
def extract_links(text):
    """Return the set of distinct ``href`` attribute values found in *text*.

    Both double- and single-quoted values are recognised, the attribute
    name is matched case-insensitively, and whitespace around ``=`` is
    allowed. Values are returned exactly as written in the markup (no
    unescaping or URL normalisation).

    Args:
        text: Raw HTML markup.

    Returns:
        A set of href strings; empty when no quoted href is present.
    """
    pattern = r'''href\s*=\s*(?:"([^"]+)"|'([^']+)')'''
    # Exactly one of the two groups is non-empty for every match.
    return {
        double_quoted or single_quoted
        for double_quoted, single_quoted in re.findall(pattern, text, flags=re.IGNORECASE)
    }
|
||
|
|
|
||
|
|
def fetch_url(url):
    """Fetch *url* and return the response body as text, or ``None`` on failure.

    The request is sent with a browser-like User-Agent header and a
    10-second timeout. The body is decoded as UTF-8 and undecodable bytes
    are dropped.

    Args:
        url: URL to fetch.

    Returns:
        The decoded body, or ``None`` when the URL is malformed, the request
        fails (connection error, timeout, HTTP error status) or the response
        is not valid HTTP.
    """
    from http.client import HTTPException

    req_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        req = urllib.request.Request(url, headers=req_headers)
        with urllib.request.urlopen(req, timeout=10) as r:
            return r.read().decode('utf-8', errors='ignore')
    except (OSError, ValueError, HTTPException):
        # URLError/HTTPError and timeouts are OSError subclasses; ValueError
        # covers malformed URLs. Unlike a bare ``except``, this lets
        # KeyboardInterrupt and SystemExit propagate so the script can be
        # interrupted.
        return None
|
||
|
|
|
||
|
|
# Site-relative page paths to compare; '' stands for the site root (index page).
pages = [
    '', 'better-listview', 'better-thumbnail-browser', 'better-splitbutton',
    'better-listview-express', 'blog', 'articles', 'sitemap',
    'about-us', 'support', 'eula', 'privacy-policy', 'disclaimer',
    'pricing-licensing/better-listview', 'pricing-licensing/better-thumbnail-browser',
    'pricing-licensing/better-splitbutton', 'comics',
]

# Report columns: stripped-text length of the original and the preview, the
# relative change in percent, and minus the number of original hrefs that are
# absent from the preview.
print(f"{'Page':<45} | {'Orig':>6} | {'Preview':>7} | {'Diff':>6} | {'Links ±':>8}")
print("-" * 85)

issues = []
for page in pages:
    orig_file = page if page else 'index'

    # The original may be stored as <page>.html or as <page>/index.html.
    orig_path = os.path.join(ORIGINAL_DIR, orig_file + '.html')
    if not os.path.exists(orig_path):
        orig_path = os.path.join(ORIGINAL_DIR, orig_file, 'index.html')

    if not os.path.exists(orig_path):
        print(f"{orig_file:<45} | {'NORIG':>6} |")
        continue

    # Decode the same way fetch_url() does so both sides are comparable and a
    # stray non-UTF-8 byte cannot abort the whole run; the context manager
    # closes the file promptly.
    with open(orig_path, encoding='utf-8', errors='ignore') as fh:
        orig_html = fh.read()

    ot = strip_html(orig_html)
    ol = len(ot)

    # Try the extension-less URL first, then the .html variant. For the site
    # root the fallback is index.html rather than the meaningless "/.html".
    preview_html = (
        fetch_url(f"{PREVIEW_BASE}/{page}")
        or fetch_url(f"{PREVIEW_BASE}/{orig_file}.html")
    )
    if not preview_html:
        print(f"{orig_file:<45} | {ol:>6} | {'MISS':>7} |")
        issues.append(f"MISSING: {orig_file}")
        continue

    pt = strip_html(preview_html)
    pl = len(pt)
    # Relative change in visible text length; max() guards an empty original.
    diff = ((pl - ol) / max(ol, 1)) * 100

    oh = extract_links(orig_html)
    ph = extract_links(preview_html)
    # Number of hrefs present in the original but absent from the preview.
    ml = len(oh - ph)

    flag = ""
    if diff < -20:
        # More than 20% of the visible text is gone.
        flag = " ⚠️"
        issues.append(f"LOSS {diff:.0f}%: {orig_file}")
    if ml > 5:
        # More than five original links are missing.
        flag += " 🔗"
        issues.append(f"LINKS -{ml}: {orig_file}")

    print(f"{orig_file:<45} | {ol:>6} | {pl:>7} | {diff:>+5.0f}% | {-ml:>+7}{flag}")

print()
if issues:
    print(f"ISSUES ({len(issues)}):")
    for i in issues:
        print(f" {i}")
else:
    print("All pages look good!")
|