#!/usr/bin/env python3
"""
SEO Audit: Schema/JSON-LD Extractor

Extracts all JSON-LD structured data from every page in the sitemap.
Machine-verified extraction — no AI interpretation.

Usage: python3 schema.py <domain> <sitemap_url> <output_dir>
"""

import xml.etree.ElementTree as ET
import urllib.request
import ssl
import sys
import csv
import time
import re
import json
import os

# Browser-like User-Agent sent with every request.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

# Matches <script type="application/ld+json" ...>BODY</script>; group 1 is BODY.
# Case-insensitive, accepts single/double/no quotes around the type value.
_JSONLD_RE = re.compile(
    r'<script\b[^>]*\btype\s*=\s*["\']?application/ld\+json["\']?[^>]*>'
    r'(.*?)</script\s*>',
    re.DOTALL | re.IGNORECASE,
)


def fetch_url(url, timeout=15):
    """Fetch *url* and return ``(status_code, body_text)``.

    The body is decoded as UTF-8 with undecodable bytes replaced. This is a
    best-effort helper: on any failure (malformed URL, network error, HTTP
    error status, TLS problem) a warning is written to stderr and ``(0, '')``
    is returned instead of raising.
    """
    try:
        # Request() itself raises ValueError on a malformed URL, so it has to
        # live inside the try for the best-effort contract to hold.
        req = urllib.request.Request(url, headers={'User-Agent': _USER_AGENT})
        ctx = ssl.create_default_context()
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            return resp.getcode(), resp.read().decode('utf-8', errors='replace')
    except Exception as exc:  # not a bare except: Ctrl-C must still stop the crawl
        print(f"  Warning: could not fetch {url}: {exc}", file=sys.stderr)
        return 0, ''


def get_sitemap_urls(sitemap_url):
    """Return every page URL listed in the sitemap at *sitemap_url*.

    Sitemap index files are followed recursively. A sitemap that cannot be
    fetched or parsed contributes no URLs (a warning goes to stderr); it does
    not abort the traversal of its siblings.
    """
    return _collect_sitemap_urls(sitemap_url, set())


def _collect_sitemap_urls(sitemap_url, seen):
    """Recursive worker for get_sitemap_urls; *seen* guards against cycles."""
    if sitemap_url in seen:
        return []
    seen.add(sitemap_url)

    try:
        req = urllib.request.Request(sitemap_url, headers={'User-Agent': _USER_AGENT})
        ctx = ssl.create_default_context()
        with urllib.request.urlopen(req, timeout=15, context=ctx) as resp:
            content = resp.read().decode('utf-8')
        # Drop the default namespace so plain tag names work in findall().
        content = re.sub(r'\sxmlns="[^"]+"', '', content, count=1)
        root = ET.fromstring(content)
    except Exception as exc:
        print(f"  Warning: could not read sitemap {sitemap_url}: {exc}", file=sys.stderr)
        return []

    urls = []
    for loc in root.findall('.//sitemap/loc'):
        child = (loc.text or '').strip()  # an empty <loc/> has text None
        if child:
            urls.extend(_collect_sitemap_urls(child, seen))
    for loc in root.findall('.//url/loc'):
        page = (loc.text or '').strip()
        if page:
            urls.append(page)
    return urls


def extract_jsonld(html):
    """Extract all JSON-LD blocks from HTML.

    Returns a list with one entry per schema object found. A block whose
    top-level JSON value is an array contributes one entry per array item.
    A block that is not valid JSON contributes
    ``{'_error': 'invalid_json', '_raw': <first 200 chars>}``.
    """
    schemas = []
    for raw in _JSONLD_RE.findall(html):
        text = raw.strip()
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            schemas.append({'_error': 'invalid_json', '_raw': text[:200]})
            continue
        if isinstance(data, list):
            schemas.extend(data)
        else:
            schemas.append(data)
    return schemas


def flatten_schema(schema):
    """Extract key fields from a schema object.

    Returns a dict with ``type`` (a list-valued ``@type`` is comma-joined,
    ``'unknown'`` if absent), ``name``, ``description`` (stringified and
    truncated to 200 characters) and ``has_error`` (True for the error
    placeholder produced by extract_jsonld). A non-dict value yields the
    defaults with ``has_error`` False.
    """
    if not isinstance(schema, dict):
        return {'type': 'unknown', 'name': '', 'description': '', 'has_error': False}

    schema_type = schema.get('@type', 'unknown')
    if isinstance(schema_type, list):
        schema_type = ', '.join(str(t) for t in schema_type)
    return {
        'type': schema_type,
        'name': schema.get('name', ''),
        'description': str(schema.get('description', ''))[:200],
        'has_error': '_error' in schema,
    }


def _write_csv(path, fieldnames, rows):
    """Write *rows* (a list of dicts) to *path* as UTF-8 CSV with a header."""
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main():
    """Crawl the sitemap, extract JSON-LD from each page and write reports.

    Reads ``<domain> <sitemap_url> <output_dir>`` from the command line and
    writes ``<domain>_schema.csv`` (one row per schema),
    ``<domain>_schema_pages.csv`` (one row per page) and
    ``<domain>_schema_summary.json`` into the output directory. Progress and
    totals go to stderr.
    """
    if len(sys.argv) < 4:
        print("Usage: python3 schema.py <domain> <sitemap_url> <output_dir>", file=sys.stderr)
        sys.exit(1)

    domain = sys.argv[1]
    sitemap_url = sys.argv[2]
    output_dir = sys.argv[3]
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n=== Schema Extraction: {domain} ===", file=sys.stderr)

    sitemap_urls = get_sitemap_urls(sitemap_url)
    print(f"Found {len(sitemap_urls)} pages", file=sys.stderr)

    all_schemas = []
    page_schema_count = []

    for i, url in enumerate(sitemap_urls):
        status, html = fetch_url(url)
        if status != 200:
            page_schema_count.append({'url': url, 'schema_count': 0, 'types': 'page_error'})
            continue

        schemas = extract_jsonld(html)
        types = []
        for schema in schemas:
            flat = flatten_schema(schema)
            all_schemas.append({
                'page_url': url,
                'schema_type': flat['type'],
                'schema_name': flat['name'],
                'description_preview': flat['description'],
                'has_error': flat['has_error'],
                'full_json': json.dumps(schema)[:500],
            })
            types.append(flat['type'])

        page_schema_count.append({
            'url': url,
            'schema_count': len(schemas),
            'types': ', '.join(types) if types else 'none',
        })

        if (i + 1) % 20 == 0:
            print(f"  Processed {i + 1}/{len(sitemap_urls)}...", file=sys.stderr)
        # Short pause between successful fetches to stay polite to the server.
        time.sleep(0.3)

    # Write detailed CSV
    _write_csv(
        os.path.join(output_dir, f'{domain}_schema.csv'),
        ['page_url', 'schema_type', 'schema_name', 'description_preview',
         'has_error', 'full_json'],
        all_schemas,
    )

    # Write page summary
    _write_csv(
        os.path.join(output_dir, f'{domain}_schema_pages.csv'),
        ['url', 'schema_count', 'types'],
        page_schema_count,
    )

    # Count schema types
    type_counts = {}
    for s in all_schemas:
        t = s['schema_type']
        type_counts[t] = type_counts.get(t, 0) + 1

    pages_without = sum(1 for p in page_schema_count if p['schema_count'] == 0)

    print(f"\n  Total schemas: {len(all_schemas)} | Pages without: {pages_without}", file=sys.stderr)
    print(f"  Types: {type_counts}", file=sys.stderr)

    summary = {
        'domain': domain,
        'total_schemas': len(all_schemas),
        'pages_without_schema': pages_without,
        'schema_type_counts': type_counts,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
    }
    with open(os.path.join(output_dir, f'{domain}_schema_summary.json'), 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)


if __name__ == '__main__':
    main()