#!/usr/bin/env python3
"""
SEO Audit: Schema/JSON-LD Extractor
Extracts all JSON-LD structured data from every page in the sitemap.
Machine-verified extraction — no AI interpretation.

Usage: python3 schema.py <domain> <sitemap_url> <output_dir>
"""
import xml.etree.ElementTree as ET
import urllib.request
import ssl
import sys
import csv
import time
import re
import json
import os


def fetch_url(url, timeout=15):
    """Fetch *url* and return ``(status_code, body_text)``.

    The body is decoded as UTF-8 with undecodable bytes replaced, so the
    result is always a ``str``.

    Args:
        url: Absolute URL to request.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        ``(status, text)`` on success. ``(0, '')`` on any failure: malformed
        URL, network/TLS error, timeout, or an HTTP error status (``urlopen``
        raises for those).
    """
    try:
        # Building the Request validates the URL and can raise ValueError,
        # so it must sit inside the try: one bad sitemap entry should not
        # abort the whole crawl.
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        ctx = ssl.create_default_context()
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            return resp.getcode(), resp.read().decode('utf-8', errors='replace')
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except:`` so
        # KeyboardInterrupt / SystemExit still stop the program.
        return 0, ''


def get_sitemap_urls(sitemap_url, _visited=None):
    """Return every page URL listed in a sitemap, following sitemap indexes.

    Args:
        sitemap_url: URL of a ``<urlset>`` sitemap or a ``<sitemapindex>``.
        _visited: Internal set of sitemap URLs already fetched during this
            traversal; callers should leave it unset. It stops a sitemap
            index that references itself (directly or indirectly) from
            being fetched over and over.

    Returns:
        A list of page URLs: first those gathered from nested sitemaps (in
        document order), then the ``<url><loc>`` entries of this document.
        Returns ``[]`` if the sitemap cannot be fetched or parsed.
    """
    if _visited is None:
        _visited = set()
    if sitemap_url in _visited:
        return []
    _visited.add(sitemap_url)

    try:
        req = urllib.request.Request(sitemap_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        ctx = ssl.create_default_context()
        with urllib.request.urlopen(req, timeout=15, context=ctx) as resp:
            # Parse the raw bytes so the XML parser handles the declared
            # encoding and any byte-order mark itself.
            root = ET.fromstring(resp.read())
    except Exception:
        # Best-effort: an unreachable or malformed sitemap yields no URLs.
        return []

    def local_name(elem):
        # '{namespace}tag' -> 'tag'; works whatever namespace (or none)
        # the document declares, and however the xmlns attribute is quoted.
        tag = elem.tag
        return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else ''

    child_sitemaps = []
    page_urls = []
    for entry in root.iter():
        kind = local_name(entry)
        if kind not in ('sitemap', 'url'):
            continue
        for child in entry:
            if local_name(child) != 'loc':
                continue
            # An empty <loc/> has text None; skip it instead of letting it
            # discard every other URL in the document.
            loc = (child.text or '').strip()
            if not loc:
                continue
            if kind == 'sitemap':
                child_sitemaps.append(loc)
            else:
                page_urls.append(loc)

    urls = []
    for child_url in child_sitemaps:
        urls.extend(get_sitemap_urls(child_url, _visited))
    urls.extend(page_urls)
    return urls


# Matches <script ... type="application/ld+json" ...>BODY</script>.
# Case-insensitive; the type attribute may appear anywhere in the tag and be
# double-quoted, single-quoted or unquoted. Requiring whitespace before
# ``type`` avoids matching attributes such as ``data-type``.
_JSONLD_SCRIPT_RE = re.compile(
    r'<script\b[^>]*?\stype\s*=\s*["\']?\s*application/ld\+json\b[^>]*>'
    r'(.*?)</script\s*>',
    re.DOTALL | re.IGNORECASE,
)


def extract_jsonld(html):
    """Extract all JSON-LD blocks from HTML.

    Args:
        html: Page markup as a string.

    Returns:
        A list with one entry per ``application/ld+json`` script tag, in
        document order. Each entry is the parsed JSON value, or
        ``{'_error': 'invalid_json', '_raw': <first 200 chars>}`` when the
        block's content is not valid JSON.
    """
    schemas = []
    for body in _JSONLD_SCRIPT_RE.findall(html):
        raw = body.strip()
        try:
            schemas.append(json.loads(raw))
        except json.JSONDecodeError:
            # Keep a short preview so the bad block can be located later.
            schemas.append({'_error': 'invalid_json', '_raw': raw[:200]})
    return schemas


def flatten_schema(schema):
    """Extract key fields from a parsed JSON-LD value.

    Args:
        schema: A value produced by parsing one JSON-LD block. Normally a
            dict, but a block may also contain a top-level array of nodes
            or, if malformed, a bare scalar.

    Returns:
        A dict with keys:
            ``type``: the ``@type`` as a string (list values joined with
                ``', '``), or ``'unknown'`` when absent.
            ``name``: the ``name`` value, or ``''``.
            ``description``: the description as a string, cut to 200 chars.
            ``has_error``: True when the dict carries an ``_error`` key.
        For a top-level array, ``type`` joins the types of all dict members,
        ``name``/``description`` come from the first dict member and
        ``has_error`` is True if any member has an error.
    """
    if isinstance(schema, list):
        # A JSON-LD block may hold an array of nodes; summarise them
        # instead of failing on the missing .get().
        flats = [flatten_schema(node) for node in schema if isinstance(node, dict)]
        first = flats[0] if flats else {'name': '', 'description': ''}
        return {
            'type': ', '.join(f['type'] for f in flats) if flats else 'unknown',
            'name': first['name'],
            'description': first['description'],
            'has_error': any(f['has_error'] for f in flats),
        }

    if not isinstance(schema, dict):
        # Valid JSON but not an object (string, number, null, ...).
        return {'type': 'unknown', 'name': '', 'description': '', 'has_error': False}

    schema_type = schema.get('@type', 'unknown')
    if isinstance(schema_type, list):
        # str() guards against non-string members breaking the join.
        schema_type = ', '.join(str(t) for t in schema_type)
    elif schema_type is None:
        schema_type = 'unknown'
    else:
        schema_type = str(schema_type)

    return {
        'type': schema_type,
        'name': schema.get('name', ''),
        'description': str(schema.get('description', ''))[:200],
        'has_error': '_error' in schema,
    }


def main():
    """Crawl a sitemap and write JSON-LD extraction reports.

    Reads ``<domain> <sitemap_url> <output_dir>`` from ``sys.argv`` and exits
    with status 1 (after printing usage to stderr) if any is missing.

    Writes three files into ``output_dir``:
        ``<domain>_schema.csv``: one row per JSON-LD block found.
        ``<domain>_schema_pages.csv``: one row per page with its block count.
        ``<domain>_schema_summary.json``: totals and per-type counts.

    Progress and totals are printed to stderr.
    """
    if len(sys.argv) < 4:
        print("Usage: python3 schema.py <domain> <sitemap_url> <output_dir>", file=sys.stderr)
        sys.exit(1)

    domain = sys.argv[1]
    sitemap_url = sys.argv[2]
    output_dir = sys.argv[3]
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n=== Schema Extraction: {domain} ===", file=sys.stderr)

    sitemap_urls = get_sitemap_urls(sitemap_url)
    print(f"Found {len(sitemap_urls)} pages", file=sys.stderr)

    all_schemas = []
    page_schema_count = []

    for i, url in enumerate(sitemap_urls, start=1):
        status, html = fetch_url(url)
        if status == 200:
            schemas = extract_jsonld(html)
            types = []
            for schema in schemas:
                flat = flatten_schema(schema)
                # str() keeps the join and the type tally safe even if a
                # malformed block yields a non-string type.
                schema_type = str(flat['type'])
                all_schemas.append({
                    'page_url': url,
                    'schema_type': schema_type,
                    'schema_name': flat['name'],
                    'description_preview': flat['description'],
                    'has_error': flat['has_error'],
                    'full_json': json.dumps(schema)[:500]
                })
                types.append(schema_type)

            page_schema_count.append({
                'url': url, 'schema_count': len(schemas),
                'types': ', '.join(types) if types else 'none'
            })
        else:
            page_schema_count.append({'url': url, 'schema_count': 0, 'types': 'page_error'})

        # Progress and the politeness delay apply to failed pages too, so a
        # struggling server is not hit faster precisely when it is erroring.
        if i % 20 == 0:
            print(f"  Processed {i}/{len(sitemap_urls)}...", file=sys.stderr)
        time.sleep(0.3)

    # Write detailed CSV. The encoding is explicit so non-ASCII schema text
    # does not depend on (or fail under) the platform's locale encoding.
    with open(os.path.join(output_dir, f'{domain}_schema.csv'), 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['page_url', 'schema_type', 'schema_name', 'description_preview', 'has_error', 'full_json'])
        writer.writeheader()
        writer.writerows(all_schemas)

    # Write page summary
    with open(os.path.join(output_dir, f'{domain}_schema_pages.csv'), 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['url', 'schema_count', 'types'])
        writer.writeheader()
        writer.writerows(page_schema_count)

    # Count schema types
    type_counts = {}
    for s in all_schemas:
        t = s['schema_type']
        type_counts[t] = type_counts.get(t, 0) + 1

    # Pages that failed to load have schema_count 0 and are counted here too.
    pages_without = sum(1 for p in page_schema_count if p['schema_count'] == 0)
    print(f"\n  Total schemas: {len(all_schemas)} | Pages without: {pages_without}", file=sys.stderr)
    print(f"  Types: {type_counts}", file=sys.stderr)

    summary = {
        'domain': domain, 'total_schemas': len(all_schemas),
        'pages_without_schema': pages_without,
        'schema_type_counts': type_counts,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }
    with open(os.path.join(output_dir, f'{domain}_schema_summary.json'), 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)


# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
