#!/usr/bin/env python3
"""
SEO Audit: Hreflang Extractor & Verifier
Extracts ALL hreflang tags from every page in the sitemap, then verifies each alternate URL.
Outputs a machine-verified CSV plus a JSON summary — no AI interpretation.

Usage: python3 hreflang.py <domain> <sitemap_url> <output_dir>
"""
import xml.etree.ElementTree as ET
import urllib.request
import urllib.error
import ssl
import sys
import csv
import time
import re
import json
import os


def fetch_url(url, timeout=15):
    """Fetch ``url`` with a browser-like User-Agent and return ``(status, body)``.

    Args:
        url: URL to request.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        A ``(status, body)`` tuple. ``body`` is the response decoded as UTF-8
        with undecodable bytes replaced. On an HTTP error the tuple is
        ``(error_code, '')``; on any other failure (malformed URL, DNS,
        TLS, timeout, ...) it is ``(0, '')``.
    """
    try:
        # Request() itself raises ValueError for scheme-less/empty URLs, so it
        # must be constructed inside the try block to honour the (0, '') contract.
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        ctx = ssl.create_default_context()
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            return resp.getcode(), resp.read().decode('utf-8', errors='replace')
    except urllib.error.HTTPError as e:
        return e.code, ''
    except Exception:
        # Deliberate best-effort: one unreachable page must not abort the audit.
        return 0, ''


class _NoRedirectHandler(urllib.request.HTTPRedirectHandler):
    """Redirect handler that refuses to follow, so a 3xx surfaces as HTTPError."""

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Returning None tells urllib not to follow; the default error handler
        # then raises HTTPError carrying the original 3xx code.
        return None


def fetch_status_only(url, timeout=15):
    """Return the HTTP status code that ``url`` itself answers with.

    Redirects are NOT followed: a redirecting URL yields its own 3xx code
    rather than the status of the final destination.

    Args:
        url: URL to request.
        timeout: Socket timeout in seconds.

    Returns:
        The integer HTTP status code (including 3xx/4xx/5xx), or ``0`` when
        the request could not be completed at all (malformed URL, DNS, TLS,
        timeout, ...).
    """
    try:
        # Request() raises ValueError for scheme-less URLs (e.g. a relative
        # hreflang href), so build it inside the try block.
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        opener = urllib.request.build_opener(
            _NoRedirectHandler,
            urllib.request.HTTPSHandler(context=ssl.create_default_context()),
        )
        # Body is not needed; the context manager just releases the connection.
        with opener.open(req, timeout=timeout) as resp:
            return resp.getcode()
    except urllib.error.HTTPError as e:
        return e.code
    except Exception:
        # Best-effort by design, but unlike a bare ``except:`` this lets
        # KeyboardInterrupt / SystemExit propagate.
        return 0


def get_sitemap_urls(sitemap_url, _seen=None):
    """Return every page URL listed in a sitemap, expanding sitemap indexes.

    Args:
        sitemap_url: URL of a sitemap or sitemap index (plain or gzipped XML).
        _seen: Internal set of sitemap URLs already visited; callers should
            leave it unset. It stops a self- or mutually-referencing index
            from recursing forever.

    Returns:
        A list of page URLs: first those from nested sitemaps (in document
        order), then those listed directly. If this sitemap cannot be fetched
        or parsed, the error is printed to stderr and ``[]`` is returned.
    """
    import gzip

    seen = set() if _seen is None else _seen
    if sitemap_url in seen:
        return []
    seen.add(sitemap_url)

    def local_name(tag):
        # ElementTree spells namespaced tags as '{uri}name'; matching on the
        # local part works regardless of how the xmlns attribute was written.
        return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else ''

    try:
        req = urllib.request.Request(sitemap_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        ctx = ssl.create_default_context()
        with urllib.request.urlopen(req, timeout=15, context=ctx) as resp:
            raw = resp.read()
        if raw[:2] == b'\x1f\x8b':  # gzip magic number (e.g. sitemap.xml.gz)
            raw = gzip.decompress(raw)
        # Parse bytes so the XML declaration's encoding (and any BOM) is honoured.
        root = ET.fromstring(raw)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return []

    child_sitemaps = []
    page_urls = []
    for parent in root.iter():
        kind = local_name(parent.tag)
        if kind not in ('sitemap', 'url'):
            continue
        for child in parent:
            if local_name(child.tag) != 'loc':
                continue
            loc = (child.text or '').strip()
            if not loc:
                # An empty <loc/> must not discard the rest of the sitemap.
                continue
            if kind == 'sitemap':
                child_sitemaps.append(loc)
            else:
                page_urls.append(loc)

    urls = []
    for child_url in child_sitemaps:
        urls.extend(get_sitemap_urls(child_url, seen))
    urls.extend(page_urls)
    return urls


def extract_hreflang(html):
    """Extract hreflang annotations from an HTML document.

    Every ``<link>`` element whose ``rel`` contains ``alternate`` and that
    carries both ``hreflang`` and ``href`` attributes is reported, regardless
    of attribute order, quoting style or letter case. Character references in
    attribute values (e.g. ``&amp;``) are decoded by the parser.

    Args:
        html: HTML source text.

    Returns:
        A list of ``(lang, href)`` tuples in document order; empty when the
        document has no hreflang links.
    """
    from html.parser import HTMLParser

    class _HreflangCollector(HTMLParser):
        """Collects (lang, href) pairs from alternate <link> elements."""

        def __init__(self):
            super().__init__(convert_charrefs=True)
            self.found = []

        def handle_starttag(self, tag, attrs):
            # Also invoked for self-closing <link ... /> via handle_startendtag.
            if tag != 'link':
                return
            attr_map = {}
            for name, value in attrs:
                # Keep the first of any duplicated attribute.
                attr_map.setdefault(name, value)
            rel_tokens = (attr_map.get('rel') or '').lower().split()
            lang = attr_map.get('hreflang')
            href = attr_map.get('href')
            if 'alternate' in rel_tokens and lang is not None and href is not None:
                self.found.append((lang.strip(), href.strip()))

    collector = _HreflangCollector()
    try:
        collector.feed(html)
        collector.close()
    except Exception:
        # Best-effort: badly malformed markup can make the parser give up;
        # keep whatever links were collected before that point.
        pass
    return collector.found


def _blank_row(page_url, page_status, issue):
    """Build the placeholder CSV row for a page that produced no hreflang entries."""
    return {
        'page_url': page_url, 'page_status': page_status,
        'hreflang_lang': '', 'hreflang_url': '', 'alternate_status': '',
        'is_self': '', 'issue': issue,
    }


def _classify_alternate(status):
    """Map an alternate URL's status code to an issue label ('' when healthy)."""
    if not status:
        return 'alternate_unreachable'
    if 300 <= status < 400:
        return 'alternate_redirects'
    if status >= 400:
        # Yields 'alternate_404' / 'alternate_403' as before, and also flags
        # other client/server errors (410, 500, ...) instead of passing them.
        return f'alternate_{status}'
    return ''


def _is_broken_issue(issue):
    """Return True for issue labels that mean the alternate URL does not resolve."""
    return issue.startswith('alternate_') and issue != 'alternate_redirects'


def _audit_page(page_url, checked_alternates):
    """Fetch one page and return its CSV rows.

    Returns one row per hreflang tag, or a single placeholder row when the
    page is inaccessible or has no hreflang tags. ``checked_alternates`` is a
    dict cache of alternate URL -> status shared across pages; it is updated
    in place.
    """
    from urllib.parse import urljoin

    status, html = fetch_url(page_url)
    if status != 200 or not html:
        return [_blank_row(page_url, status, 'page_not_accessible')]

    tags = extract_hreflang(html)
    if not tags:
        return [_blank_row(page_url, 200, 'no_hreflang_tags')]

    rows = []
    for lang, href in tags:
        # Resolve relative / protocol-relative hrefs against the page so they
        # can actually be requested; the CSV still reports the href as written.
        try:
            target = urljoin(page_url, href.strip())
        except ValueError:
            target = href
        is_self = target.rstrip('/') == page_url.rstrip('/')

        # Check alternate URL status (cache results)
        if target in checked_alternates:
            alt_status = checked_alternates[target]
        else:
            alt_status = fetch_status_only(target)
            checked_alternates[target] = alt_status
            time.sleep(0.15)

        rows.append({
            'page_url': page_url,
            'page_status': 200,
            'hreflang_lang': lang,
            'hreflang_url': href,
            'alternate_status': alt_status,
            'is_self': is_self,
            'issue': _classify_alternate(alt_status),
        })
    return rows


def main():
    """Run the hreflang audit from the command line.

    Reads ``<domain> <sitemap_url> <output_dir>`` from ``sys.argv``, audits
    every URL in the sitemap, and writes ``<domain>_hreflang.csv`` and
    ``<domain>_hreflang_summary.json`` into ``output_dir`` (created if
    missing). Progress and the final summary are printed to stderr. Exits
    with status 1 when too few arguments are given.
    """
    if len(sys.argv) < 4:
        print("Usage: python3 hreflang.py <domain> <sitemap_url> <output_dir>", file=sys.stderr)
        sys.exit(1)

    domain = sys.argv[1]
    sitemap_url = sys.argv[2]
    output_dir = sys.argv[3]
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n=== Hreflang Audit: {domain} ===", file=sys.stderr)

    sitemap_urls = get_sitemap_urls(sitemap_url)
    print(f"Found {len(sitemap_urls)} URLs to check", file=sys.stderr)

    # Extract hreflang from every page
    all_hreflang = []
    checked_alternates = {}

    for i, page_url in enumerate(sitemap_urls):
        all_hreflang.extend(_audit_page(page_url, checked_alternates))

        # Progress and the politeness delay apply to every page, including
        # inaccessible ones and pages without hreflang tags.
        if (i + 1) % 10 == 0:
            print(f"  Processed {i + 1}/{len(sitemap_urls)} pages...", file=sys.stderr)
        time.sleep(0.2)

    # Write full CSV (explicit UTF-8 so non-ASCII URLs work on any platform)
    output_file = os.path.join(output_dir, f'{domain}_hreflang.csv')
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=[
            'page_url', 'page_status', 'hreflang_lang', 'hreflang_url',
            'alternate_status', 'is_self', 'issue'
        ])
        writer.writeheader()
        writer.writerows(all_hreflang)

    # Write summary
    total_pages = len(sitemap_urls)
    tagged_rows = [r for r in all_hreflang if r['hreflang_lang']]
    pages_with_hreflang = len({r['page_url'] for r in tagged_rows})
    pages_without = len({r['page_url'] for r in all_hreflang if r['issue'] == 'no_hreflang_tags'})
    total_alternates = len(tagged_rows)
    broken = sum(1 for r in all_hreflang if _is_broken_issue(r['issue']))
    redirecting = sum(1 for r in all_hreflang if r['issue'] == 'alternate_redirects')

    summary = {
        'domain': domain,
        'total_pages': total_pages,
        'pages_with_hreflang': pages_with_hreflang,
        'pages_without_hreflang': pages_without,
        'total_alternate_links': total_alternates,
        'broken_alternates': broken,
        'redirecting_alternates': redirecting,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }
    with open(os.path.join(output_dir, f'{domain}_hreflang_summary.json'), 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    print(f"\n  Pages: {total_pages} | With hreflang: {pages_with_hreflang} | Without: {pages_without}", file=sys.stderr)
    print(f"  Alternates: {total_alternates} | Broken: {broken} | Redirecting: {redirecting}", file=sys.stderr)
    print(f"  Output: {output_file}", file=sys.stderr)


# Run the audit only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
