#!/usr/bin/env python3
"""
SEO Audit: Site Crawler
Checks all sitemap URLs and crawls pages for broken internal links.
Outputs machine-verified CSV data — no AI interpretation.

Usage: python3 crawl.py <domain> <sitemap_url> <output_dir>
Example: python3 crawl.py example.com https://example.com/sitemap.xml ./crawl-data
"""
import xml.etree.ElementTree as ET
import urllib.request
import urllib.error
import ssl
import sys
import csv
import time
import re
import json
import os
from html.parser import HTMLParser
from urllib.parse import urlparse, urljoin


class LinkExtractor(HTMLParser):
    """HTML parser that collects the absolute target of every ``<a href>``.

    Each non-empty ``href`` found on an anchor start tag is resolved against
    ``base_url`` with ``urljoin`` and appended to ``links`` in document order.
    Duplicates are kept; filtering is left to the caller.
    """

    def __init__(self, base_url):
        """Store *base_url* for resolving relative hrefs; start with no links."""
        super().__init__()
        self.base_url = base_url
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the resolved href(s) of an anchor tag; ignore other tags."""
        if tag != 'a':
            return
        for name, value in attrs:
            if name != 'href' or not value:
                continue
            try:
                self.links.append(urljoin(self.base_url, value))
            except ValueError:
                # urljoin rejects some malformed hrefs (e.g. an unbalanced
                # IPv6 bracket like "http://[bad"). Skip just that href so
                # one bad link does not abort parsing of the whole page.
                continue


def fetch_url(url, timeout=15):
    """Fetch *url* with a browser-like User-Agent and report the outcome.

    This function never raises for request problems; every failure is folded
    into the returned tuple so a crawl loop can keep going.

    Args:
        url: URL to request.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        A ``(status, body, info)`` tuple:

        * success: the response status code, the body decoded as UTF-8 with
          undecodable bytes replaced, and the final URL reported by the
          response (differs from *url* after a redirect);
        * HTTP error response: the error code, ``''`` and the requested *url*;
        * any other failure (malformed URL, DNS, TLS, timeout, ...): ``0``,
          ``''`` and the exception message.
    """
    try:
        # Request() itself raises ValueError for a URL without a scheme, so it
        # must be built inside the try to honour the "never raises" contract.
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        ctx = ssl.create_default_context()
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            return resp.getcode(), resp.read().decode('utf-8', errors='replace'), resp.geturl()
    except urllib.error.HTTPError as e:
        # HTTPError wraps the error response; release its connection.
        e.close()
        return e.code, '', url
    except Exception as e:
        return 0, '', str(e)


def _sitemap_local_name(tag):
    """Return an element tag with any ``{namespace}`` prefix removed."""
    return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else ''


def get_sitemap_urls(sitemap_url, _seen=None):
    """Return every page URL listed in a sitemap, following sitemap indexes.

    Elements are matched by local tag name, so the sitemap namespace may be
    declared in any form (double/single quotes, prefixed) or omitted. The raw
    bytes are handed to the XML parser so the document's own encoding
    declaration is honoured.

    Args:
        sitemap_url: URL of a sitemap or sitemap index document.
        _seen: Internal. Set of sitemap URLs already visited during this
            traversal; prevents endless recursion when an index references
            itself. Callers should leave it unset.

    Returns:
        A list of stripped ``<loc>`` values: first those gathered from nested
        sitemaps (``<sitemap><loc>``), then this document's own pages
        (``<url><loc>``). Empty ``<loc>`` elements are skipped. On any
        fetch or parse error a message is printed to stderr and ``[]`` is
        returned.
    """
    seen = set() if _seen is None else _seen
    if sitemap_url in seen:
        return []
    seen.add(sitemap_url)

    # Deliberately broad: sitemap discovery is best-effort and must not crash
    # the crawl; the failure is reported on stderr instead.
    try:
        req = urllib.request.Request(sitemap_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        ctx = ssl.create_default_context()
        with urllib.request.urlopen(req, timeout=15, context=ctx) as resp:
            raw = resp.read()
        root = ET.fromstring(raw)

        child_sitemaps = []
        page_urls = []
        for elem in root.iter():
            if elem is root:
                # Only descendants of the root are considered.
                continue
            kind = _sitemap_local_name(elem.tag)
            if kind == 'sitemap':
                bucket = child_sitemaps
            elif kind == 'url':
                bucket = page_urls
            else:
                continue
            for child in elem:
                if _sitemap_local_name(child.tag) != 'loc':
                    continue
                # child.text is None for an empty <loc/>; skip instead of
                # letting one bad entry discard the whole sitemap.
                loc = (child.text or '').strip()
                if loc:
                    bucket.append(loc)

        urls = []
        for child_url in child_sitemaps:
            urls.extend(get_sitemap_urls(child_url, seen))
        urls.extend(page_urls)
        return urls
    except Exception as e:
        print(f"Error fetching sitemap {sitemap_url}: {e}", file=sys.stderr)
        return []


def check_sitemap_urls(sitemap_urls, output_file):
    """Fetch every sitemap URL, write the results to CSV and return them.

    Args:
        sitemap_urls: Sequence of URLs to request, one after another with a
            0.3 s pause between requests.
        output_file: Path of the CSV file to (over)write. Columns are
            ``url``, ``status``, ``redirected`` and ``final_url``.

    Returns:
        A list with one dict per URL, using the CSV column names as keys.
        ``status`` is the HTTP status code, or 0 when the request itself
        failed. ``redirected`` is True only when a response was received from
        a different final URL, which is then stored in ``final_url``
        (otherwise ``''``).
    """
    results = []
    total = len(sitemap_urls)
    for count, url in enumerate(sitemap_urls, start=1):
        status, _, info = fetch_url(url)
        # With status 0, fetch_url's third value is an error message rather
        # than a URL, so it must not be treated as a redirect target.
        failed = status == 0
        redirected = not failed and info != url
        results.append({
            'url': url, 'status': status,
            'redirected': redirected, 'final_url': info if redirected else ''
        })
        if failed:
            print(f"  [{status}] {url} ({info})", file=sys.stderr)
        elif status != 200:
            print(f"  [{status}] {url}", file=sys.stderr)
        if count % 20 == 0:
            print(f"  Checked {count}/{total} sitemap URLs...", file=sys.stderr)
        time.sleep(0.3)

    # Explicit UTF-8 so URLs with non-ASCII characters are written the same
    # way regardless of the platform's default encoding.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['url', 'status', 'redirected', 'final_url'])
        writer.writeheader()
        writer.writerows(results)

    ok = sum(1 for r in results if r['status'] == 200)
    errors = sum(1 for r in results if r['status'] not in (200, 301, 302, 308))
    print(f"  Sitemap: {len(results)} URLs | OK: {ok} | Errors: {errors}", file=sys.stderr)
    return results


def _is_internal_netloc(netloc, base_domain):
    """Return True when *netloc* is *base_domain* itself or a subdomain of it."""
    host = netloc.rsplit('@', 1)[-1].lower()  # drop any "user:pass@" prefix
    domain = base_domain.lower()
    # Compare the host both with and without a trailing ":port" so that a
    # base_domain given with or without a port keeps matching.
    for candidate in (host, host.rsplit(':', 1)[0]):
        if candidate == domain or candidate.endswith('.' + domain):
            return True
    return False


def _internal_link_targets(html, page_url, base_domain):
    """Return the set of normalised internal http(s) URLs linked from *html*."""
    parser = LinkExtractor(page_url)
    try:
        parser.feed(html)
    except Exception as exc:
        # Report the problem and keep whatever links were collected before
        # the failure instead of silently dropping the whole page.
        print(f"  Could not fully parse {page_url}: {exc}", file=sys.stderr)

    targets = set()
    for link in parser.links:
        parsed = urlparse(link)
        if parsed.scheme not in ('http', 'https'):
            continue
        if not parsed.netloc or not _is_internal_netloc(parsed.netloc, base_domain):
            continue
        # Query string and fragment are dropped; extension-less paths get a
        # trailing slash so "/about" and "/about/" are checked only once.
        clean = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
        if not clean.endswith('/') and '.' not in parsed.path.split('/')[-1]:
            clean += '/'
        targets.add(clean)
    return targets


def find_broken_links(sitemap_urls, base_domain, output_file):
    """Crawl the given pages and report internal links that do not resolve.

    Each page is fetched; pages that do not return status 200 with a
    non-empty body are skipped. Every distinct internal link on a page is
    requested at most once per run (statuses are cached across pages) and is
    reported when its status is not one of 200, 301, 302 or 308.

    Args:
        sitemap_urls: Sequence of page URLs to crawl.
        base_domain: Domain regarded as internal; a link is internal when its
            host equals this domain or is a subdomain of it.
        output_file: Path of the CSV file to (over)write. Columns are
            ``broken_url``, ``status`` and ``found_on``.

    Returns:
        A list of dicts keyed by the CSV column names, one per
        (broken link, page it was found on) pair.
    """
    checked = {}  # normalised link -> status of its first fetch
    broken_links = []
    total = len(sitemap_urls)

    for count, page_url in enumerate(sitemap_urls, start=1):
        status, html, _ = fetch_url(page_url)
        if status != 200 or not html:
            continue

        # Sorted so the CSV row order is reproducible between runs.
        for link in sorted(_internal_link_targets(html, page_url, base_domain)):
            if link in checked:
                link_status = checked[link]
            else:
                link_status, _, _ = fetch_url(link)
                checked[link] = link_status
                time.sleep(0.15)

            if link_status not in (200, 301, 302, 308):
                broken_links.append({
                    'broken_url': link, 'status': link_status, 'found_on': page_url
                })

        if count % 10 == 0:
            print(f"  Crawled {count}/{total} pages, {len(checked)} unique links, {len(broken_links)} broken...", file=sys.stderr)
        time.sleep(0.2)

    # Explicit UTF-8 so non-ASCII URLs are written identically on every platform.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['broken_url', 'status', 'found_on'])
        writer.writeheader()
        writer.writerows(broken_links)

    unique_broken = {row['broken_url'] for row in broken_links}
    print(f"  Crawled {total} pages, {len(checked)} unique links", file=sys.stderr)
    print(f"  Broken: {len(broken_links)} instances, {len(unique_broken)} unique URLs", file=sys.stderr)
    return broken_links


def main():
    """Command-line entry point: run the sitemap check, the link crawl and write a summary.

    Expects three positional arguments (domain, sitemap URL, output directory).
    With fewer, a usage line is printed to stderr and the process exits with
    status 1. All progress messages go to stderr; results are written as two
    CSV files and one JSON summary inside the output directory, each file
    name prefixed with the domain.
    """
    def log(message):
        print(message, file=sys.stderr)

    cli_args = sys.argv[1:]
    if len(cli_args) < 3:
        log("Usage: python3 crawl.py <domain> <sitemap_url> <output_dir>")
        sys.exit(1)

    domain, sitemap_url, output_dir = cli_args[:3]
    os.makedirs(output_dir, exist_ok=True)

    def out_path(suffix):
        # Every output file lives in output_dir and is prefixed with the domain.
        return os.path.join(output_dir, f'{domain}_{suffix}')

    log(f"\n=== Crawling {domain} ===")

    # Collect the page URLs listed in the sitemap.
    sitemap_urls = get_sitemap_urls(sitemap_url)
    log(f"Found {len(sitemap_urls)} URLs in sitemap")

    # Verify the status of each listed URL.
    log("\nChecking sitemap URLs...")
    check_sitemap_urls(sitemap_urls, out_path('sitemap_check.csv'))

    # Follow internal links on each page and record the broken ones.
    log("\nCrawling for broken internal links...")
    find_broken_links(sitemap_urls, domain, out_path('broken_links.csv'))

    # Record what was crawled and when.
    summary = dict(
        domain=domain,
        sitemap_url=sitemap_url,
        total_urls=len(sitemap_urls),
        timestamp=time.strftime('%Y-%m-%d %H:%M:%S'),
    )
    with open(out_path('crawl_summary.json'), 'w') as summary_file:
        json.dump(summary, summary_file, indent=2)

    log(f"\nDone! Output in {output_dir}/")


# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
