#!/usr/bin/env python3
"""
SEO Audit: AI Crawler Access Tester
Tests if AI search crawlers (GPTBot, ClaudeBot, PerplexityBot, etc.) can access the site.
Machine-verified HTTP status codes — no AI interpretation.

Usage: python3 ai_crawlers.py <url> <output_dir>
"""
import urllib.request
import urllib.error
import ssl
import sys
import csv
import json
import os
import time

# Probe table: one (label, User-Agent header value, purpose) tuple per request
# variant. main() sends one request per entry, in this order. The last three
# entries are control probes (a regular browser, an empty User-Agent and a
# common scraper UA) that main() leaves out of its "allowed" summary list.
CRAWLERS = [
    (
        'GPTBot',
        'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)',
        'ChatGPT search',
    ),
    (
        'OAI-SearchBot',
        'OAI-SearchBot/1.0',
        'OpenAI search features',
    ),
    (
        'ClaudeBot',
        'ClaudeBot/1.0; +https://www.anthropic.com/claude-bot',
        'Claude search',
    ),
    (
        'PerplexityBot',
        'PerplexityBot/1.0',
        'Perplexity search',
    ),
    (
        'Googlebot',
        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'Google search',
    ),
    (
        'Bingbot',
        'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
        'Bing search',
    ),
    (
        'Browser',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Normal browser',
    ),
    (
        'Empty-UA',
        '',
        'No user agent',
    ),
    (
        'Python-Requests',
        'python-requests/2.31.0',
        'Common scraper UA',
    ),
]


def test_crawler(url, ua_string, timeout=15):
    """Request *url* with the given User-Agent and return the HTTP status code.

    Redirects are followed by ``urlopen``, so the code returned is that of the
    final response.

    Args:
        url: Absolute URL to request.
        ua_string: Value sent in the ``User-Agent`` header (may be empty).
        timeout: Socket timeout in seconds.

    Returns:
        The HTTP status code of the response (including 4xx/5xx codes), or
        ``0`` when no HTTP response was obtained (DNS failure, refused
        connection, TLS error, timeout, ...).

    Raises:
        ValueError: If *url* is not a valid absolute URL (raised when the
            request object is built, before any network I/O).
    """
    ctx = ssl.create_default_context()
    req = urllib.request.Request(url, headers={'User-Agent': ua_string})
    try:
        # The context manager closes the connection as soon as the status is read.
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            return resp.getcode()
    except urllib.error.HTTPError as e:
        # 4xx/5xx still are answers from the server: report the real code.
        return e.code
    except Exception:
        # Best-effort probe: any transport-level failure is reported as 0.
        # ``Exception`` (not a bare ``except``) lets Ctrl-C / SystemExit through.
        return 0


# Browser-like User-Agent used for the auxiliary robots.txt / llms.txt fetches.
_AUX_FETCH_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

# Probe labels that are controls rather than search/AI crawlers.
_CONTROL_PROBES = ('Browser', 'Empty-UA', 'Python-Requests')

# Bot tokens searched for (case-insensitively) in robots.txt.
_ROBOTS_AI_BOT_NAMES = (
    'GPTBot', 'ClaudeBot', 'PerplexityBot', 'OAI-SearchBot',
    'CCBot', 'anthropic-ai', 'Google-Extended', 'Bytespider', 'cohere-ai',
)


def _is_success(status):
    """Return True when *status* is a 2xx/3xx HTTP status code."""
    return 200 <= status < 400


def _fetch_status_and_text(url, timeout, want_body=False):
    """Fetch *url* and return ``(status_code, body_text)``.

    The body is only read when *want_body* is true; otherwise it is ``''``.
    HTTP error responses yield ``(code, '')``; any other failure yields
    ``(0, '')`` and a warning on stderr.
    """
    ctx = ssl.create_default_context()
    req = urllib.request.Request(url, headers={'User-Agent': _AUX_FETCH_UA})
    try:
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            status = resp.getcode()
            body = resp.read().decode('utf-8', errors='replace') if want_body else ''
            return status, body
    except urllib.error.HTTPError as e:
        return e.code, ''
    except Exception as e:
        print(f"  Warning: could not fetch {url}: {e}", file=sys.stderr)
        return 0, ''


def _find_robots_directives(robots_content, bot_names):
    """Return one ``{'bot', 'directive'}`` dict per robots.txt line naming a bot.

    Matching is a case-insensitive substring test. Results are grouped by bot
    (in *bot_names* order) and, within a bot, keep the file's line order.
    """
    # Strip and lowercase every line once instead of once per bot.
    lines = [(line.strip(), line.lower()) for line in robots_content.split('\n')]
    directives = []
    for bot in bot_names:
        needle = bot.lower()
        directives.extend(
            {'bot': bot, 'directive': text}
            for text, lowered in lines
            if needle in lowered
        )
    return directives


def main():
    """Probe a URL with each crawler User-Agent and write the audit reports.

    Reads ``<url>`` and ``<output_dir>`` from ``sys.argv``. Writes
    ``<domain>_ai_crawlers.csv`` (one row per probe) and
    ``<domain>_ai_crawlers_summary.json`` into the output directory, and
    prints progress to stderr. Exits with status 1 on missing arguments or
    when the URL is not an absolute http(s) URL.
    """
    if len(sys.argv) < 3:
        print("Usage: python3 ai_crawlers.py <url> <output_dir>", file=sys.stderr)
        sys.exit(1)

    url = sys.argv[1]
    output_dir = sys.argv[2]

    from urllib.parse import urlparse
    parsed = urlparse(url)
    # Reject unusable URLs up front instead of crashing inside the first probe.
    if parsed.scheme not in ('http', 'https') or not parsed.netloc:
        print(f"Error: expected an absolute http(s) URL, got {url!r}", file=sys.stderr)
        sys.exit(1)
    domain = parsed.netloc
    # robots.txt and llms.txt are looked up at the root of the host, regardless
    # of any path or query string in the tested URL.
    site_root = f'{parsed.scheme}://{parsed.netloc}'

    os.makedirs(output_dir, exist_ok=True)

    print(f"\n=== AI Crawler Access Test: {domain} ===", file=sys.stderr)

    results = []
    for name, ua, purpose in CRAWLERS:
        status = test_crawler(url, ua)
        blocked = status == 403
        results.append({
            'crawler': name, 'user_agent': ua, 'purpose': purpose,
            'status_code': status, 'blocked': blocked
        })
        # Only a 2xx/3xx answer counts as access; 0 (no response) and other
        # error codes are neither "blocked" nor "OK".
        if blocked:
            label = 'BLOCKED'
        elif _is_success(status):
            label = 'OK'
        else:
            label = 'ERROR'
        print(f"  {name}: {status} ({label})", file=sys.stderr)
        time.sleep(0.5)  # pause between probes

    # Also check robots.txt for AI crawler directives
    robots_status, robots_content = _fetch_status_and_text(
        site_root + '/robots.txt', timeout=15, want_body=True)
    robots_directives = _find_robots_directives(robots_content, _ROBOTS_AI_BOT_NAMES)

    # Check llms.txt (only the status code is recorded)
    llms_status, _ = _fetch_status_and_text(site_root + '/llms.txt', timeout=10)

    # Write CSV
    output_file = os.path.join(output_dir, f'{domain}_ai_crawlers.csv')
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['crawler', 'user_agent', 'purpose', 'status_code', 'blocked'])
        writer.writeheader()
        writer.writerows(results)

    # Write summary
    summary = {
        'domain': domain,
        'url_tested': url,
        # Lists every probe that got a 403, control probes included.
        'ai_crawlers_blocked': [r['crawler'] for r in results if r['blocked']],
        # A crawler is "allowed" only if it actually received a 2xx/3xx answer.
        'ai_crawlers_allowed': [
            r['crawler'] for r in results
            if _is_success(r['status_code']) and r['crawler'] not in _CONTROL_PROBES
        ],
        'robots_txt_status': robots_status,
        'robots_ai_directives': robots_directives,
        'llms_txt_status': llms_status,
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }
    summary_file = os.path.join(output_dir, f'{domain}_ai_crawlers_summary.json')
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    blocked_count = len(summary['ai_crawlers_blocked'])
    print(f"\n  AI crawlers blocked: {blocked_count}", file=sys.stderr)
    print(f"  llms.txt: {'exists' if llms_status == 200 else 'missing'} ({llms_status})", file=sys.stderr)


# Script entry point: run the audit only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
