#!/usr/bin/env python3
"""
SEO Audit: Meta Tag & Content Extractor
Extracts title, meta description, meta robots, canonical, H1/H2 headings, word count,
and Open Graph tags from every page listed in a sitemap.
Machine-verified data — no AI interpretation.

Usage: python3 meta.py <domain> <sitemap_url> <output_dir>
"""
import xml.etree.ElementTree as ET
import urllib.request
import ssl
import sys
import csv
import time
import re
import json
import os


def fetch_url(url, timeout=15):
    """Fetch *url* and return ``(status_code, body_text)``.

    The body is decoded as UTF-8 with undecodable bytes replaced.

    This is a best-effort helper and does not raise for ordinary failures:
      * an HTTP error response yields ``(<http status>, '')``;
      * any other failure (malformed URL, DNS/connection error, timeout,
        TLS error, ...) yields ``(0, '')``.

    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately *not*
    swallowed, so a long crawl can still be aborted.
    """
    try:
        # Request() itself raises ValueError for a malformed URL, so it is
        # built inside the try block to keep one bad URL from aborting a run.
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        ctx = ssl.create_default_context()
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            return resp.getcode(), resp.read().decode('utf-8', errors='replace')
    except urllib.error.HTTPError as e:
        return e.code, ''
    except Exception:
        # Best-effort boundary: report "no response" instead of crashing.
        return 0, ''


def get_sitemap_urls(sitemap_url, _seen=None):
    """Return the page URLs listed in a sitemap, following sitemap indexes.

    ``<sitemap><loc>`` entries are fetched recursively and their URLs come
    first, followed by this document's own ``<url><loc>`` entries.

    Elements are matched regardless of how (or whether) the document
    declares its XML namespace. Empty ``<loc>`` elements are skipped.

    Args:
        sitemap_url: URL of a sitemap or sitemap index.
        _seen: internal set of sitemap URLs already visited; used to stop
            a sitemap index that references itself from recursing forever.
            Callers should leave it unset.

    Returns:
        A list of URL strings. On any fetch or parse error the error is
        printed to stderr and an empty list is returned for that sitemap.
    """
    seen = set() if _seen is None else _seen
    if sitemap_url in seen:
        return []
    seen.add(sitemap_url)

    try:
        req = urllib.request.Request(sitemap_url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        ctx = ssl.create_default_context()
        with urllib.request.urlopen(req, timeout=15, context=ctx) as resp:
            raw = resp.read()

        # Parse the raw bytes so the XML declaration's encoding (and any BOM)
        # is honoured by the parser instead of assuming UTF-8.
        root = ET.fromstring(raw)

        nested_sitemaps = []
        page_urls = []
        for parent in root.iter():
            tag = parent.tag
            if not isinstance(tag, str):
                continue
            # Namespaced tags look like "{namespace-uri}local"; plain tags
            # have no "}" and therefore an empty namespace part.
            namespace, _, local = tag.rpartition('}')
            if local not in ('sitemap', 'url'):
                continue
            # Only accept a <loc> in the same namespace as its parent, so
            # extension elements (e.g. image:loc) are not mistaken for pages.
            loc_tag = namespace + '}loc' if namespace else 'loc'
            bucket = nested_sitemaps if local == 'sitemap' else page_urls
            for child in parent:
                if child.tag == loc_tag:
                    loc = (child.text or '').strip()
                    if loc:
                        bucket.append(loc)

        urls = []
        for nested in nested_sitemaps:
            urls.extend(get_sitemap_urls(nested, seen))
        urls.extend(page_urls)
        return urls
    except Exception as e:
        # Best-effort boundary: one broken sitemap must not abort the audit.
        print(f"Error: {e}", file=sys.stderr)
        return []


def strip_html(text):
    """Return *text* with tags removed, entities decoded, and ends trimmed.

    Tags are removed before entities are decoded so that an escaped
    ``&lt;b&gt;`` stays in the output as literal text rather than being
    treated as markup. Decoding matters for the audit: ``Tom &amp; Jerry``
    must be reported (and measured) as ``Tom & Jerry``.
    """
    # Local import: callers in this file use ``html`` as a parameter name.
    from html import unescape

    without_tags = re.sub(r'<[^>]+>', '', text)
    return unescape(without_tags).strip()


def count_words(html):
    """Count whitespace-separated words in the page's visible text.

    Whole ``<script>``, ``<style>``, ``<nav>``, ``<header>`` and ``<footer>``
    elements are dropped first, then every remaining tag is removed and
    HTML entities are decoded.

    Tags are replaced with a space rather than nothing, so text in adjacent
    elements (``<p>Hello</p><p>World</p>``) is counted as separate words
    even when the markup contains no whitespace.

    Returns 0 for empty input or input with no text.
    """
    # Local import: the ``html`` parameter shadows the module name.
    from html import unescape

    # \b keeps e.g. <navigation> from being treated as <nav>.
    cleaned = re.sub(
        r'<(script|style|nav|header|footer)\b[^>]*>.*?</\1\s*>',
        ' ', html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<[^>]+>', ' ', cleaned)
    # Decode after tag removal so &nbsp; and friends act as word separators;
    # str.split() with no argument already collapses runs of whitespace.
    return len(unescape(text).split())


# One HTML attribute: name = "value" | 'value' | bare-value.
_ATTR_RE = re.compile(r'''([^\s"'<>/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+))''')


def _tag_attrs(attr_source):
    """Parse the attribute part of a start tag into ``{lowercased name: value}``."""
    attrs = {}
    for name, double_quoted, single_quoted, bare in _ATTR_RE.findall(attr_source):
        # On duplicate attribute names the first occurrence is kept.
        attrs.setdefault(name.lower(), double_quoted or single_quoted or bare)
    return attrs


def _first_attr(source, tag, key_attr, key_value, value_attr):
    """Return *value_attr* of the first ``<tag>`` whose *key_attr* equals *key_value*.

    Tag names, attribute names and the key value are compared
    case-insensitively; attributes may appear in any order, use either
    quote style, and be mixed with unrelated attributes. The returned value
    has HTML entities decoded. Returns ``''`` when no tag qualifies.
    """
    # Local import: ``html`` is used as a parameter name in this file.
    from html import unescape

    # The attribute span skips over quoted values so a ">" inside a quoted
    # attribute value does not end the tag early.
    pattern = r'<' + tag + r'''(?=[\s/>])((?:[^>"']|"[^"]*"|'[^']*')*)>'''
    for match in re.finditer(pattern, source, re.IGNORECASE):
        attrs = _tag_attrs(match.group(1))
        if attrs.get(key_attr, '').strip().lower() == key_value and value_attr in attrs:
            return unescape(attrs[value_attr])
    return ''


def extract_meta(url, html):
    """Extract SEO-relevant tags from one page's HTML.

    Args:
        url: the page URL; echoed in the result and compared with the
            canonical link (ignoring a trailing slash on either side).
        html: the page source as text.

    Returns:
        A dict with the keys ``url``, ``title``, ``title_length``,
        ``meta_description``, ``meta_description_length``, ``meta_robots``,
        ``canonical``, ``canonical_matches_url``, ``h1_count``, ``h1_text``,
        ``h2_count``, ``h2_text`` (first 10 H2s), ``word_count``,
        ``og_title``, ``og_description``, ``og_image`` and ``og_currency``.
        Text fields are ``''`` when the corresponding tag is absent.
    """
    flags = re.DOTALL | re.IGNORECASE

    title_match = re.search(r'<title\b[^>]*>(.*?)</title>', html, flags)
    title = strip_html(title_match.group(1)) if title_match else ''

    desc = _first_attr(html, 'meta', 'name', 'description', 'content').strip()
    robots = _first_attr(html, 'meta', 'name', 'robots', 'content').strip()
    canonical = _first_attr(html, 'link', 'rel', 'canonical', 'href').strip()

    h1s = [strip_html(h) for h in re.findall(r'<h1\b[^>]*>(.*?)</h1>', html, flags)]
    h2s = [strip_html(h) for h in re.findall(r'<h2\b[^>]*>(.*?)</h2>', html, flags)]

    word_count = count_words(html)

    return {
        'url': url,
        'title': title,
        'title_length': len(title),
        'meta_description': desc,
        'meta_description_length': len(desc),
        'meta_robots': robots,
        'canonical': canonical,
        'canonical_matches_url': canonical.rstrip('/') == url.rstrip('/'),
        'h1_count': len(h1s),
        'h1_text': ' | '.join(h1s),
        'h2_count': len(h2s),
        # Only the first 10 H2s are listed to keep the CSV cell readable.
        'h2_text': ' | '.join(h2s[:10]),
        'word_count': word_count,
        'og_title': _first_attr(html, 'meta', 'property', 'og:title', 'content'),
        'og_description': _first_attr(html, 'meta', 'property', 'og:description', 'content'),
        'og_image': _first_attr(html, 'meta', 'property', 'og:image', 'content'),
        'og_currency': _first_attr(html, 'meta', 'property', 'product:price:currency', 'content'),
    }


# Column order of the per-page results CSV.
_META_FIELDS = ['url', 'title', 'title_length', 'meta_description', 'meta_description_length',
                'meta_robots', 'canonical', 'canonical_matches_url', 'h1_count', 'h1_text',
                'h2_count', 'h2_text', 'word_count', 'og_title', 'og_description',
                'og_image', 'og_currency']


def _error_row(url, status):
    """Build the placeholder results row for a page that did not return HTTP 200."""
    return {'url': url, 'title': f'HTTP {status}', 'title_length': 0,
            'meta_description': '', 'meta_description_length': 0,
            'meta_robots': '', 'canonical': '', 'canonical_matches_url': False,
            'h1_count': 0, 'h1_text': '', 'h2_count': 0, 'h2_text': '',
            'word_count': 0, 'og_title': '', 'og_description': '',
            'og_image': '', 'og_currency': ''}


def _collect_issues(meta):
    """Return the list of issue rows (url/issue/detail) flagged for one page."""
    url = meta['url']
    found = []

    def flag(issue, detail=''):
        found.append({'url': url, 'issue': issue, 'detail': detail})

    if meta['title_length'] == 0:
        flag('missing_title')
    elif meta['title_length'] > 60:
        flag('title_too_long', f"{meta['title_length']} chars")

    if meta['meta_description_length'] == 0:
        flag('missing_meta_description')
    elif meta['meta_description_length'] > 160:
        flag('meta_description_too_long', f"{meta['meta_description_length']} chars")
    elif meta['meta_description_length'] < 70:
        flag('meta_description_too_short', f"{meta['meta_description_length']} chars")

    if meta['h1_count'] == 0:
        flag('missing_h1')
    elif meta['h1_count'] > 1:
        flag('multiple_h1', f"{meta['h1_count']} H1 tags")

    # A description that opens with the first 30 characters of the title is
    # treated as a repeat of the title.
    if meta['title'] and meta['meta_description'] and meta['meta_description'].startswith(meta['title'][:30]):
        flag('description_repeats_title')

    if meta['word_count'] < 300:
        flag('thin_content', f"{meta['word_count']} words")

    return found


def _write_csv(path, fieldnames, rows):
    """Write *rows* (dicts) to *path* as a CSV file with a header row."""
    # Explicit UTF-8: page titles are routinely non-ASCII, and the platform
    # default encoding may not be able to represent them.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main():
    """Command-line entry point: audit every URL in a sitemap.

    Reads ``<domain> <sitemap_url> <output_dir>`` from ``sys.argv``, fetches
    each sitemap URL, and writes ``<domain>_meta.csv`` (one row per page) and
    ``<domain>_meta_issues.csv`` (one row per flagged issue) into
    ``output_dir``. Progress and a summary are printed to stderr. Exits with
    status 1 when too few arguments are given.
    """
    if len(sys.argv) < 4:
        print("Usage: python3 meta.py <domain> <sitemap_url> <output_dir>", file=sys.stderr)
        sys.exit(1)

    domain, sitemap_url, output_dir = sys.argv[1:4]
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n=== Meta Extraction: {domain} ===", file=sys.stderr)

    sitemap_urls = get_sitemap_urls(sitemap_url)
    total = len(sitemap_urls)
    print(f"Found {total} URLs", file=sys.stderr)

    results = []
    issues = []

    for done, url in enumerate(sitemap_urls, start=1):
        status, html = fetch_url(url)
        if status == 200:
            meta = extract_meta(url, html)
            results.append(meta)
            issues.extend(_collect_issues(meta))
        else:
            results.append(_error_row(url, status))

        # Progress and the politeness delay apply to every request, including
        # failed ones, so an erroring or rate-limiting server is not hammered.
        if done % 20 == 0:
            print(f"  Processed {done}/{total}...", file=sys.stderr)
        time.sleep(0.3)

    output_file = os.path.join(output_dir, f'{domain}_meta.csv')
    _write_csv(output_file, _META_FIELDS, results)

    issues_file = os.path.join(output_dir, f'{domain}_meta_issues.csv')
    _write_csv(issues_file, ['url', 'issue', 'detail'], issues)

    print(f"\n  Pages: {len(results)} | Issues: {len(issues)}", file=sys.stderr)
    print(f"  Output: {output_file}", file=sys.stderr)


# Run the audit only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
