from flask import Flask, Response, request
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from datetime import datetime
import time
import hashlib
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED
import logging
from functools import lru_cache

# Logging setup
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = Flask(__name__)

# Configuration
BASE_URL = "https://www.bksoftware.co"
CACHE_TIMEOUT = 3600  # 1 hour in seconds
REQUEST_TIMEOUT = 10
MAX_THREADS = 10
MAX_DEPTH = 3
USER_AGENT = "BKSOFTWARE-Sitemap-Generator/1.0 (+https://www.bksoftware.co/sitemap.xml)"

# Priority URLs - Important pages that should always be included
PRIORITY_URLS = [
    BASE_URL + "/",
    BASE_URL + "/en/",
    BASE_URL + "/en/about",
    BASE_URL + "/en/products",
    BASE_URL + "/en/projects",
    BASE_URL + "/en/solutions",
    BASE_URL + "/en/blog",
    BASE_URL + "/en/contact",
    BASE_URL + "/en/team",
    BASE_URL + "/en/terms",
    BASE_URL + "/en/privacypolicy",
    BASE_URL + "/en/sitemap"
]

# Language codes
LANGUAGES = ["en", "de", "fr", "tr", "es", "pl", "it", "jp", "kr"]
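# NOTE: these mirror the site's URL path segments; the hreflang attribute
# technically expects ISO 639-1 codes ("ja", "ko" rather than "jp", "kr"),
# so search engines may ignore those two alternates.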

class SitemapGenerator:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
        
    def is_valid_url(self, url):
        """Check if URL is valid for sitemap inclusion"""
        parsed = urlparse(url)
        
        # Skip non-HTTP URLs
        if parsed.scheme not in ['http', 'https']:
            return False
            
        # Ensure the URL is on our host (compare hosts rather than doing a
        # substring match, which an external URL could satisfy by accident)
        if parsed.netloc != urlparse(BASE_URL).netloc:
            return False
            
        # Skip common non-page URLs
        skip_extensions = ['.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.ico', 
                          '.css', '.js', '.zip', '.rar', '.exe', '.mp4', '.mp3']
        if any(url.lower().endswith(ext) for ext in skip_extensions):
            return False
            
        # Skip admin/login pages
        skip_keywords = ['admin', 'login', 'logout', 'register', 'dashboard', 
                        'wp-admin', 'phpmyadmin', 'cpanel']
        if any(keyword in url.lower() for keyword in skip_keywords):
            return False
            
        # Skip URL fragments
        if '#' in url:
            return False
            
        return True
    
    def normalize_url(self, url):
        """Normalize URL by removing query parameters and fragments"""
        parsed = urlparse(url)
        # Remove query and fragment
        normalized = parsed._replace(query='', fragment='').geturl()
        return normalized
    
    def get_page_priority(self, url):
        """Assign priority based on page importance"""
        url_path = urlparse(url).path
        
        # Homepage highest priority
        if url_path in ['/', '/en/', '/en']:
            return '1.0'
            
        # Important pages
        important_pages = ['/en/about', '/en/products', '/en/solutions', '/en/contact']
        if any(url_path.startswith(page) for page in important_pages):
            return '0.9'
            
        # Project pages
        if '/en/projects' in url_path:
            return '0.8'
            
        # Blog pages
        if '/en/blog' in url_path:
            return '0.7'
            
        # Legal pages
        if '/en/terms' in url_path or '/en/privacypolicy' in url_path:
            return '0.6'
            
        # Default priority
        return '0.5'
    
    def get_change_frequency(self, url):
        """Determine change frequency based on page type"""
        url_path = urlparse(url).path
        
        # Frequently changing pages
        if '/en/blog' in url_path or '/en/news' in url_path:
            return 'weekly'
            
        # Project pages (moderate updates)
        if '/en/projects' in url_path:
            return 'monthly'
            
        # Static pages (rarely change)
        static_pages = ['/en/about', '/en/team', '/en/terms', '/en/privacypolicy']
        if any(url_path.startswith(page) for page in static_pages):
            return 'yearly'
            
        # Default
        return 'monthly'
    
    def fetch_url(self, url, depth=0):
        """Fetch URL and extract links with error handling"""
        if depth > MAX_DEPTH:
            return set()
            
        try:
            response = self.session.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            
            # Check content type
            content_type = response.headers.get('Content-Type', '')
            if 'text/html' not in content_type:
                return set()
                
            soup = BeautifulSoup(response.text, 'html.parser')
            links = set()
            
            # Extract all links
            for link in soup.find_all("a", href=True):
                href = link["href"].strip()
                
                # Skip empty links
                if not href:
                    continue
                    
                # Resolve relative URLs against the page they were found on;
                # urljoin leaves absolute URLs untouched
                href = urljoin(url, href)
                
                # Normalize and validate URL
                href = self.normalize_url(href)
                if self.is_valid_url(href):
                    links.add(href)
            
            # Add language variations for pages under /en/ (assumes every
            # English page has a counterpart at the same path in each language)
            if '/en/' in url:
                for lang in LANGUAGES:
                    if lang != 'en':
                        lang_url = url.replace('/en/', f'/{lang}/')
                        if self.is_valid_url(lang_url):
                            links.add(lang_url)
            
            return links
            
        except Exception as e:
            logger.warning(f"Failed to fetch {url}: {str(e)}")
            return set()
    
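    # NOTE: lru_cache(maxsize=1) memoizes the sitemap for the lifetime of the
    # process; CACHE_TIMEOUT only controls the HTTP Cache-Control header.
    # Hit /sitemap-refresh to clear the cache and force regeneration.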
    @lru_cache(maxsize=1)
    def generate_sitemap_xml(self):
        """Generate complete sitemap XML with caching"""
        logger.info("Starting sitemap generation...")
        start_time = time.time()
        
        # Start with priority URLs
        all_urls = set(PRIORITY_URLS)
        # Seed URLs are marked visited up front so crawled pages cannot
        # re-queue them
        visited_urls = set(PRIORITY_URLS)
        
        # Use ThreadPoolExecutor for parallel fetching. The pending dict maps
        # each in-flight future to its (url, depth) pair; draining it in a
        # while-loop also picks up futures submitted mid-crawl, which a single
        # pass over as_completed() would miss.
        with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
            # Submit seed URLs at depth 0
            pending = {
                executor.submit(self.fetch_url, url, 0): (url, 0)
                for url in PRIORITY_URLS
            }
            
            # Process results until no crawl tasks remain
            while pending:
                done, _ = wait(pending, return_when=FIRST_COMPLETED)
                for future in done:
                    url, depth = pending.pop(future)
                    try:
                        new_links = future.result()
                        all_urls.update(new_links)
                        
                        # Queue newly discovered URLs, one level deeper,
                        # until MAX_DEPTH is reached
                        if depth < MAX_DEPTH:
                            for new_url in new_links:
                                if new_url not in visited_urls:
                                    visited_urls.add(new_url)
                                    pending[
                                        executor.submit(self.fetch_url, new_url, depth + 1)
                                    ] = (new_url, depth + 1)
                                    
                    except Exception as e:
                        logger.error(f"Error processing {url}: {str(e)}")
        
        logger.info(f"Found {len(all_urls)} URLs in {time.time() - start_time:.2f} seconds")
        
        # Create XML structure
        urlset = ET.Element("urlset")
        urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
        urlset.set("xmlns:xhtml", "http://www.w3.org/1999/xhtml")
        
        # Add URLs with metadata
        today = datetime.now().strftime('%Y-%m-%d')
        
        for url in sorted(all_urls):
            # Create URL entry
            url_element = ET.SubElement(urlset, "url")
            
            # Location
            loc = ET.SubElement(url_element, "loc")
            loc.text = url
            
            # Last modification: the crawl does not read real dates, so the
            # generation date is used as an estimate for every URL
            lastmod = ET.SubElement(url_element, "lastmod")
            lastmod.text = today
            
            # Change frequency
            changefreq = ET.SubElement(url_element, "changefreq")
            changefreq.text = self.get_change_frequency(url)
            
            # Priority
            priority = ET.SubElement(url_element, "priority")
            priority.text = self.get_page_priority(url)
            
            # Add hreflang for multilingual pages
            if '/en/' in url:
                for lang in LANGUAGES:
                    if lang != 'en':
                        lang_url = url.replace('/en/', f'/{lang}/')
                        if lang_url in all_urls:
                            link_element = ET.SubElement(url_element, "xhtml:link")
                            link_element.set("rel", "alternate")
                            link_element.set("hreflang", lang)
                            link_element.set("href", lang_url)
            
            # Point non-English pages at their English counterpart. Note:
            # rel="canonical" is not part of the sitemap protocol, so most
            # crawlers will ignore this hint.
            for lang in LANGUAGES:
                if lang != 'en' and url.startswith(f"{BASE_URL}/{lang}/"):
                    canonical_url = url.replace(f"/{lang}/", "/en/", 1)
                    link_element = ET.SubElement(url_element, "xhtml:link")
                    link_element.set("rel", "canonical")
                    link_element.set("href", canonical_url)
                    break
        
        # Serialize the tree; encoding='unicode' returns a str without an XML
        # declaration, so the one prepended below is the only declaration
        xml_body = ET.tostring(urlset, encoding='unicode', method='xml')
        
        # Add XML declaration and header comments
        xml_header = '<?xml version="1.0" encoding="UTF-8"?>\n'
        xml_header += '<!-- Generated by BK SOFTWARE CO. Sitemap Generator -->\n'
        xml_header += '<!-- Last updated: {} -->\n'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        )
        
        return (xml_header + xml_body).encode('utf-8')

# Initialize generator
sitemap_generator = SitemapGenerator()
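# Note: the generator shares one requests.Session across its worker threads;
# requests does not formally guarantee Session thread-safety, though simple
# GETs like these are generally fine in practice.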

@app.route('/sitemap.xml')
def sitemap():
    """Serve the sitemap XML with conditional-request support"""
    try:
        # Generate (or return the cached) sitemap
        sitemap_xml = sitemap_generator.generate_sitemap_xml()
        etag = hashlib.md5(sitemap_xml).hexdigest()
        
        # Honor If-None-Match before building the full response
        if request.headers.get('If-None-Match') == etag:
            return Response(status=304, headers={'ETag': etag})
        
        # Create response with caching headers
        return Response(
            sitemap_xml,
            mimetype='application/xml',
            headers={
                'Cache-Control': f'public, max-age={CACHE_TIMEOUT}',
                'ETag': etag,
                'Last-Modified': datetime.now().strftime('%a, %d %b %Y %H:%M:%S GMT')
            }
        )
        
    except Exception as e:
        logger.error(f"Error generating sitemap: {str(e)}")
        return Response(
            '<error>Sitemap generation failed</error>',
            status=500,
            mimetype='application/xml'
        )

@app.route('/sitemap-refresh')
def refresh_sitemap():
    """Manual trigger to refresh sitemap cache (protected endpoint)"""
    # Clear cache
    sitemap_generator.generate_sitemap_xml.cache_clear()
    logger.info("Sitemap cache cleared")
    return "Sitemap cache refreshed successfully", 200

@app.route('/sitemap-stats')
def sitemap_stats():
    """Show sitemap statistics"""
    try:
        sitemap_xml = sitemap_generator.generate_sitemap_xml()
        root = ET.fromstring(sitemap_xml.split(b'-->')[-1])  # Skip header comments
        # The serialized sitemap declares a default namespace, so the
        # re-parsed tags are namespace-qualified
        ns = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
        url_count = len(root.findall(ns + 'url'))
        
        stats = {
            'total_urls': url_count,
            'generated_at': datetime.now().isoformat(),
            'base_url': BASE_URL,
            'cache_status': 'active',
            'languages': len(LANGUAGES)
        }
        
        return stats, 200
    except Exception as e:
        return {'error': str(e)}, 500

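# For production, run the app under a WSGI server (e.g. gunicorn) instead of
# Flask's built-in development server.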
if __name__ == '__main__':
    app.run(
        host='0.0.0.0',
        port=5000,
        debug=False,
        threaded=True
    )