import datetime
import math
import os
import xml.etree.ElementTree as ET
from urllib.parse import quote, urljoin
from xml.dom import minidom

# --- Configuration ---
# Directory scanned (non-recursively) for the .html files to list in the sitemaps.
# Ensure this matches the output_dir you use in j3.py
SOURCE_HTML_DIR = r"./job" 
# Directory the sitemap files and the sitemap index are written to.
OUTPUT_DIR = r"./"
# Prefix prepended to each HTML file name to form its <loc> URL.
BASE_URL = "https://joblegi.xyz/job/"
# Maximum number of <url> entries written to a single sitemap file.
MAX_URLS_PER_SITEMAP = 3000
# Sitemap files are named "<prefix>-NNN.xml" (three-digit, 1-based counter).
SITEMAP_FILENAME_PREFIX = "sitemap"
# File name of the index that lists every generated sitemap file.
SITEMAP_INDEX_FILENAME = "sitemap_index.xml"

# When True, every <url> also gets <changefreq> and <priority> children
# filled from the two defaults below.
INCLUDE_CHANGEFREQ_PRIORITY = True
DEFAULT_CHANGEFREQ = "daily"  # Changed to daily to match your update goal
DEFAULT_PRIORITY = "0.8"

# Force current date for lastmod so Google sees the "daily change"
# NOTE(review): neither of the two flags below is read anywhere in the visible
# part of this file; main() always stamps <lastmod> with the current date and
# lists every HTML file found. Confirm whether they are still needed.
FORCE_CURRENT_DATE_FOR_LASTMOD = True
SITEMAP_ONLY_TODAY_URLS = False 

# --- Helper Functions ---
def find_html_files_generator(directory):
    """Yield an ``os.DirEntry`` for each HTML file directly inside *directory*.

    Only regular files whose name ends in ``.html`` (case-insensitive) are
    yielded; subdirectories are not descended into. A missing directory or a
    filesystem error is reported on stdout and ends the iteration instead of
    raising, so the caller simply receives fewer (or no) entries.

    Args:
        directory: Path of the directory to scan.

    Yields:
        os.DirEntry: One entry per matching file, in the arbitrary order
        produced by ``os.scandir``.
    """
    print(f"Scanning directory: {directory}")
    try:
        if not os.path.exists(directory):
            print(f"Directory {directory} does not exist!")
            return
        # The context manager releases the directory handle even when the
        # consumer stops iterating early or an error interrupts the scan.
        with os.scandir(directory) as entries:
            for entry in entries:
                if entry.is_file() and entry.name.lower().endswith(".html"):
                    yield entry
    except OSError as e:
        # Only filesystem failures are treated as "best effort"; programming
        # errors are allowed to surface instead of being printed and ignored.
        print(f"Error scanning directory: {e}")

def prettify_xml(elem):
    """Serialize *elem* to indented UTF-8 XML and return it as bytes.

    The element is first dumped with ElementTree, then re-parsed with
    minidom, whose pretty-printer adds the two-space indentation and the
    XML declaration.
    """
    serialized = ET.tostring(elem, encoding='utf-8')
    document = minidom.parseString(serialized)
    pretty_bytes = document.toprettyxml(indent="  ", encoding="utf-8")
    return pretty_bytes

def create_sitemap_element():
    """Return a new, empty <urlset> root carrying the sitemap 0.9 namespace."""
    namespace_attrs = {"xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    return ET.Element("urlset", namespace_attrs)

def _build_urlset(entries, lastmod_date):
    """Build a <urlset> element holding one <url> child per directory entry."""
    urlset = create_sitemap_element()
    for entry in entries:
        url_node = ET.SubElement(urlset, "url")

        # Use the filename as the slug in the URL. It is percent-encoded
        # because the sitemap protocol requires URL-escaped locations, and a
        # file name may contain spaces or non-ASCII characters.
        ET.SubElement(url_node, "loc").text = f"{BASE_URL}{quote(entry.name)}"
        ET.SubElement(url_node, "lastmod").text = lastmod_date

        if INCLUDE_CHANGEFREQ_PRIORITY:
            ET.SubElement(url_node, "changefreq").text = DEFAULT_CHANGEFREQ
            ET.SubElement(url_node, "priority").text = DEFAULT_PRIORITY
    return urlset


def _write_xml(path, element):
    """Write *element* to *path* as pretty-printed UTF-8 XML."""
    with open(path, "wb") as f:
        f.write(prettify_xml(element))


def main():
    """Generate the sitemap files and the sitemap index.

    Scans SOURCE_HTML_DIR for HTML files, writes them in chunks of at most
    MAX_URLS_PER_SITEMAP URLs to "<prefix>-NNN.xml" files inside OUTPUT_DIR,
    then writes an index file listing every sitemap produced. Every
    <lastmod> is stamped with today's local date. When no HTML file is
    found, no sitemap and no index are written.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # os.scandir yields entries in arbitrary order; sorting by name keeps each
    # URL in the same sitemap file from one run to the next.
    html_files = sorted(
        find_html_files_generator(SOURCE_HTML_DIR), key=lambda e: e.name
    )
    total_files = len(html_files)
    print(f"Found {total_files} HTML files.")

    num_sitemaps = math.ceil(total_files / MAX_URLS_PER_SITEMAP)
    sitemap_files = []

    current_date = datetime.datetime.now().strftime("%Y-%m-%d")

    for i in range(num_sitemaps):
        start_idx = i * MAX_URLS_PER_SITEMAP
        # Slicing clamps at the end of the list, so the last chunk may be short.
        chunk = html_files[start_idx:start_idx + MAX_URLS_PER_SITEMAP]

        sitemap_filename = f"{SITEMAP_FILENAME_PREFIX}-{i+1:03d}.xml"
        sitemap_path = os.path.join(OUTPUT_DIR, sitemap_filename)
        _write_xml(sitemap_path, _build_urlset(chunk, current_date))

        sitemap_files.append(sitemap_filename)
        print(f"Created sitemap: {sitemap_filename}")

    # Create Sitemap Index
    if sitemap_files:
        sitemapindex = ET.Element("sitemapindex")
        sitemapindex.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")

        for sf in sitemap_files:
            sitemap_node = ET.SubElement(sitemapindex, "sitemap")
            # Index points to the site root where sitemaps are stored; the
            # root is derived from BASE_URL so the host is configured once.
            ET.SubElement(sitemap_node, "loc").text = urljoin(BASE_URL, f"/{sf}")
            ET.SubElement(sitemap_node, "lastmod").text = current_date

        index_path = os.path.join(OUTPUT_DIR, SITEMAP_INDEX_FILENAME)
        _write_xml(index_path, sitemapindex)
        print(f"Sitemap index created: {SITEMAP_INDEX_FILENAME}")

# Generate the sitemaps only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == "__main__":
    main()