# -*- coding: utf-8 -*-
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
import json
import xml.etree.ElementTree as ET
import re
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import html

# Global lock shared by the worker threads: it serializes appends to the
# output/checkpoint files and updates to the shared progress counter.
lock = threading.Lock()

# OUTPUT_FILE receives one JSON object per line; CHECKPOINT_FILE receives one
# already-processed URL per line so a later run can skip it.
OUTPUT_FILE = "jobs.ndjson"
CHECKPOINT_FILE = "checkpoint.txt"

def extract_urls_from_sitemap(xml_content):
    """Return the page URLs listed in a sitemap ``<urlset>`` document.

    Only ``<url><loc>`` entries that are direct children of the root element
    and live in the sitemaps.org 0.9 namespace are read; ``<sitemap>`` entries
    of a sitemap index are not followed.

    Args:
        xml_content: The sitemap XML, normally the decoded response text.

    Returns:
        A list of whitespace-stripped, non-empty URL strings in document
        order. An empty list is returned (and the error printed) when the
        document cannot be parsed.
    """
    if isinstance(xml_content, str):
        # Expat refuses an XML declaration that is preceded by anything, so a
        # stray BOM or leading blank line would otherwise discard the sitemap.
        xml_content = xml_content.lstrip("\ufeff \t\r\n")

    try:
        root = ET.fromstring(xml_content)
    except Exception as e:
        # Best-effort: one bad sitemap must not abort the whole run.
        print(f"Error parsing XML sitemap: {e}")
        return []

    namespace = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    urls = []
    for url_elem in root.findall("sm:url", namespace):
        loc_elem = url_elem.find("sm:loc", namespace)
        if loc_elem is None or not loc_elem.text:
            continue
        location = loc_elem.text.strip()
        if location:  # skip <loc> elements that contain only whitespace
            urls.append(location)
    return urls

def attempt_fix_and_parse_json(json_text):
    """Best-effort repair of JSON text whose "description" value breaks parsing.

    The first ``"description":`` key (matched case-insensitively) is located,
    the end of its string value is guessed, and that value is replaced by its
    tag-stripped, entity-decoded text re-escaped as a JSON string body. The
    patched text is then parsed.

    Returns:
        The parsed JSON value, or None when no description key or closing
        quote candidate is found, or when any step raises.
    """
    try:
        key_match = re.search(r'"description"\s*:', json_text, re.IGNORECASE)
        if key_match is None:
            return None

        colon_idx = json_text.find(':', key_match.start(0))
        # NOTE(review): if no quote follows the colon, find() yields -1 and
        # the value is treated as starting at offset 0.
        body_start = json_text.find('"', colon_idx) + 1
        tail = json_text[body_start:]

        # Prefer the first quote that is followed by a comma or closing
        # brace; otherwise fall back to the first quote of any kind.
        closer = re.search(r'"\s*(,|}|\s*,)', tail)
        if closer is None:
            closer = re.search(r'"', tail)
        if closer is None:
            return None
        body_end = body_start + closer.start()

        markup = BeautifulSoup(json_text[body_start:body_end], 'html.parser')
        plain_text = html.unescape(markup.get_text(separator=' ', strip=True))

        # json.dumps escapes the text for JSON; drop the surrounding quotes
        # it adds because the original quotes are still in place.
        escaped_body = json.dumps(plain_text)[1:-1]

        repaired = "".join((json_text[:body_start], escaped_body, json_text[body_end:]))
        return json.loads(repaired)
    except Exception:
        return None

def _is_job_posting(node):
    """Return True when *node* is a dict whose ``@type`` is, or lists, "JobPosting"."""
    if not isinstance(node, dict):
        return False
    node_type = node.get("@type")
    if isinstance(node_type, list):
        # JSON-LD allows @type to be an array of types.
        return "JobPosting" in node_type
    return node_type == "JobPosting"


def extract_job_postings(html_content):
    """Collect JobPosting objects from the JSON-LD scripts of an HTML page.

    Every ``<script type="application/ld+json">`` element is parsed. When
    strict parsing fails, ``attempt_fix_and_parse_json`` is tried. Postings
    are taken from the top-level object, from the items of a top-level list,
    and from the ``@graph`` list of any of those objects.

    Args:
        html_content: The page markup.

    Returns:
        A list of the JobPosting dicts found, in document order. Scripts that
        are empty, unparseable or raise while being inspected are skipped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    job_postings_data = []

    for script_tag in soup.find_all("script", type="application/ld+json"):
        # Best-effort per script: a malformed block must not hide the others.
        try:
            json_text = script_tag.string or script_tag.get_text()
            if not json_text or not json_text.strip():
                continue

            try:
                parsed_data = json.loads(json_text)
            except json.JSONDecodeError:
                parsed_data = attempt_fix_and_parse_json(json_text)

            if not parsed_data:
                continue

            items_to_check = parsed_data if isinstance(parsed_data, list) else [parsed_data]
            for item in items_to_check:
                if _is_job_posting(item):
                    job_postings_data.append(item)
                elif isinstance(item, dict) and isinstance(item.get("@graph"), list):
                    job_postings_data.extend(
                        graph_item for graph_item in item["@graph"]
                        if _is_job_posting(graph_item)
                    )
        except Exception:
            continue

    return job_postings_data

def _address_part(value):
    """Return an address component as text, or None when it has no usable text."""
    if isinstance(value, dict):
        # e.g. {"@type": "Country", "name": "US"}
        value = value.get("name")
    return value if isinstance(value, str) else None


def extract_fields(job_posting):
    """Flatten a JobPosting dict into the record written to the output file.

    Args:
        job_posting: A parsed schema.org JobPosting object (a dict).

    Returns:
        A dict with the keys ``job_title``, ``company_name``, ``body_parts``
        (a one-element list holding the description), ``location``,
        ``apply_url`` and ``employment_type``. Missing or unusable values fall
        back to "Untitled Job", "Hiring Company", "Remote", "joblegi.xyz" and
        "FULL_TIME" respectively.
    """
    job_title = job_posting.get("title") or job_posting.get("jobTitle", "Untitled Job")
    if not isinstance(job_title, str):
        # A null/non-text title would crash on .strip() below.
        job_title = "Untitled Job"

    description = job_posting.get("description", "")

    # Company name: only a text "name" inside an organization object is used.
    company_name = "Hiring Company"
    hiring_org = job_posting.get("hiringOrganization", {})
    if isinstance(hiring_org, dict):
        org_name = hiring_org.get("name", "Hiring Company")
        if isinstance(org_name, str):
            company_name = org_name

    # Apply URL: prefer "url", else "mainEntityOfPage" (which may be an
    # object carrying the address in "@id").
    apply_url = job_posting.get("url") or job_posting.get("mainEntityOfPage", "")
    if isinstance(apply_url, dict):
        apply_url = apply_url.get("@id", "")

    # Location: built from the first jobLocation's address; stays "Remote"
    # when nothing usable is present.
    location = "Remote"
    job_loc = job_posting.get("jobLocation")
    if job_loc:
        loc_item = job_loc[0] if isinstance(job_loc, list) else job_loc
        if isinstance(loc_item, dict):
            addr = loc_item.get("address", {})
            if isinstance(addr, dict):
                # Components may be plain text or nested objects; joining a
                # raw dict would raise TypeError and lose the whole page.
                parts = (
                    _address_part(addr.get(key))
                    for key in ("addressLocality", "addressRegion", "addressCountry")
                )
                found_loc = ", ".join(filter(None, parts))
                if found_loc:
                    location = found_loc

    return {
        "job_title": job_title.strip(),
        "company_name": company_name.strip(),
        "body_parts": [description],
        "location": location,
        "apply_url": apply_url if apply_url else "joblegi.xyz",
        "employment_type": job_posting.get("employmentType", "FULL_TIME")
    }

def process_job_page(url, headers):
    """Fetch one page and return the flattened job records found on it.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.

    Returns:
        A list with one ``extract_fields`` record per JobPosting found. The
        list is empty when the response status is not 200 or when anything
        raises (the error is printed).
    """
    try:
        page = requests.get(url, headers=headers, timeout=30)
        if page.status_code != 200:
            return []

        # Decode the body with the encoding guessed from its content.
        page.encoding = page.apparent_encoding
        postings = extract_job_postings(page.text)
        return [extract_fields(posting) for posting in postings]
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return []

def save_checkpoint(url):
    """Append *url* as one line to the checkpoint file while holding the global lock."""
    with lock, open(CHECKPOINT_FILE, "a", encoding="utf-8") as checkpoint:
        entry = url + "\n"
        checkpoint.write(entry)

def save_job_data(job_data):
    """Append *job_data* as one JSON line to the output file while holding the global lock."""
    with lock, open(OUTPUT_FILE, "a", encoding="utf-8") as sink:
        record = json.dumps(job_data)
        sink.write(record + "\n")

def load_checkpoint():
    """Return the set of URLs recorded in the checkpoint file.

    Each non-blank line of the file, stripped of surrounding whitespace, is
    one URL. An empty set is returned when the file does not exist.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return set()

    with open(CHECKPOINT_FILE, "r", encoding="utf-8") as checkpoint:
        return {entry for entry in map(str.strip, checkpoint) if entry}

def process_and_checkpoint(url, headers, counters, total):
    """Scrape one URL, persist its jobs, checkpoint it and report progress.

    Args:
        url: Page to process.
        headers: HTTP headers forwarded to ``process_job_page``.
        counters: Shared dict whose "processed" entry is incremented.
        total: Number of URLs in this run, used only in the progress line.

    The URL is written to the checkpoint whether or not any job was
    extracted from it.
    """
    for record in process_job_page(url, headers):
        save_job_data(record)
    save_checkpoint(url)

    # The counter is shared by all workers, so bump and print under the lock.
    with lock:
        counters["processed"] += 1
        done = counters["processed"]
        print(f"Progress: {done}/{total} URLs ({url})")

def main():
    """Run the scraper end to end.

    Reads sitemap addresses from ``sitemaps.txt``, collects the page URLs
    they list, drops the ones already present in the checkpoint file and
    processes the rest on a pool of ten worker threads.
    """
    sitemap_list_file = "sitemaps.txt"
    if not os.path.exists(sitemap_list_file):
        print(f"Error: {sitemap_list_file} not found.")
        return

    with open(sitemap_list_file, "r", encoding="utf-8") as listing:
        stripped_lines = (raw.strip() for raw in listing)
        sitemap_urls = [entry for entry in stripped_lines if entry]

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

    # Gather page URLs from every sitemap; a failing sitemap is reported
    # and skipped.
    all_job_urls = []
    for sitemap_url in sitemap_urls:
        try:
            print(f"Fetching sitemap: {sitemap_url}")
            res = requests.get(sitemap_url, headers=headers, timeout=30)
            if res.status_code != 200:
                continue
            all_job_urls.extend(extract_urls_from_sitemap(res.text))
        except Exception as e:
            print(f"Sitemap error: {e}")

    # De-duplicate, order deterministically, and skip checkpointed URLs.
    all_job_urls = sorted(set(all_job_urls))
    processed_urls = load_checkpoint()
    remaining_urls = [u for u in all_job_urls if u not in processed_urls]

    counters = {"processed": 0}
    max_workers = 10
    total = len(remaining_urls)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_and_checkpoint, url, headers, counters, total)
            for url in remaining_urls
        ]
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"Thread error: {e}")

    print(f"\nDone! Data saved to {OUTPUT_FILE}")

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()