import google.generativeai as genai
import pdfplumber
import pandas as pd
from PIL import Image
import io
import os
import json
import re
import time


def extract_custom_page_with_gemini(page_image, model, custom_prompt):
    """
    Ask Gemini to turn one page image into a list of row dictionaries.

    The user's requirements are embedded in a fixed instruction prompt, the
    prompt and image are passed to ``model.generate_content`` and the text of
    the reply is parsed as JSON after markdown code fences are stripped.

    Returns:
        - the parsed value itself when it is a JSON array;
        - for a JSON object, the first list found under the keys
          'data', 'rows', 'transactions' or 'records', otherwise the object
          wrapped in a one-element list;
        - the first ``[...]`` span found in the reply when the reply as a
          whole is not valid JSON;
        - ``[]`` in every other case, including any exception (which is
          printed, not raised).

    NOTE(review): a second function with this same name is defined further
    down in this file, so this definition is replaced at import time.
    """
    prompt = f"""
    You are a PDF-to-Excel converter. Convert the contents of this PDF page into structured tabular JSON.

    USER REQUIREMENTS:
    {custom_prompt}

    CRITICAL RULES:
    1. Return ONLY valid JSON array of objects.
    2. Each object represents one row with the EXACT columns specified above.
    3. Do NOT create any additional columns beyond what's explicitly requested.
    4. If a requested column cannot be found, leave it as empty string "".
    5. Column names must match EXACTLY as specified in user requirements.
    6. Combine "Taxpayer First Name" and "Taxpayer Last Name" into a new "Full Name" column.
    7. The "Full Name" column should be formatted as "First Last" (e.g., "PAUL ABRAHAMS").

    Formatting:
    - Dates: Use YYYY-MM-DD format
    - Numbers: Convert to floats where appropriate
    - Text: Keep as strings
    """

    try:
        reply = model.generate_content([prompt, page_image])

        # Drop surrounding whitespace and any markdown code fences.
        payload = reply.text.strip()
        for fence in ('```json', '```'):
            payload = payload.replace(fence, '')
        payload = payload.strip()

        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError:
            # The reply is not pure JSON: fall back to the outermost [...] span.
            bracketed = re.search(r'\[.*\]', payload, re.DOTALL)
            if bracketed:
                return json.loads(bracketed.group(0))
        else:
            if isinstance(parsed, list):
                return parsed
            if isinstance(parsed, dict):
                # Prefer a list nested under a well-known wrapper key.
                nested = next(
                    (
                        parsed[name]
                        for name in ('data', 'rows', 'transactions', 'records')
                        if name in parsed and isinstance(parsed[name], list)
                    ),
                    None,
                )
                # No wrapper key: treat the object as a single row.
                return [parsed] if nested is None else nested

    except Exception as e:
        print(f"⚠️ Page extraction error: {e}")

    return []


def convert_custom_pdf_to_excel(pdf_path, excel_path, api_key, custom_prompt, max_pages=None):
    """
    Convert a PDF into an Excel workbook using Gemini and a user-provided prompt.

    Every processed page is rendered to an image and handed to
    ``extract_custom_page_with_gemini``; the rows returned for all pages are
    collected into one DataFrame and written to a "Custom_Data" sheet. A
    second "Summary" sheet records the row/column counts, the first 100
    characters of the prompt and a completion timestamp.

    Args:
        pdf_path: Path of the PDF file to read.
        excel_path: Path of the Excel file to create (written with openpyxl).
        api_key: API key passed to ``genai.configure``.
        custom_prompt: User instructions forwarded to the page extractor.
        max_pages: Optional page cap. When truthy and smaller than the page
            count, only the first ``max_pages`` pages are processed.

    Returns:
        bool: True when the workbook was written. False when the PDF does not
        exist, page processing raised, no rows were extracted, or building or
        saving the workbook raised. Errors are printed, never raised.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ ERROR: File '{pdf_path}' not found.")
        return False

    print(f"📄 Processing {pdf_path} with custom prompt...")
    print(f"📝 Custom prompt: {custom_prompt}...")

    # Configure Gemini
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-2.5-pro")

    all_rows = []

    try:
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)

            # Application of page limit
            pages_to_process = pdf.pages
            if max_pages and total_pages > max_pages:
                print(f"⚠️ Page limit applied: only processing first {max_pages} pages.")
                pages_to_process = pdf.pages[:max_pages]
                total_pages = max_pages

            for i, page in enumerate(pages_to_process, start=1):
                print(f"🔍 Processing page {i}/{total_pages} with custom instructions...")

                # Convert page to image for Gemini Vision
                pil_img = page.to_image(resolution=200).original

                # Extract with custom prompt
                page_rows = extract_custom_page_with_gemini(pil_img, model, custom_prompt)

                # The extractor may hand back a bare JSON object or a scalar
                # instead of a list. Extending with a dict would append its
                # keys as rows, so wrap a dict as one row and drop anything
                # else that is not a list.
                if isinstance(page_rows, dict):
                    page_rows = [page_rows]
                elif not isinstance(page_rows, list):
                    page_rows = []

                if page_rows:
                    all_rows.extend(page_rows)
                    print(f"✅ Page {i}: extracted {len(page_rows)} rows")
                else:
                    print(f"⚠️ Page {i}: no structured data extracted")

                # Small delay to avoid rate limiting
                if i < total_pages:
                    time.sleep(1)

    except Exception as e:
        print(f"❌ PDF processing error: {e}")
        return False

    if not all_rows:
        print("❌ No data extracted from the PDF.")
        return False

    # Convert to DataFrame
    try:
        df = pd.DataFrame(all_rows)

        # Clean column names
        df.columns = [str(col).strip() for col in df.columns]

        date_keywords = ('date', 'time')
        numeric_keywords = ('amount', 'price', 'cost', 'debit', 'credit', 'balance')

        # Convert date columns if present. This is best effort: a column that
        # cannot be converted is left as it is, but the reason is reported
        # instead of being silently swallowed.
        for col in df.columns:
            if any(keyword in col.lower() for keyword in date_keywords):
                try:
                    df[col] = pd.to_datetime(df[col], errors='coerce').dt.strftime('%Y-%m-%d')
                except Exception as e:
                    print(f"⚠️ Could not convert date column '{col}': {e}")

        # Convert numeric columns, with the same best-effort policy as above.
        for col in df.columns:
            if any(keyword in col.lower() for keyword in numeric_keywords):
                try:
                    df[col] = pd.to_numeric(df[col], errors='coerce')
                except Exception as e:
                    print(f"⚠️ Could not convert numeric column '{col}': {e}")

        # Save to Excel
        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name="Custom_Data", index=False)

            # Add a summary sheet
            summary_data = {
                'Summary': [
                    f"Total rows extracted: {len(df)}",
                    f"Total columns: {len(df.columns)}",
                    f"Custom prompt used: {custom_prompt[:100]}...",
                    f"Extraction completed: {time.strftime('%Y-%m-%d %H:%M:%S')}"
                ]
            }
            pd.DataFrame(summary_data).to_excel(writer, sheet_name="Summary", index=False)

        print(f"\n🎉 SUCCESS: Saved custom extracted content to {excel_path}")
        print(f"📊 Extracted {len(df)} rows with {len(df.columns)} columns")

        # Show column names
        print("📋 Columns extracted:", list(df.columns))

        return True

    except Exception as e:
        print(f"❌ DataFrame/Excel error: {e}")
        return False


def extract_custom_page_with_gemini(page_image, model, custom_prompt, max_retries=3):
    """
    Sends a page image to Gemini with custom prompt for structured extraction.

    The user's requirements are embedded in a fixed instruction prompt and
    sent with the image through ``model.generate_content``. The reply text is
    stripped of markdown code fences and parsed as JSON; the result is always
    normalised to a list so the caller can safely extend its row list with it.

    Args:
        page_image: Image of the page, passed to the model next to the prompt.
        model: Object exposing ``generate_content(parts)`` whose result has a
            ``text`` attribute.
        custom_prompt: User requirements inserted into the prompt.
        max_retries: Maximum number of requests made for this page. A new
            request is made when the error text contains "504" (after a
            2 second pause) or when the reply contains no JSON at all.

    Returns:
        list: The parsed JSON array. For a JSON object, the first list found
        under 'data', 'rows', 'transactions' or 'records', otherwise the
        object wrapped in a one-element list. ``[]`` when the reply is a JSON
        scalar, when nothing could be parsed within ``max_retries`` attempts,
        or when a non-retryable error occurred (the error is printed).
    """
    prompt = f"""
    You are a PDF-to-Excel converter. Convert the contents of this PDF page into structured tabular JSON.

    USER REQUIREMENTS:
    {custom_prompt}

    CRITICAL RULES:
    1. Return ONLY valid JSON array of objects.
    2. Each object represents one row with the EXACT columns specified above.
    3. Do NOT create any additional columns beyond what's explicitly requested.
    4. If a requested column cannot be found, leave it as empty string "".
    5. Column names must match EXACTLY as specified in user requirements.
    6. Combine "Taxpayer First Name" and "Taxpayer Last Name" into a new "Full Name" column.
    7. The "Full Name" column should be formatted as "First Last" (e.g., "PAUL ABRAHAMS").

    Formatting:
    - Dates: Use YYYY-MM-DD format
    - Numbers: Convert to floats where appropriate
    - Text: Keep as strings
    """
    for attempt in range(max_retries):
        try:
            response = model.generate_content([prompt, page_image])
            raw_text = response.text.strip()

            # Clean the response
            raw_text = raw_text.replace('```json', '').replace('```', '').strip()

            # Try to parse JSON
            try:
                data = json.loads(raw_text)
            except json.JSONDecodeError:
                # Try to extract JSON from text
                match = re.search(r'\[.*\]', raw_text, re.DOTALL)
                if not match:
                    # Nothing that looks like a JSON array: ask again.
                    continue
                # A failure here propagates to the outer handler below.
                data = json.loads(match.group(0))

            # Normalise to a list of rows. Returning a dict or scalar would
            # corrupt the caller's ``all_rows.extend(...)``, because extending
            # with a dict appends its keys.
            if isinstance(data, list):
                return data
            if isinstance(data, dict):
                # Unwrap a list nested under a well-known wrapper key.
                for key in ('data', 'rows', 'transactions', 'records'):
                    if isinstance(data.get(key), list):
                        return data[key]
                # A single JSON object is treated as one row.
                return [data]
            # JSON scalars (number, string, null, bool) carry no rows.
            return []

        except Exception as e:
            if "504" in str(e) and attempt < max_retries - 1:
                print(f"⚠️ Timeout on attempt {attempt + 1}, retrying...")
                time.sleep(2)  # Wait before retry
                continue
            else:
                print(f"⚠️ Page extraction error: {e}")
                break

    return []

def extract_custom_pdf_to_excel(pdf_path, excel_path, api_key, custom_prompt, max_pages=None):
    """
    Flask-facing alias for ``convert_custom_pdf_to_excel``.

    Forwards every argument unchanged and returns whatever the converter
    returns.
    """
    outcome = convert_custom_pdf_to_excel(
        pdf_path,
        excel_path,
        api_key,
        custom_prompt,
        max_pages=max_pages,
    )
    return outcome