"""
nfr/ingest/pipeline.py
===========================================================
Main Ingest Pipeline — ties all NFR components together
===========================================================
The pipeline processes a single :class:`~nfr.models.db.IngestRecord`
through every stage:

1. **Download verification**  — confirms temp file is present.
2. **Hash computation**        — SHA-256, stored on the ingest record.
3. **Deduplication check**     — skip if hash already in content_hashes.
4. **Project matching**        — uses ClickUp list/folder IDs (primary) or
                                  project-number tokens found in the ClickUp
                                  task name / filename (fallback).
5. **Sender classification**   — for email / external sources, determines
                                  the discipline subfolder in ``8- reçu``.
6. **Routing**                 — :class:`~nfr.routing.file_router.FileRouter`
                                  selects the destination subfolder.
7. **Write / review dispatch** — :class:`~nfr.nas.safe_writer.SafeNASWriter`
                                  writes to the NAS or the file is sent to
                                  the review queue.

Status transitions
------------------
pending → downloading → matching → routing → writing → done
                                                     ↘ review
                                                     ↘ duplicate
                                                     ↘ error
                                                     ↘ orphan_dumped

(The terminal branches can also be reached earlier: ``duplicate`` right
after hashing, ``review`` after project matching or on an unidentified
sender, ``error`` at any stage.)

Fail-closed design
------------------
Any unhandled exception in any stage sets ``status = 'error'`` and logs
the traceback.  The file is never written to the NAS on an unexpected error.

Secrets policy
--------------
No secrets are referenced here.  See env.example.
"""
from __future__ import annotations

import hashlib
import logging
import traceback
from datetime import date, datetime, timezone, timedelta
from zoneinfo import ZoneInfo
from pathlib import Path
from typing import Optional

from sqlalchemy import select
from sqlalchemy.orm import Session

from nfr.models.db import ContentHash, IngestRecord, Project, RoutingDecision
from nfr.nas.path_builder import sanitize_filename
from nfr.nas.safe_writer import SafeNASWriter
from nfr.routing.file_router import FileRouter
from nfr.settings import get_settings

logger = logging.getLogger(__name__)


class IngestPipeline:
    """
    Full ingest pipeline: download → hash → dedup → match → route → write/review.

    Usage (from a Celery task or direct call)::

        pipeline = IngestPipeline()
        pipeline.process(session=db_session, ingest_record=record)
        # ingest_record.status is updated in-place; commit is the caller's responsibility.

    Instances are safe to reuse across records: the per-record ClickUp task
    cache is reset at the start of every run (see :meth:`_run_pipeline`).
    """

    def __init__(self) -> None:
        self._settings = get_settings()
        self._writer   = SafeNASWriter()
        self._router   = FileRouter(rules_path=self._settings.ROUTING_RULES_FILE)
        # Per-record cache of the ClickUp task JSON.  Populated by
        # _match_project (Signal 3) or fetched lazily in _run_pipeline for
        # sender identification; reset at the start of every run.
        self._cached_task_data: dict | None = None

    # ── Public API ────────────────────────────────────────────────────────────

    def process(self, session: Session, ingest_record: IngestRecord) -> None:
        """
        Execute the full pipeline for *ingest_record*.

        All exceptions are caught — on failure the record's status is set to
        ``'error'`` and the exception detail is stored in
        ``ingest_record.error_message``.  Fail-closed: the file is never
        written to the NAS on an unexpected error.

        :param session:       SQLAlchemy sync session.  This method does NOT
                              commit — the caller is responsible for committing.
        :param ingest_record: The :class:`~nfr.models.db.IngestRecord` to process.
        """
        try:
            self._run_pipeline(session, ingest_record)
        except Exception as exc:  # pylint: disable=broad-except
            logger.error(
                "PIPELINE_ERROR  ingest_id=%s  ulid=%s  error=%s",
                ingest_record.id,
                ingest_record.ulid,
                exc,
                exc_info=True,
            )
            ingest_record.status = "error"
            # Keep only the tail of the traceback so the DB column stays small.
            ingest_record.error_message = (
                f"{type(exc).__name__}: {exc}\n"
                f"{traceback.format_exc()[-1000:]}"   # last 1000 chars only
            )
            ingest_record.processed_at = datetime.now(timezone.utc)

    # ── Internal pipeline ─────────────────────────────────────────────────────

    def _run_pipeline(
        self, session: Session, ingest_record: IngestRecord
    ) -> None:
        """Core pipeline — raises on unhandled errors (caught by :meth:`process`)."""

        # BUGFIX: reset per-record state.  Pipeline instances may be reused
        # for several records; without this reset, task data cached for a
        # *previous* record would be fed into sender identification whenever
        # the current record matches via Signal 1/2 (the lazy-fetch guard
        # below only fires when the cache is None).
        self._cached_task_data = None

        # ── Stage 1: Verify temp file ─────────────────────────────────────────
        ingest_record.status = "downloading"
        temp_path = self._resolve_temp_path(ingest_record)
        if temp_path is None or not temp_path.exists():
            ingest_record.status = "error"
            ingest_record.error_message = (
                "Temp file not found — cannot begin pipeline. "
                "File must be downloaded before calling process()."
            )
            return

        # ── Stage 2: Hash computation ─────────────────────────────────────────
        file_hash = self._compute_sha256(temp_path)
        ingest_record.content_hash_sha256 = file_hash
        logger.debug("HASH_COMPUTED  ulid=%s  hash=%s", ingest_record.ulid, file_hash)

        # ── Stage 3: Deduplication check ─────────────────────────────────────
        # A ContentHash row with a nas_path means the same bytes were already
        # written — mark this record as a duplicate and discard the temp copy.
        existing_hash = session.get(ContentHash, file_hash)
        if existing_hash and existing_hash.nas_path:
            logger.info(
                "DUPLICATE_DETECTED  ulid=%s  hash=%s  existing=%s",
                ingest_record.ulid, file_hash, existing_hash.nas_path,
            )
            self._try_emit_audit(
                session, "DUPLICATE_DETECTED", ingest_record,
                payload={
                    "hash": file_hash,
                    "existing_path": existing_hash.nas_path,
                    "existing_ingest_id": existing_hash.first_ingest_id,
                },
            )
            ingest_record.status = "duplicate"
            ingest_record.duplicate_of_id = existing_hash.first_ingest_id
            ingest_record.processed_at = datetime.now(timezone.utc)
            temp_path.unlink(missing_ok=True)
            return

        # ── Stage 4: Project matching ─────────────────────────────────────────
        ingest_record.status = "matching"
        project = self._match_project(session, ingest_record)
        if project is None:
            logger.info(
                "NO_PROJECT_MATCH  ulid=%s  filename=%s",
                ingest_record.ulid, ingest_record.original_filename,
            )
            self._send_to_review(
                session, ingest_record, "NO_PROJECT_MATCH",
                reason_detail="No project matched with sufficient confidence.",
            )
            return

        ingest_record.matched_project_id = project.id
        self._try_emit_audit(
            session, "PROJECT_MATCHED", ingest_record,
            project_id=project.id,
            payload={
                "project_number": project.project_number,
                "signal": ingest_record.match_signal,
                "confidence": float(ingest_record.match_confidence or 0),
            },
        )
        logger.info(
            "PROJECT_MATCHED  ulid=%s  project=%s  signal=%s",
            ingest_record.ulid, project.project_number, ingest_record.match_signal,
        )

        # ── Stage 5+6: Sender identification + routing ────────────────────────
        # v2: routing is purely sender-based.  All files → 8- reçu/{identifier}/
        # Use Eastern time for the reception date (CMI is in Quebec, UTC-5/UTC-4)
        _eastern = ZoneInfo("America/Toronto")
        reception_date: date = (
            ingest_record.email_received_at.astimezone(_eastern).date()
            if ingest_record.email_received_at
            else datetime.now(_eastern).date()
        )

        # If project was matched via Signal 1 or 2 (list_id / folder_id),
        # _cached_task_data is still None.  Fetch it now for sender ID.
        if self._cached_task_data is None and ingest_record.clickup_task_id:
            self._cached_task_data = self._fetch_task_data(
                ingest_record.clickup_task_id
            )

        ingest_record.status = "routing"
        clickup_context = self._build_clickup_context(
            ingest_record, task_data=self._cached_task_data,
        )

        routing_result = self._router.route(
            filename=ingest_record.original_filename,
            extension=Path(ingest_record.original_filename).suffix.lower(),
            ingest_source=ingest_record.ingest_source,
            clickup_context=clickup_context,
            email_sender=ingest_record.email_sender,
        )

        # Persist routing decision for auditability.
        decision = RoutingDecision(
            ingest_record_id=ingest_record.id,
            rules_evaluated=[],
            winning_rule=routing_result.signal_used,
            winning_layer="sender_identification",
            confidence_score=routing_result.confidence,
            destination_folder=routing_result.destination,
            ai_used=(routing_result.signal_used == "ai"),
        )
        session.add(decision)

        # v2: routing always returns a result (never None).
        # Low-confidence results still get written — the team renames later.
        # Only truly unidentified senders (conf=0.0) go to review.
        if routing_result.confidence == 0.0:
            logger.info(
                "UNIDENTIFIED_SENDER  ulid=%s  identifier=%s",
                ingest_record.ulid, routing_result.identifier,
            )
            # Still write to NAS under "_inconnu" but also flag for review
            self._send_to_review(
                session, ingest_record, "UNIDENTIFIED_SENDER",
                reason_detail=(
                    "No sender identification signal matched. "
                    f"File will be stored under '{routing_result.identifier}'."
                ),
                proposed_destination=routing_result.destination,
                proposed_project_id=project.id,
                proposed_confidence=routing_result.confidence,
            )
            # Do NOT return — still write the file

        # Build full path: 8- reçu/{identifier}/{YYYY-MM-DD}
        date_str = reception_date.strftime("%Y-%m-%d")
        destination_subfolder = f"{routing_result.destination}/{date_str}"

        ingest_record.routed_subfolder = destination_subfolder
        self._try_emit_audit(
            session, "ROUTE_DETERMINED", ingest_record,
            project_id=project.id,
            payload={
                "destination": destination_subfolder,
                "signal": routing_result.signal_used,
                "confidence": routing_result.confidence,
                "identifier": routing_result.identifier,
            },
        )
        logger.info(
            "ROUTE_DETERMINED  ulid=%s  dest=%s  identifier=%s  signal=%s  conf=%.2f",
            ingest_record.ulid,
            destination_subfolder,
            routing_result.identifier,
            routing_result.signal_used,
            routing_result.confidence,
        )

        # ── Stage 7: Write ────────────────────────────────────────────────────
        ingest_record.status = "writing"
        write_result = self._writer.write(
            session=session,
            ingest_record=ingest_record,
            project=project,
            destination_subfolder=destination_subfolder,
            temp_file_path=temp_path,
        )

        if write_result.skip_reason == "duplicate":
            ingest_record.status = "duplicate"
        elif write_result.skip_reason == "security_violation":
            ingest_record.status = "error"
            ingest_record.error_message = (
                f"Security violation: path traversal or boundary breach detected. "
                f"Intended path: {write_result.original_intended_path}"
            )
        elif write_result.skip_reason in (
            "extension_disallowed", "template_validation_failed"
        ):
            self._send_to_review(
                session, ingest_record, write_result.skip_reason.upper(),
                reason_detail=f"Write rejected: {write_result.skip_reason}",
                proposed_destination=destination_subfolder,
                proposed_project_id=project.id,
            )
        elif write_result.success or write_result.dry_run:
            # Status is set by the writer to 'done' or 'orphan_dumped'
            pass
        else:
            # success is necessarily False here (True was consumed by the
            # previous branch), so the old `elif not write_result.success`
            # guard was redundant — a plain else is equivalent.
            ingest_record.status = "error"
            ingest_record.error_message = (
                f"NAS write failed: {write_result.skip_reason or 'unknown reason'}. "
                f"Intended path: {write_result.original_intended_path}"
            )

        ingest_record.processed_at = datetime.now(timezone.utc)

    # ── Project matching ──────────────────────────────────────────────────────

    def _match_project(
        self, session: Session, ingest_record: IngestRecord
    ) -> Optional[Project]:
        """
        Try to match *ingest_record* to a :class:`~nfr.models.db.Project`.

        Signal priority:
        1. ClickUp list ID (strongest — deterministic)
        2. ClickUp folder ID (medium)
        3. ClickUp task name contains project number (strong — tasks in
           shared lists like "Tâches en cours" always carry the project
           number in their title, e.g. ``[MASTER] 25848 - …``)
        4. Filename contains project number token (fallback)

        Side effects: sets ``match_signal`` / ``match_confidence`` on the
        record, and caches the fetched ClickUp task JSON on the instance
        for later sender identification.

        :returns: The matched :class:`Project` or ``None``.
        """
        # Signal 1: ClickUp list ID
        if ingest_record.clickup_list_id:
            project = session.scalar(
                select(Project).where(
                    Project.clickup_list_id == ingest_record.clickup_list_id,
                    Project.state == "active",
                )
            )
            if project:
                ingest_record.match_signal = "clickup_list_id"
                ingest_record.match_confidence = 1.00
                return project

        # Signal 2: ClickUp folder ID
        if ingest_record.clickup_folder_id:
            project = session.scalar(
                select(Project).where(
                    Project.clickup_folder_id == ingest_record.clickup_folder_id,
                    Project.state == "active",
                )
            )
            if project:
                ingest_record.match_signal = "clickup_folder_id"
                ingest_record.match_confidence = 0.90
                return project

        # Signal 3: ClickUp task name contains a project number
        # NOTE(review): plain substring matching can false-positive when one
        # project number is a prefix of another (e.g. "2584" in "25848") —
        # confirm project numbers are fixed-width before relying on this.
        task_data = self._fetch_task_data(ingest_record.clickup_task_id)
        self._cached_task_data = task_data  # cache for sender identification later
        task_name = task_data.get("name") if task_data else None
        if task_name:
            candidates = session.scalars(
                select(Project).where(Project.state == "active")
            ).all()
            task_name_lower = task_name.lower()
            for proj in candidates:
                if proj.project_number.lower() in task_name_lower:
                    ingest_record.match_signal = "clickup_task_name"
                    ingest_record.match_confidence = 0.95
                    return proj

        # Signal 4: Filename contains a project number token
        filename_lower = ingest_record.original_filename.lower()
        candidates_fn = session.scalars(
            select(Project).where(Project.state == "active")
        ).all()
        for proj in candidates_fn:
            if proj.project_number.lower() in filename_lower:
                ingest_record.match_signal = "filename_project_number"
                ingest_record.match_confidence = 0.85
                return proj

        return None

    @staticmethod
    def _fetch_task_data(task_id: str | None) -> dict | None:
        """Fetch task data (name, description, comments) from the ClickUp API.

        Returns the full task JSON dict, or None on any failure (missing
        task_id, network error, non-2xx response) — callers treat task data
        as best-effort, so this never raises.
        Used for both project matching (name) and sender identification
        (description contains the forwarded email with sender address).
        """
        if not task_id:
            return None
        try:
            # Local import avoids a hard dependency at module load time.
            from nfr.clickup.attachment_fetcher import (  # noqa: PLC0415
                _get_client,
            )
            with _get_client() as client:
                resp = client.get(f"/task/{task_id}")
                resp.raise_for_status()
                return resp.json()
        except Exception as exc:
            logger.warning(
                "Could not fetch task data for %s (non-fatal): %s",
                task_id, exc,
            )
            return None

    # ── ClickUp context builder ───────────────────────────────────────────────

    @staticmethod
    def _build_clickup_context(
        ingest_record: IngestRecord,
        task_data: dict | None = None,
    ) -> Optional[dict]:
        """Build a ClickUp context dict from the ingest record and API data.

        When *task_data* is provided (from :meth:`_fetch_task_data`), the
        task description is included so the sender identifier can extract
        the forwarded email address and signature from it.

        :returns: Context dict, or ``None`` when the record carries no
                  ClickUp identifiers at all.
        """
        if not any([
            ingest_record.clickup_task_id,
            ingest_record.clickup_list_id,
            ingest_record.clickup_folder_id,
        ]):
            return None

        description = ""
        task_name = ""
        if task_data:
            description = task_data.get("description", "") or ""
            # Also try text_content which is the plain-text version
            if not description:
                description = task_data.get("text_content", "") or ""
            task_name = task_data.get("name", "") or ""

        return {
            "task_id":     ingest_record.clickup_task_id,
            "list_id":     ingest_record.clickup_list_id,
            "folder_id":   ingest_record.clickup_folder_id,
            "task_name":   task_name,
            "list_name":   "",
            "folder_name": "",
            "tags":        [],
            "description": description,
        }

    # ── Review queue dispatch ─────────────────────────────────────────────────

    def _send_to_review(
        self,
        session: Session,
        ingest_record: IngestRecord,
        reason: str,
        *,
        reason_detail: Optional[str] = None,
        proposed_destination: Optional[str] = None,
        proposed_project_id: Optional[int] = None,
        proposed_confidence: Optional[float] = None,
    ) -> None:
        """Mark *ingest_record* for review and enqueue it via the review manager.

        Enqueue failure is non-fatal: the record still ends in ``review``
        status and the audit event is still emitted.
        """
        ingest_record.status = "review"
        ingest_record.processed_at = datetime.now(timezone.utc)

        try:
            from nfr.review import queue as review_queue
            review_queue.enqueue_review(
                session=session,
                ingest_record=ingest_record,
                reason=reason,
                reason_detail=reason_detail,
                proposed_destination=proposed_destination,
                proposed_project_id=proposed_project_id,
                proposed_confidence=proposed_confidence,
            )
        except Exception as exc:
            logger.warning("Review queue enqueue failed (non-fatal): %s", exc)

        self._try_emit_audit(
            session, "REVIEW_QUEUED", ingest_record,
            project_id=proposed_project_id,
            severity="WARN",
            payload={
                "reason": reason,
                "reason_detail": reason_detail,
                "proposed_destination": proposed_destination,
                "proposed_confidence": proposed_confidence,
            },
        )
        logger.info(
            "REVIEW_QUEUED  ulid=%s  reason=%s",
            ingest_record.ulid, reason,
        )

    # ── Temp path resolution ──────────────────────────────────────────────────

    def _resolve_temp_path(self, ingest_record: IngestRecord) -> Optional[Path]:
        """
        Resolve the expected temp file path for *ingest_record*.

        The temp filename convention (from spec §8 Step 6) is:
        ``{ulid}_{original_filename}.tmp``

        The current implementation always returns a path (existence is
        checked by the caller); the Optional return type is kept for
        interface stability.
        """
        staging_dir = self._settings.TEMP_STAGING_DIR
        safe_name   = sanitize_filename(ingest_record.original_filename)
        temp_name   = f"{ingest_record.ulid}_{safe_name}.tmp"
        return staging_dir / temp_name

    # ── SHA-256 helper ────────────────────────────────────────────────────────

    @staticmethod
    def _compute_sha256(path: Path) -> str:
        """Return the hex SHA-256 digest of *path*, read in 64 KiB chunks."""
        h = hashlib.sha256()
        with path.open("rb") as fh:
            for chunk in iter(lambda: fh.read(65536), b""):
                h.update(chunk)
        return h.hexdigest()

    # ── Audit helper ──────────────────────────────────────────────────────────

    @staticmethod
    def _try_emit_audit(
        session: Session,
        event_type: str,
        ingest_record: IngestRecord,
        *,
        project_id: Optional[int] = None,
        severity: str = "INFO",
        payload: Optional[dict] = None,
    ) -> None:
        """Attempt to emit an audit event, swallowing errors (best-effort)."""
        try:
            from nfr.audit import logger as audit_logger
            audit_logger.emit(
                event_type=event_type,
                component="pipeline",
                severity=severity,
                ingest_record_id=ingest_record.id,
                project_id=project_id,
                payload=payload or {},
                session=session,
            )
        except Exception as exc:
            logger.warning("Audit emit failed (non-fatal): %s", exc)
