"""Asociación automática heurística de documentos a paciente/caso."""

from __future__ import annotations

from difflib import SequenceMatcher
import re
from typing import Any

from app.batch_processing.domain.models import (
    ASSOCIATION_SOURCE_AUTO,
    AssociationDecision,
    AssociationResult,
    ExtractedSignals,
    FILE_STATUS_ASOCIADO,
    FILE_STATUS_PENDIENTE_VALIDACION,
)
from app.services.patient_name_extraction import (
    extract_patient_name_from_text,
    sanitize_patient_name_candidate,
)


# Numeric date token such as "05/03/2024" or "5-3-24": 1-2 digits, "/" or "-",
# 1-2 digits, "/" or "-", then 2-4 digits. The whole token is captured.
_DATE_PATTERN = re.compile(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b")
# Last-resort document-number matcher: a run of 5-12 digits (captured) that
# starts the fragment or follows whitespace, ":", "." or "-", optionally
# preceded by a document-type abbreviation (cc, ti, ce, dni, nit).
_DOCUMENT_NUMBER_PATTERN = re.compile(
    r"(?:^|[\s:.-])(?:cc|ti|ce|dni|nit)?\s*[-:]?\s*(\d{5,12})(?:\b|$)",
    re.IGNORECASE,
)
# Case number written on the same line as its label. The first pattern expects
# "label ... digits" (e.g. "Caso No. 12345"), the second "digits ... label".
# Both capture a run of 4-12 digits.
_CASE_INLINE_PATTERNS = (
    re.compile(
        r"(?:caso\s*(?:no\.?)?|no\.?\s*de\s*caso|n[°o]\s*caso)\s*[:#-]?\s*(\d{4,12})",
        re.IGNORECASE,
    ),
    re.compile(
        r"\b(\d{4,12})\s*(?:no\.?\s*de\s*caso|n[°o]\s*caso|caso\s*(?:no\.?)?|caso)\b",
        re.IGNORECASE,
    ),
)
# A line that ENDS with a case label (note the "$"); the number is then looked
# for on the following lines.
_CASE_LABEL_PATTERN = re.compile(
    r"(?:caso\s*(?:no\.?)?|no\.?\s*de\s*caso|n[°o]\s*caso)$",
    re.IGNORECASE,
)
# Cheap pre-filter: the fragment mentions a case label anywhere.
_CASE_HEADER_LABEL_PATTERN = re.compile(
    r"(?:caso\s*(?:no\.?)?|no\.?\s*de\s*caso|n[°o]\s*caso|caso)\b",
    re.IGNORECASE,
)
# "procedimiento" / "cirugía" / "intervención" followed by an optional ":" or
# "#" and 5-120 characters on the same line (captured).
_PROCEDURE_LINE_PATTERN = re.compile(
    r"(?:procedimiento|cirug[ií]a|intervenci[oó]n)\s*[:#]?\s*([^\n\r]{5,120})",
    re.IGNORECASE,
)
# Lower-case labels associated with a patient identification field.
# NOTE(review): not referenced by the code visible in this module
# (_line_has_patient_id_label hard-codes its own tokens) - confirm whether it
# is used elsewhere before removing.
_PATIENT_ID_LABEL_TOKENS = (
    "identificación",
    "identificacion",
    "tipo y n",
    "tipo y no",
    "tipo y nro",
    "documento",
    "cc",
    "cédula",
    "cedula",
)
# Lower-case section labels that introduce a procedure description; matched as
# substrings of the lower-cased line.
_PROCEDURE_DESCRIPTION_LABELS = (
    "procedimientos realizados",
    "descripción del procedimiento",
    "procedimiento realizado",
)

# Minimum clamped score of the best candidate for auto-association when not
# every anchor signal matched (see _can_auto_associate).
AUTO_ASSOCIATION_MIN_SCORE = 0.85
# Minimum lead of the best candidate's score over the second-best one.
AUTO_ASSOCIATION_MIN_GAP = 0.20
# Evidence labels treated as anchors: matching patient identification and
# matching case number.
REQUIRED_ANCHOR_SIGNALS = {"identificacion", "numero_caso"}

# Words discarded when guessing a patient name from a file name. Tokens are
# compared after _normalize_name, which lower-cases but keeps accented vowels,
# hence both accented and unaccented spellings are listed.
_FILENAME_IGNORE_TOKENS = {
    "historia",
    "clinica",
    "clínica",
    "factura",
    "cirugia",
    "cirugía",
    "quirurgico",
    "quirúrgico",
    "quirurgica",
    "quirúrgica",
    "documento",
    "doc",
    "archivo",
    "tipo",
    "egreso",
    "fecha",
    "telefono",
    "teléfono",
    "paciente",
    "usuario",
    "caso",
    "ingreso",
    "hc",
}


def _normalize_text(value: str) -> str:
    value = str(value or "").strip().lower()
    value = re.sub(r"\s+", " ", value)
    return value


def _normalize_name(value: str) -> str:
    """Return a comparison-friendly form of a person name.

    The value is normalized with ``_normalize_text``; every run of characters
    other than ``a-z``, digits, the accented vowels ``áéíóú``, ``ñ`` and the
    space is then replaced by a single space, and the result is trimmed.
    """
    allowed_only = re.sub(r"[^a-z0-9áéíóúñ ]+", " ", _normalize_text(value))
    return re.sub(r"\s+", " ", allowed_only).strip()


def _slug(value: str) -> str:
    """Return a hyphen-separated ASCII slug built from ``_normalize_name``.

    Every run of characters outside ``a-z0-9`` becomes one hyphen, and
    leading/trailing hyphens are removed. Accented letters are not
    transliterated: they fall outside ``a-z0-9`` and therefore act as
    separators.
    """
    hyphenated = re.sub(r"[^a-z0-9]+", "-", _normalize_name(value))
    return hyphenated.strip("-")


class HeuristicCaseAssociationService:
    """
    Extrae señales mínimas por documento y construye clústeres de caso por score.

    La heurística prioriza identificación y número de caso; el nombre del paciente
    y datos secundarios solo refinan el resultado.
    """

    def extract_signals(self, filename: str, text: str, detected_type: str) -> ExtractedSignals:
        evidence: list[str] = []
        date_match = _DATE_PATTERN.search(text)

        patient_name = self._pick_patient_name(filename, text)
        if patient_name:
            evidence.append("nombre_paciente")

        patient_id = self._extract_patient_id(text)
        if patient_id:
            evidence.append("identificacion")

        case_number = self._extract_case_number(text)
        if case_number:
            evidence.append("numero_caso")

        service_date = date_match.group(1) if date_match else ""
        if service_date:
            evidence.append("fecha_atencion")

        procedure_code = ""

        procedure_description = self._extract_procedure_description(text)
        if procedure_description:
            evidence.append("descripcion_procedimiento")

        if filename:
            evidence.append("nombre_archivo")

        return ExtractedSignals(
            patient_name=patient_name,
            patient_id=patient_id,
            case_number=case_number,
            service_date=service_date,
            procedure_code=procedure_code,
            procedure_description=procedure_description,
            evidence=evidence,
        )

    def associate(self, file_records: list[dict[str, Any]]) -> AssociationResult:
        """Asocia documentos a clústeres de caso o los deja pendientes de validación."""

        clusters: list[dict[str, Any]] = []
        decisions: list[AssociationDecision] = []

        ordered_files = sorted(
            file_records,
            key=lambda item: self._signal_strength(item),
            reverse=True,
        )

        for file_record in ordered_files:
            candidate_scores: list[dict[str, Any]] = []
            for cluster in clusters:
                score, evidence, breakdown = self._score(file_record, cluster)
                candidate_scores.append(
                    {
                        "cluster": cluster,
                        "score": score,
                        "evidence": evidence,
                        "score_breakdown": breakdown,
                    }
                )

            candidate_scores.sort(key=lambda item: item["score"], reverse=True)
            best_candidate = candidate_scores[0] if candidate_scores else None
            best_cluster = best_candidate["cluster"] if best_candidate else None
            best_score = float(best_candidate["score"]) if best_candidate else 0.0
            second_score = (
                float(candidate_scores[1]["score"]) if len(candidate_scores) > 1 else 0.0
            )
            best_evidence = list(best_candidate["evidence"]) if best_candidate else []
            best_breakdown = (
                dict(best_candidate["score_breakdown"]) if best_candidate else {}
            )
            top_candidates = [
                self._candidate_payload(item)
                for item in candidate_scores[:2]
            ]

            if (
                best_cluster
                and self._can_auto_associate(
                    best_score=best_score,
                    second_score=second_score,
                    evidence=best_evidence,
                )
            ):
                self._merge_into_cluster(best_cluster, file_record)
                decision = AssociationDecision(
                    file_id=file_record["_id"],
                    status=FILE_STATUS_ASOCIADO,
                    case_key=best_cluster["case_key"],
                    confidence=self._clamp_score(best_score),
                    evidence=best_evidence,
                    patient_name=file_record.get("patient_name", "")
                    or best_cluster.get("patient_name", ""),
                    patient_id=file_record.get("patient_id", "")
                    or best_cluster.get("patient_id", ""),
                    case_number=file_record.get("case_number", "")
                    or best_cluster.get("case_number", ""),
                    procedure_code=file_record.get("procedure_code", "")
                    or best_cluster.get("procedure_code", ""),
                    procedure_description=file_record.get("procedure_description", "")
                    or best_cluster.get("procedure_description", ""),
                    associated_user=file_record.get("associated_user", "")
                    or best_cluster.get("associated_user", ""),
                    association_source=ASSOCIATION_SOURCE_AUTO,
                    score_breakdown=best_breakdown,
                    top_candidates=top_candidates,
                )
                decisions.append(decision)
                best_cluster["file_ids"].append(file_record["_id"])
                continue

            if self._should_seed_cluster(
                file_record=file_record,
                best_cluster=best_cluster,
                evidence=best_evidence,
            ):
                cluster = self._build_cluster(file_record)
                seed_breakdown = self._seed_score_breakdown(file_record)
                seed_score = self._score_from_breakdown(seed_breakdown)
                seed_evidence = self._seed_evidence(file_record)
                clusters.append(cluster)
                decisions.append(
                    AssociationDecision(
                        file_id=file_record["_id"],
                        status=FILE_STATUS_ASOCIADO,
                        case_key=cluster["case_key"],
                        confidence=seed_score,
                        evidence=seed_evidence,
                        patient_name=file_record.get("patient_name", ""),
                        patient_id=file_record.get("patient_id", ""),
                        case_number=file_record.get("case_number", ""),
                        procedure_code=file_record.get("procedure_code", ""),
                        procedure_description=file_record.get(
                            "procedure_description", ""
                        ),
                        associated_user=self._build_associated_user(
                            file_record.get("patient_id", ""),
                            file_record.get("patient_name", ""),
                        ),
                        association_source=ASSOCIATION_SOURCE_AUTO,
                        score_breakdown=seed_breakdown,
                        top_candidates=[
                            {
                                "case_key": cluster["case_key"],
                                "score": seed_score,
                                "evidence": seed_evidence,
                                "score_breakdown": seed_breakdown,
                            }
                        ],
                    )
                )
                cluster["file_ids"].append(file_record["_id"])
                continue

            decisions.append(
                AssociationDecision(
                    file_id=file_record["_id"],
                    status=FILE_STATUS_PENDIENTE_VALIDACION,
                    confidence=self._clamp_score(best_score),
                    evidence=best_evidence
                    or file_record.get("evidence", [])
                    or ["nombre_archivo"],
                    patient_name=file_record.get("patient_name", ""),
                    patient_id=file_record.get("patient_id", ""),
                    case_number=file_record.get("case_number", ""),
                    procedure_code=file_record.get("procedure_code", ""),
                    procedure_description=file_record.get("procedure_description", ""),
                    associated_user=self._build_associated_user(
                        file_record.get("patient_id", ""),
                        file_record.get("patient_name", ""),
                    ),
                    association_source=ASSOCIATION_SOURCE_AUTO,
                    score_breakdown=best_breakdown,
                    top_candidates=top_candidates,
                )
            )

        cases = []
        for cluster in clusters:
            cases.append(
                {
                    "batch_id": cluster["batch_id"],
                    "case_key": cluster["case_key"],
                    "patient_name": cluster.get("patient_name", ""),
                    "patient_id": cluster.get("patient_id", ""),
                    "case_number": cluster.get("case_number", ""),
                    "service_date": cluster.get("service_date", ""),
                    "procedure_code": cluster.get("procedure_code", ""),
                    "procedure_description": cluster.get(
                        "procedure_description", ""
                    ),
                    "associated_user": cluster.get("associated_user", ""),
                    "file_ids": cluster.get("file_ids", []),
                }
            )
        return AssociationResult(decisions=decisions, cases=cases)

    def _pick_patient_name(self, filename: str, text: str) -> str:
        extracted_name = extract_patient_name_from_text(text)
        sanitized_extracted = sanitize_patient_name_candidate(extracted_name)
        if sanitized_extracted:
            return sanitized_extracted
        from_filename = self._name_candidate_from_filename(filename)
        if from_filename:
            return from_filename
        return ""

    def _signal_strength(self, file_record: dict[str, Any]) -> int:
        return sum(
            1
            for key in (
                "patient_id",
                "case_number",
                "patient_name",
                "service_date",
            )
            if file_record.get(key)
        )

    def _can_seed_cluster(self, file_record: dict[str, Any]) -> bool:
        return bool(file_record.get("patient_id") or file_record.get("case_number"))

    def _seed_evidence(self, file_record: dict[str, Any]) -> list[str]:
        evidence = []
        if file_record.get("patient_id"):
            evidence.append("identificacion")
        if file_record.get("case_number"):
            evidence.append("numero_caso")
        if file_record.get("patient_name"):
            evidence.append("nombre_paciente")
        return evidence or ["nombre_archivo"]

    def _build_cluster(self, file_record: dict[str, Any]) -> dict[str, Any]:
        case_key = self._build_case_key(file_record)
        return {
            "batch_id": file_record["batch_id"],
            "case_key": case_key,
            "patient_name": file_record.get("patient_name", ""),
            "patient_id": file_record.get("patient_id", ""),
            "case_number": file_record.get("case_number", ""),
            "service_date": file_record.get("service_date", ""),
            "procedure_code": "",
            "procedure_description": file_record.get("procedure_description", ""),
            "associated_user": self._build_associated_user(
                file_record.get("patient_id", ""),
                file_record.get("patient_name", ""),
            ),
            "file_ids": [],
        }

    def _merge_into_cluster(
        self,
        cluster: dict[str, Any],
        file_record: dict[str, Any],
    ) -> None:
        for key in (
            "patient_name",
            "patient_id",
            "case_number",
            "service_date",
            "procedure_description",
        ):
            if not cluster.get(key) and file_record.get(key):
                cluster[key] = file_record.get(key)
        if not cluster.get("associated_user"):
            cluster["associated_user"] = self._build_associated_user(
                cluster.get("patient_id", "") or file_record.get("patient_id", ""),
                cluster.get("patient_name", "")
                or file_record.get("patient_name", ""),
            )

    def _build_case_key(self, file_record: dict[str, Any]) -> str:
        parts = [
            _slug(file_record.get("patient_id", "")),
            _slug(file_record.get("case_number", "")),
            _slug(file_record.get("patient_name", "")),
        ]
        parts = [part for part in parts if part]
        return "-".join(parts[:3]) or _slug(file_record.get("original_name", "archivo"))

    def _score(
        self,
        file_record: dict[str, Any],
        cluster: dict[str, Any],
    ) -> tuple[float, list[str], dict[str, float]]:
        """Calcula el score acordado para decidir asociación automática segura."""

        evidence: list[str] = []
        breakdown: dict[str, float] = {}
        if (
            file_record.get("patient_id")
            and cluster.get("patient_id")
            and _normalize_text(file_record["patient_id"])
            == _normalize_text(cluster["patient_id"])
        ):
            evidence.append("identificacion")
            breakdown["identificacion"] = 0.55
        if (
            file_record.get("case_number")
            and cluster.get("case_number")
            and _normalize_text(file_record["case_number"])
            == _normalize_text(cluster["case_number"])
        ):
            evidence.append("numero_caso")
            breakdown["numero_caso"] = 0.25
        name_ratio = self._name_similarity(
            file_record.get("patient_name", ""),
            cluster.get("patient_name", ""),
        )
        if name_ratio >= 0.92:
            evidence.append("nombre_paciente")
            breakdown["nombre_paciente"] = 0.20
        if (
            file_record.get("service_date")
            and cluster.get("service_date")
            and file_record["service_date"] == cluster["service_date"]
        ):
            evidence.append("fecha_atencion")
            breakdown["fecha_atencion"] = 0.05
        normalized_breakdown = self._normalize_breakdown(breakdown)
        return self._score_from_breakdown(normalized_breakdown), evidence, normalized_breakdown

    def _seed_score_breakdown(self, file_record: dict[str, Any]) -> dict[str, float]:
        breakdown: dict[str, float] = {}
        if file_record.get("patient_id"):
            breakdown["identificacion"] = 0.55
        if file_record.get("case_number"):
            breakdown["numero_caso"] = 0.25
        if file_record.get("patient_name"):
            breakdown["nombre_paciente"] = 0.20
        return self._normalize_breakdown(breakdown)

    def _candidate_payload(self, candidate: dict[str, Any]) -> dict[str, Any]:
        cluster = candidate["cluster"]
        return {
            "case_key": cluster.get("case_key", ""),
            "score": self._clamp_score(candidate.get("score", 0.0)),
            "evidence": list(candidate.get("evidence", [])),
            "score_breakdown": dict(candidate.get("score_breakdown", {})),
            "patient_name": cluster.get("patient_name", ""),
            "patient_id": cluster.get("patient_id", ""),
            "procedure_code": "",
        }

    def _build_associated_user(self, patient_id: str, patient_name: str) -> str:
        if patient_id:
            return str(patient_id).strip()
        if patient_name:
            return re.sub(r"\s+", " ", str(patient_name).strip())
        return ""

    def _name_similarity(self, left: str, right: str) -> float:
        left_norm = _normalize_name(left)
        right_norm = _normalize_name(right)
        if not left_norm or not right_norm:
            return 0.0
        if left_norm == right_norm:
            return 1.0
        return SequenceMatcher(a=left_norm, b=right_norm).ratio()

    def _extract_patient_id(self, text: str) -> str:
        lines = self._clean_lines(text)
        for index, line in enumerate(lines):
            line_lower = line.lower()
            if "paciente:" in line_lower:
                candidate = self._extract_document_number_from_fragment(line)
                if candidate:
                    return candidate
            if not self._line_has_patient_id_label(line_lower):
                continue

            candidate = self._extract_document_number_from_fragment(line)
            if candidate:
                return candidate

            for probe in range(index + 1, min(index + 4, len(lines))):
                next_line = lines[probe]
                if self._line_has_phone_context(next_line):
                    continue
                candidate = self._extract_document_number_from_fragment(next_line)
                if candidate:
                    return candidate
        return ""

    def _extract_case_number(self, text: str) -> str:
        lines = self._clean_lines(text)
        for line in lines[:12]:
            candidate = self._extract_case_number_from_fragment(line)
            if candidate:
                return candidate

        for pattern in _CASE_INLINE_PATTERNS:
            match = pattern.search(text)
            if match:
                return match.group(1).strip()

        for index, line in enumerate(lines):
            if _CASE_LABEL_PATTERN.search(line):
                for probe in range(index + 1, min(index + 3, len(lines))):
                    number_match = re.search(r"\b(\d{4,12})\b", lines[probe])
                    if number_match:
                        return number_match.group(1)
        return ""

    def _extract_procedure_description(self, text: str) -> str:
        lines = self._clean_lines(text)
        for index, line in enumerate(lines):
            line_lower = line.lower()
            if any(label in line_lower for label in _PROCEDURE_DESCRIPTION_LABELS):
                inline_match = _PROCEDURE_LINE_PATTERN.search(line)
                if inline_match and ":" in line:
                    candidate = self._sanitize_procedure_description(
                        inline_match.group(1)
                    )
                    if candidate:
                        return candidate
                for probe in range(index + 1, min(index + 4, len(lines))):
                    candidate = self._sanitize_procedure_description(lines[probe])
                    if candidate:
                        return candidate
        return ""

    def _name_candidate_from_filename(self, filename: str) -> str:
        basename = re.sub(r"\.pdf$", "", str(filename or ""), flags=re.IGNORECASE)
        tokens = re.findall(r"[A-Za-zÁÉÍÓÚÑáéíóúñ]+", basename)
        cleaned_tokens = []
        for token in tokens:
            normalized = _normalize_name(token)
            if len(normalized) < 2:
                continue
            if normalized in _FILENAME_IGNORE_TOKENS:
                continue
            cleaned_tokens.append(token)
        if len(cleaned_tokens) < 2:
            return ""
        candidate = " ".join(cleaned_tokens[:4])
        return sanitize_patient_name_candidate(candidate)

    def _can_auto_associate(
        self,
        *,
        best_score: float,
        second_score: float,
        evidence: list[str],
    ) -> bool:
        score = self._clamp_score(best_score)
        gap = score - self._clamp_score(second_score)
        evidence_set = set(evidence)
        has_anchor_signal = any(signal in REQUIRED_ANCHOR_SIGNALS for signal in evidence_set)
        has_required_anchor_match = REQUIRED_ANCHOR_SIGNALS.issubset(evidence_set)
        if has_required_anchor_match and gap >= AUTO_ASSOCIATION_MIN_GAP:
            return True
        return (
            has_anchor_signal
            and score >= AUTO_ASSOCIATION_MIN_SCORE
            and gap >= AUTO_ASSOCIATION_MIN_GAP
        )

    def _should_seed_cluster(
        self,
        *,
        file_record: dict[str, Any],
        best_cluster: dict[str, Any] | None,
        evidence: list[str],
    ) -> bool:
        if not self._can_seed_cluster(file_record):
            return False
        if not best_cluster:
            return True

        anchor_evidence = {
            signal for signal in evidence if signal in REQUIRED_ANCHOR_SIGNALS
        }
        return not anchor_evidence

    def _normalize_breakdown(self, breakdown: dict[str, float]) -> dict[str, float]:
        positive = {
            key: float(value)
            for key, value in breakdown.items()
            if float(value) > 0.0
        }
        if not positive:
            return {}

        total = sum(positive.values())
        if total <= 1.0:
            return {key: round(value, 4) for key, value in positive.items()}

        factor = 1.0 / total
        normalized = {
            key: round(value * factor, 4)
            for key, value in positive.items()
        }
        normalized_total = sum(normalized.values())
        if normalized_total > 1.0:
            first_key = next(iter(normalized))
            normalized[first_key] = round(
                max(0.0, normalized[first_key] - (normalized_total - 1.0)),
                4,
            )
        return normalized

    def _score_from_breakdown(self, breakdown: dict[str, float]) -> float:
        return self._clamp_score(sum(breakdown.values()))

    def _clamp_score(self, value: float | int) -> float:
        try:
            numeric = float(value)
        except (TypeError, ValueError):
            return 0.0
        if numeric < 0.0:
            return 0.0
        if numeric > 1.0:
            return 1.0
        return round(numeric, 4)

    def _clean_lines(self, text: str) -> list[str]:
        return [
            re.sub(r"\s+", " ", line).strip()
            for line in text.splitlines()
            if re.sub(r"\s+", " ", line).strip()
        ]

    def _extract_document_number_from_fragment(self, fragment: str) -> str:
        if self._line_has_phone_context(fragment):
            return ""
        explicit_match = re.search(
            r"(?:cc|ti|ce|dni|nit)\s*[-:]?\s*(\d{5,12})",
            fragment,
            re.IGNORECASE,
        )
        if explicit_match:
            return explicit_match.group(1).strip()

        label_match = re.search(
            r"(?:identificaci[oó]n|documento)\D*(\d{5,12})",
            fragment,
            re.IGNORECASE,
        )
        if label_match:
            return label_match.group(1).strip()

        match = _DOCUMENT_NUMBER_PATTERN.search(fragment)
        if not match:
            return ""
        return match.group(1).strip()

    def _extract_case_number_from_fragment(self, fragment: str) -> str:
        normalized = re.sub(r"\s+", " ", str(fragment or "")).strip()
        if not normalized:
            return ""

        if not _CASE_HEADER_LABEL_PATTERN.search(normalized):
            return ""

        for pattern in _CASE_INLINE_PATTERNS:
            match = pattern.search(normalized)
            if match:
                return match.group(1).strip()

        return ""

    def _line_has_patient_id_label(self, line_lower: str) -> bool:
        if any(token in line_lower for token in ("identificación", "identificacion", "documento")):
            return True
        return bool(re.search(r"\bcc\b|\bc[ée]dula\b", line_lower))

    def _line_has_phone_context(self, value: str) -> bool:
        lowered = value.lower()
        return any(
            token in lowered
            for token in ("tel", "telefono", "teléfono", "cel", "fax", "contacto")
        )

    def _sanitize_procedure_description(self, value: str) -> str:
        candidate = re.sub(r"\s+", " ", str(value or "")).strip(" :-")
        if len(candidate) < 8:
            return ""
        lowered = candidate.lower()
        if lowered in {label.lower() for label in _PROCEDURE_DESCRIPTION_LABELS}:
            return ""
        if any(
            token in lowered
            for token in ("paciente quien", "estado salida", "registro de muestras")
        ):
            return ""
        return candidate
