from typing import Iterator
import sys
import re
from collections import defaultdict
from pathlib import Path
from subprocess import PIPE, run

import yaml
from corpustools.sentencedivider import make_sentences

from giellaltgramtools.gramchecker import check_paragraphs_in_parallel
from giellaltgramtools.grammar_error_annotated_sentence import (
    GrammarErrorAnnotatedSentence,
)


def archive_path_to_variant(archive_path: Path) -> str:
    """Convert archive path to language code.

    Args:
        archive_path: Path to the grammar checker archive.
    Returns:
        tuple[str, str]: The language code and archives variant name.
    """
    langs = {
        "se": "sme",
        "ga": "gle",
        "fo": "fao",
        "kl": "kal",
    }
    archive_lang = archive_path.stem
    return f"{langs.get(archive_lang, archive_lang)}gram"

def chunk_lines(max_lines=100)-> Iterator[bytes]:
    chunk = []
    for line in sys.stdin.buffer: 
        chunk.append(line)
        if len(chunk) >= max_lines:
            yield b"\n".join(chunk)
            chunk = []
    if chunk:
        yield b"\n".join(chunk)

def make_tokenised_output(input_bytes: bytes, lang_directory: Path) -> str:
    tokeniser = lang_directory / "tools/tokenisers/tokeniser-disamb-gt-desc.pmhfst"
    if not tokeniser.is_file():
        raise FileNotFoundError(f"Tokeniser not found at: {tokeniser}")
    command = ["hfst-tokenise", "--print-all", str(tokeniser)]
    result = run(
        command,
        input=input_bytes,
        stdout=PIPE,
        stderr=PIPE,
        check=True,
    )
    return result.stdout.decode("utf-8")


def classify_checker_result(result: GrammarErrorAnnotatedSentence) -> set[str]:
    """Classify the grammar checker result into error types.

    Args:
        result: A GrammarErrorAnnotatedSentence object containing the checker result.
    Returns:
        set[str]: A set of error types identified in the result.
    """
    if not result.errors or all(error.error_type == "typo" for error in result.errors):
        return {"generic"}

    return {error.error_type for error in result.errors if error.error_type != "typo"}

def gramcheck_candidates(
    archive_path: Path
) -> list[tuple[str, GrammarErrorAnnotatedSentence]]:
    """Run grammar checker on candidate file and return results."""

    sentences = make_sentences(
        "".join([make_tokenised_output(line, archive_path.parent.parent.parent) for line in chunk_lines()])
    )

    variant = archive_path_to_variant(archive_path)
    checker_results = check_paragraphs_in_parallel(
        command=f"divvun-checker --archive {archive_path} --variant {variant}",
        paragraphs=[
            sentence
            for sentence in sentences
            if not sentence.strip() or "......" not in sentence
        ],
    )

    return [
        (error_type, result)
        for result in checker_results
        for error_type in classify_checker_result(result)
    ]


def error_type_to_file_component(error_type: str) -> str:
    """Convert error type to a filename-safe component."""
    safe_name = re.sub(r"[^A-Za-z0-9._-]+", "-", error_type).strip("-")
    return safe_name or "unknown"


def create_yaml_candidates(
    candidate_prefix: str, archive_path: Path
) -> None:
    """Create candidate files for testing.

    Args:
        input_bytes: Input text as bytes.
        candidate_prefix: Prefix for the candidates file to be created.
        archive_path: Path to the grammar checker archive.
    """
    candidate_list = gramcheck_candidates(
        archive_path=archive_path,
    )

    candidates_by_type: dict[str, list[str]] = defaultdict(list)
    for error_type, result in candidate_list:
        candidates_by_type[error_type].append(result.to_manual_markup())

    if not candidates_by_type:
        print("No candidates found.")
        return

    candidate_directory = archive_path.parent / "tests" / "candidates"
    candidate_directory.mkdir(parents=True, exist_ok=True)

    variant = archive_path_to_variant(archive_path)
    spec_path = "../../pipespec.xml"

    for error_type, tests in sorted(candidates_by_type.items()):
        file_component = error_type_to_file_component(error_type)
        file_name = (
            f"{candidate_prefix}-{file_component}-FAIL.yaml"
            if candidate_prefix
            else f"{file_component}-FAIL.yaml"
        )
        candidate_file_path = candidate_directory / file_name

        yaml_content = yaml.safe_dump(
            {
                "Config": {
                    "Spec": spec_path,
                    "Variants": [variant],
                },
                "Tests": tests,
            },
            allow_unicode=True,
            indent=2,
            width=2000,
            sort_keys=False,
        )

        candidate_file_path.write_text(yaml_content)
        print(f"Candidate file created at: {candidate_file_path}")