"""Analyse tokenised input and provide lexc suggestions for missing words.

Make a suggestion for a missing word
    echo "word" | gtmissing.py -l sme

Make a suggestion for a multiword expression
    echo "multi word" | gtmissing.py -l sme

Make a suggestion for an unlexicalised compound or derivation
    echo "compoundword" | gtmissing.py -l sme

Make suggestions for a whole corpus, save it to a file
    gtmissing.py \\
        -l sme \\
        --input sme-tokenised-corpus-words.txt \\
        --output missing_sme_corpus.lexc
"""

import logging
import os
import re
import subprocess
import sys
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Iterator, Optional

from giellaltlextools.hfst import load_hfst  # type: ignore


@dataclass
class LexcEntry:
    stem: str
    tags: list[str]
    lower: str
    contlex: str
    filename: str
    parent_lexicon: str

    def __str__(self) -> str:
        tags = "+".join(self.tags)
        return (
            f"{self.stem.replace(' ', '% ')}{'+' if tags else ''}{tags}:"
            f"{self.lower.replace(' ', '% ')} {self.contlex} ; "
            f"! {self.filename} {self.parent_lexicon}"
        )


LEXC_LINE_RE = re.compile(
    r"""
    (?P<contlex>\S+)            #  any nonspace
    (?P<translation>\s+".*")?   #  optional translation, might be empty
    \s*;\s*                     #  skip space and semicolon
    (?P<comment>!.*)?           #  followed by an optional comment
    $
""",
    re.VERBOSE | re.UNICODE,
)

LEXC_CONTENT_RE = re.compile(
    r"""
    (?P<exclam>^\s*!\s*)?          #  optional comment
    (?P<content>(<.+>)|(.+))?      #  optional content
""",
    re.VERBOSE | re.UNICODE,
)


def parse_line(
    old_match: dict[str, str], lexc_filename: str, lexicon_name: str
) -> Optional[LexcEntry]:
    """Parse a lexc line.

    Arguments:
        old_match:

    Returns:
        The entries inside the lexc line expressed as a dict
    """
    line = old_match.get("content")
    if not line:
        return None

    try:
        upper, lower = line.split(":")
    except ValueError:
        return None

    uppers = upper.split("+")

    return LexcEntry(
        stem=uppers[0],
        tags=uppers[1:],
        lower=lower.strip(),
        contlex=old_match.get("contlex", ""),
        filename=lexc_filename,
        parent_lexicon=lexicon_name,
    )


def make_lexc_entry(
    line: str, lexc_filename: str, lexicon_name: str
) -> Optional[dict[str, str]]:
    """Turn line into a dict using regexes.

    Args:
        line: The line to parse.

    Returns:
        The line as a dict.
    """
    match = LEXC_LINE_RE.search(line)

    if match:
        content = match.groupdict()
        match2 = LEXC_CONTENT_RE.match(LEXC_LINE_RE.sub("", line))
        if match2:
            content.update(match2.groupdict())
            return content

    return None


def line_to_lexicon_name(line: str) -> str:
    """Get the lexicon name from a line."""
    l_name = line.split(" ")[1]
    return l_name.strip()


def get_lexc_lines(lexc_file: Path) -> Iterable[str]:
    """Get lexc lines from a file."""
    return (
        line
        for line in lexc_file.read_text().split("\n")
        if not line.startswith("!") or line.strip()
    )


def get_lexc_files(lang_directory: Path) -> Iterable[Path]:
    """Get lexc files from a directory."""
    morphology_directory = Path(lang_directory) / "src" / "fst" / "morphology"
    return (
        lexc_file
        for subdir in ["stems", "generated_files"]
        for lexc_file in (morphology_directory / subdir).glob("*.lexc")
    )


def handle_lexc_lines(
    lines: Iterable[str], lexc_filename: str
) -> Iterable[LexcEntry]:
    """Handle lexc lines from a file."""
    lexicon_name = None
    for line in lines:
        if line.startswith("LEXICON"):
            lexicon_name = line_to_lexicon_name(line.strip())
            continue
        if lexicon_name is not None:
            try:
                content = make_lexc_entry(line, lexc_filename, lexicon_name)
                if content is None:
                    continue
            except TypeError:
                print(f"Could not parse line {line}", file=sys.stderr)
                continue

            lexc_entry = parse_line(content, lexc_filename, lexicon_name)
            if lexc_entry is not None:
                yield lexc_entry


def read_lexc_files(lang_directory: Path) -> dict[str, list[LexcEntry]]:
    """Read lexc entries from a language file.

    Args:
        lang_directory: The directory to read from.

    Returns:
        A dictionary with the stems as keys and list of LexcEntries are values.
    """

    lexc_dict: dict[str, list[LexcEntry]] = defaultdict(list)
    for lexc_file in get_lexc_files(lang_directory):
        for lexc_entry in handle_lexc_lines(
            lines=get_lexc_lines(lexc_file), lexc_filename=lexc_file.name
        ):
            lexc_dict[lexc_entry.stem].append(lexc_entry)

    return lexc_dict


def parse_hfst_line(hfst_line: str) -> tuple[str, str]:
    """Parse a line from HFST output.

    Args:
        hfst_line: The line to parse.

    Returns:
        A tuple with the stem and the analysis.
    """
    number_of_hfst_fields = 3
    fields = hfst_line.split("\t")
    if len(fields) != number_of_hfst_fields:
        raise ValueError("Invalid HFST line: {}".format(hfst_line))

    return fields[0], fields[1]


def parse_hfst_output(
    lines: Iterable[str],
) -> dict[str, list[str]]:
    """Parse HFST output.

    Args:
        lines: The lines to parse.

    Returns:
        A dictionary with the stems as keys and the analyses as values.
    """
    result: dict[str, list[str]] = {}

    for line in lines:
        stem, analysis = parse_hfst_line(line)
        result.setdefault(stem, []).append(analysis)

    return result


def filter_derivations_and_compounds(
    parsed_hfst_output: dict[str, set[str]],
) -> dict[str, set[str]]:
    """Pick stems that are unlexicalised compounds or derivations.

    Args:
        parsed_hfst_output: The parsed hfst output.

    Returns:
        A dictionary with the stems as keys and the analyses as values.
        It contains only stems that are unlexicalised compounds or derivations.
    """
    return {
        stem: analyses
        for stem, analyses in parsed_hfst_output.items()
        if all(
            "+Cmp#" in analysis or "+Der" in analysis for analysis in analyses
        )
    }


def analyse_expressions(fst: Path, lines: Iterable[str]) -> list[str]:
    """Analyse a list of expressions using a HFST FST.

    Args:
        fst: The path to the HFST FST.
        lines: The expressions to analyse.

    Returns:
        The analyses of the expressions.
    """
    command = ["hfst-lookup", fst.as_posix()]
    result = subprocess.run(
        command,
        capture_output=True,
        check=False,
        input="\n".join(lines).encode("utf-8"),
    )
    return [
        line.strip()
        for line in result.stdout.decode("utf-8").split("\n")
        if line.strip()
    ]


def pyhfst_analyse_expressions(
    fst: Path, lines: Iterable[str]
) -> Iterator[tuple[str, set[str]]]:
    """Analyse a list of expressions using a HFST FST with pyhfst.

    Args:
        fst: The path to the HFST FST.
        lines: The expressions to analyse.

    Returns:
        The analyses of the expressions.
    """
    analyser = load_hfst(fst.as_posix())  # type: ignore
    for line in lines:
        analyses = analyser.lookup(line.strip())  # type: ignore
        if analyses:
            yield line.strip(), {analysis[0] for analysis in analyses}  # type: ignore
        else:
            yield line.strip(), set()


def categorise_pyhfst_output(
    pyhfst_output: Iterator[tuple[str, set[str]]],
) -> tuple[dict[str, set[str]], set[str]]:
    """Categorise pyhfst output into a dictionary.

    Args:
        pyhfst_output: The output from pyhfst analyse_expressions.

    Returns:
        A dictionary with the stems as keys and the analyses as values.
    """
    analysed: dict[str, set[str]] = defaultdict(set)
    typos: set[str] = set()
    for stem, analyses in pyhfst_output:
        if analyses and stem not in analysed:
            analysed[stem] = analyses
        elif not analyses and stem not in typos:
            typos.add(stem)

    return analysed, typos


def get_longest_cmp_stem(suffix: str, analyses: set[str]) -> str:
    """Get the longest last compound stem from a list of analyses."""
    for analysis in analyses:
        logging.debug(f"{analysis=}")

    return max(
        [
            analysis.split("#")[-1].split("+")[0]
            for analysis in analyses
            if analysis.split("#")[-1].split("+")[0].endswith(suffix)
        ],
        key=len,
    )


def lexicalise_compound(
    unlexicalised_compound_stem: str,
    analyses: set[str],
    lexc_dict: dict[str, list[LexcEntry]],
) -> Iterator[LexcEntry]:
    """Lexicalise an unlexicalised compound stem.

    Args:
        unlexicalised_compound_stem: The unlexicalised compound stem.
        analyses: The analyses of the compound stem.
        lexc_dict: The lexc dictionary.

    Returns:
        An iterator of lexicalised lexc entries.
    """
    try:
        longest_last_stem = get_longest_cmp_stem(
            suffix=unlexicalised_compound_stem[-1], analyses=analyses
        )
    except ValueError:
        logging.debug(
            f"Could not find a compound stem for {unlexicalised_compound_stem}"
        )
        return iter([])

    if longest_last_stem not in lexc_dict:
        raise ValueError(f"Longest stem {longest_last_stem} not found in lexc")

    prefix = unlexicalised_compound_stem[
        : unlexicalised_compound_stem.find(longest_last_stem)
    ]
    matching_lexc_entries = lexc_dict.get(longest_last_stem, [])

    logging.debug(
        f"{prefix=} {unlexicalised_compound_stem=} {longest_last_stem=}"
    )
    return (
        LexcEntry(
            stem=f"{prefix}{longest_last_stem}",
            tags=entry.tags,
            lower=f"{prefix}#{entry.lower}",
            contlex=entry.contlex,
            filename=entry.filename,
            parent_lexicon=entry.parent_lexicon,
        )
        for entry in matching_lexc_entries
    )


def get_matching_lexc_stems(
    hfst_stem: str, lexc_stems: list[str]
) -> tuple[str, list[str]]:
    """Get the matching lexc stems for a HFST stem.

    Args:
        hfst_stem: The HFST stem.
        lexc_stems: The lexc stems, keys from the lexc dictionary.

    Returns:
        A tuple with the common ending and a list of the matching lexc stems.
    """
    for index in range(1, len(hfst_stem) - 3):
        ending = hfst_stem[index:]
        hits = [stem for stem in lexc_stems if stem.endswith(ending)]
        if hits:
            return ending, hits

    return "", []


def make_missing_lexc_entry(
    hfst_stem: str, common_ending: str, lexc_entry: LexcEntry
) -> LexcEntry:
    """Make a lexc entry for stem not found in the analyser.

    Args:
        hfst_stem: The HFST stem.
        common_ending: The common ending of the HFST and lexc stems.
        lexc_entry: The lexc entry.

    Returns:
        A modified version of the incoming lexc entry for the missing stem.
    """
    hfst_prefix = hfst_stem[: hfst_stem.find(common_ending)]
    old_prefix = lexc_entry.stem[: lexc_entry.stem.find(common_ending)]
    old_lower = lexc_entry.lower

    # Skip matching chars in old_prefix vs old_lower
    i, j = 0, 0
    while i < len(old_prefix) and j < len(old_lower):
        if old_prefix[i] == old_lower[j]:
            i += 1
            j += 1
        else:
            j += 1

    new_lower = hfst_prefix + old_lower[j:]
    logging.debug(
        f"{hfst_stem=} {common_ending=} {lexc_entry.stem=} {new_lower=}"
    )
    return LexcEntry(
        stem=hfst_stem,
        tags=lexc_entry.tags,
        lower=new_lower,
        contlex=lexc_entry.contlex,
        filename=lexc_entry.filename,
        parent_lexicon=lexc_entry.parent_lexicon,
    )


def get_shortest_matching_lexc_entries(
    lexc_entries: Iterable[LexcEntry],
) -> list[LexcEntry]:
    """Find lexc entries with the shortest stems.

    A stem may multiple contination lexicons and with multiple
    parent lexicons. We would like to present the linguist with the
    shortest stem for each unique combination of contlex and parent lexicon.

    Args:
        lexc_entries: The lexc entries that matches an ending.

    Returns:
        The lexc entries with the shortest stem for each unique combination
        of contlex and parent lexicon.
    """

    map_by_lexicons: dict[str, list[LexcEntry]] = {}
    for entry in lexc_entries:
        map_by_lexicons.setdefault(
            f"{entry.contlex}_{entry.parent_lexicon}", []
        ).append(entry)

    return [
        min(entries, key=lambda entry: len(entry.stem))
        for entries in map_by_lexicons.values()
    ]


def get_typos(descriptive_typos: dict[str, set[str]]) -> Iterator[str]:
    """Print typos with their analyses.

    Args:
        descriptive_typos: These are typos, since they are found in the
            descriptive analyser, but not in the normative analyser.
    """
    yield "!Typos\n"
    yield from (
        f"! {hfst_stem}\n"
        + "\n".join(f"!\t{analysis}" for analysis in analyses)
        + "\n"
        for hfst_stem, analyses in descriptive_typos.items()
    )


def get_lexicalised_compounds(
    lexc_dict: dict[str, list[LexcEntry]],
    compounds_and_derivations_only: dict[str, set[str]],
    comment_string: str,
) -> Iterator[str]:
    """Lexicalise compounds and derivations

    Present tentive lexc entries to the linguist.

    Args:
        lexc_dict: The lexc dictionary.
        compounds_and_derivations_only: The compounds and derivations that are
            not lexicalised.
    """
    yield "! Compounds and derivations only\n"
    yield from (
        str(lexc_entry) + comment_string
        for hfst_stem, analyses in compounds_and_derivations_only.items()
        for lexc_entry in lexicalise_compound(hfst_stem, analyses, lexc_dict)
    )


def get_typos_suggestions(
    lexc_dict: dict[str, list[LexcEntry]],
    missing_desc_words: set[str],
    comment_string: str,
) -> Iterator[str]:
    """Yield suggestions for missing words in the descriptive analyser.

    Match the missing words with the lexc dictionary and present tentive lexc
    entries to the linguist.

    Args:
        lexc_dict: The lexc dictionary.
        missing_desc_words: Words that are unknown to both the normative
            and descriptive analyser.
    """
    yield "! Suggestions for missing words\n"
    for desc_missing_word in sorted(missing_desc_words):
        common_ending, matching_lexc_stems = get_matching_lexc_stems(
            desc_missing_word, list(lexc_dict.keys())
        )

        lexc_entries = [
            make_missing_lexc_entry(
                desc_missing_word, common_ending, matching_entry
            )
            for matching_entry in get_shortest_matching_lexc_entries(
                [
                    lexc_entry
                    for stem in matching_lexc_stems
                    for lexc_entry in lexc_dict[stem]
                ]
            )
        ]

        lexc_strings = [
            str(lexc_entry)
            for lexc_entry in lexc_entries
            if str(lexc_entry).strip()
        ]
        if lexc_strings:
            yield f"! Suggestions for missing word: {desc_missing_word}"
            yield from (
                lexc_string + comment_string for lexc_string in lexc_strings
            )
            yield "\n"


def parse_args():
    parser = ArgumentParser(
        description=__doc__, formatter_class=RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "-i",
        "--input",
        default=sys.stdin,
        type=Path,
        dest="infile",
        help="source of analysis data",
    )
    parser.add_argument(
        "-l",
        "--language",
        required=True,
        help="The language to analyse. This should be the language code, "
        "e.g., 'sme' for Northern Sami.",
    )
    parser.add_argument(
        "-o",
        "--output",
        default=sys.stdout,
        type=Path,
        dest="outfile",
        help="output file",
    )
    parser.add_argument(
        "-t",
        "--no-typos",
        action="store_false",
        help="Do not print typos",
    )
    parser.add_argument(
        "-n",
        "--normative-fst",
        help="The path to the normative FST",
        default=None,
        type=Path,
    )
    parser.add_argument(
        "-d",
        "--descriptive-fst",
        help="The path to the descriptive FST",
        default=None,
        type=Path,
    )
    parser.add_argument(
        "-c",
        "--comment",
        help="A freestyle comment to add to the output",
        default="",
    )
    parser.add_argument(
        "-r",
        "--lang-root",
        help="The root of the language directory",
        default=None,
        type=Path,
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print debug information",
    )

    return parser.parse_args()


def get_language_parent(lang_root: Optional[str]) -> Path:
    if lang_root is None:
        lang_parent = os.getenv("GTLANGS")
        if not lang_parent:
            raise SystemExit("GTLANGS environment variable not set")
    else:
        lang_parent = lang_root

    lang_path = Path(lang_parent)
    if not lang_path.exists():
        raise SystemExit(f"Could not find the language directory {lang_path}")

    return lang_path


def get_analysers(
    normative_analyser: Optional[str],
    descriptive_analyser: Optional[str],
    lang_directory: Path,
    language: str,
) -> tuple[Path, Path]:
    if normative_analyser is not None and descriptive_analyser is not None:
        return Path(normative_analyser), Path(descriptive_analyser)

    for prefix in [
        lang_directory / "src/fst/",
        Path("/usr/local/share/giella/") / language,
        Path("/usr/share/giella/") / language,
    ]:
        logging.debug(f"Looking for analysers in {prefix}")
        normative_path = prefix / "analyser-gt-norm.hfstol"
        descriptive_path = prefix / "analyser-gt-desc.hfstol"

        if normative_path.exists() and descriptive_path.exists():
            return normative_path, descriptive_path

    raise SystemExit("Could not find the normative and descriptive analyser.")


def main():
    # Setup
    args = parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    lang_parent = get_language_parent(args.lang_root)
    lang_directory = lang_parent / f"lang-{args.language}"

    normative_analyser, descriptive_analyser = get_analysers(
        args.normative_fst, args.descriptive_fst, lang_directory, args.language
    )

    # Save output from the normative analyser.
    input_stream = (
        sys.stdin if args.infile == sys.stdin else args.infile.open()
    )
    norm_analysed, norm_typos = categorise_pyhfst_output(
        pyhfst_analyse_expressions(
            fst=normative_analyser,
            lines={line for line in input_stream if line.strip()},
        )
    )

    # The words that are missing in the normative analyser may be typos.
    # Sending those words through the descriptive analyser gives us a list of
    # typos and really unknown words.
    descriptive_analysed, descriptive_typos = categorise_pyhfst_output(
        pyhfst_analyse_expressions(fst=descriptive_analyser, lines=norm_typos)
    )

    if args.infile == sys.stdin:
        input_filename = ""
    else:
        f = str(args.infile.absolute()).replace(str(lang_parent), "$GTLANGS")
        input_filename = f" Inputfile: {f}"

    comment = f" Comment: {args.comment}" if args.comment else ""

    # Present the result of the analysis to the linguist.
    # The categories are:
    # 1. Suggestions for unlexicalised words and multiword expressions
    # 2. Suggestions for unlexicalised compounds and derivations
    # 3. Optionally, typos

    # The words unknown to both the normative and the descriptive analyser
    # are given as the second argument.

    compounds_and_derivations_only = filter_derivations_and_compounds(
        norm_analysed
    )
    if not (
        descriptive_typos
        or compounds_and_derivations_only
        or descriptive_analysed
    ):
        print("No missing words or unlexicalised compounds found.")
        sys.exit(0)

    # Read lexc files
    lexc_dict = read_lexc_files(lang_directory)

    output_stream = (
        sys.stdout if args.outfile == sys.stdout else args.outfile.open("w")
    )
    print(
        "\n".join(
            get_typos_suggestions(
                lexc_dict=lexc_dict,
                missing_desc_words=descriptive_typos,
                comment_string=comment + input_filename,
            )
        ),
        file=output_stream,
    )

    print(
        "\n".join(
            get_lexicalised_compounds(
                lexc_dict,
                compounds_and_derivations_only=compounds_and_derivations_only,
                comment_string=comment + input_filename,
            )
        ),
        file=output_stream,
    )
    if not args.no_typos:
        print(
            "\n".join(
                get_typos(
                    descriptive_typos=descriptive_analysed,
                )
            ),
            file=output_stream,
        )