#!/usr/bin/perl
# -*- mode: cperl; indent-tabs-mode: nil; tab-width: 3; cperl-indent-level: 3; -*-
use strict;
use warnings;
use utf8;
use FileHandle;
use IPC::Cmd qw(can_run);
use IPC::Open2;
use Digest::SHA qw(sha1_base64);
use FindBin qw($Bin);

BEGIN {
	$| = 1;
	binmode(STDIN, ':encoding(UTF-8)');
	binmode(STDOUT, ':encoding(UTF-8)');
}
use open qw( :encoding(UTF-8) :std );
use feature 'unicode_strings';

use Getopt::Long;
Getopt::Long::Configure('no_ignore_case');
my $opt_bin = '';
my $opt_fst = 0;
my $opt_verbose = 0;
my $opt_noquote_hyph = 0;
my $opt_noquote_raw = 0;
my $rop = GetOptions(
	'bin|b=s' => \$opt_bin,
	'fst|f' => \$opt_fst,
	'verbose|v' => \$opt_verbose,
	'no-quote-hyphen|H' => \$opt_noquote_hyph,
	'no-quote-raw|R' => \$opt_noquote_raw,
                    );

my @bins = ($opt_bin, can_run('hfst-tokenise'), '/usr/local/bin/hfst-tokenise', '/opt/local/bin/hfst-tokenise', '/usr/bin/hfst-tokenise');
my $bin = '';
foreach my $f (@bins) {
   if ($f && -x $f) {
      $bin = $f;
      last;
   }
}
if (!$bin || !-x $bin) {
   die("No usable hfst-tokenise given or found!\n");
}
if ($opt_verbose) {
   print STDERR "kal-tokenise: Using ${bin}\n";
}

my $prefix = '/usr';
my @fsts = (
   '',
   "${prefix}/share/giella/kal/tokeniser-disamb-gt-desc.pmhfst",
   '/usr/local/share/giella/kal/tokeniser-disamb-gt-desc.pmhfst',
   '/usr/share/giella/kal/tokeniser-disamb-gt-desc.pmhfst',
   "$Bin/../tokenisers/tokeniser-disamb-gt-desc.pmhfst",
   );
if (defined $ARGV[0] && -s $ARGV[0]) {
   $fsts[0] = $ARGV[0];
}
my $fst = '';
foreach my $f (@fsts) {
   if ($f && -s $f && -r $f) {
      $fst = $f;
      last;
   }
}

if (!$fst || !-s $fst) {
   die("No usable tokeniser.pmhfst given or found!\n");
}
if ($opt_verbose) {
   print STDERR "kal-tokenise: Using ${fst}\n";
}

my $t = "${bin} -L ${fst} | grep --line-buffered -v -F 'Use/Hybrid'";

# Prefixes from root.lexc ca. lines 14-15
my @p = qw(AA TA);

# Main POS tags from root.lexc ca. lines 19-29
my @m = qw(N V Pali Conj Adv Interj Pron Prop Num Symbol);

# Other POS tags from root.lexc ca. lines 114-180
my @a = qw(
   Sg Du Pl
   Abs Rel Trm Abl Lok Aeq Ins Via
   Nom Akk
   Ind Int Imp Opt Cau Con Par Cont ContNeg IteCau
   1Sg 2Sg 3Sg 4Sg 1Pl 2Pl 3Pl 4Pl 1Du 2Du 3Du 4Du
   1SgO 2SgO 3SgO 4SgO 1PlO 2PlO 3PlO 4PlO 1DuO 2DuO 3DuO 4DuO
   1SgPoss 2SgPoss 3SgPoss 4SgPoss 1PlPoss 2PlPoss 3PlPoss 4PlPoss
   );

# List of tags that should block Gram/TV and Gram/IV propagation
my @v_block = qw(
   NIQAR HTR NAR GUMINAR SIMA TARIAQAR
   UTE
   SAAR SAR TIP NIRAR SURE SUGE GASUGE GASURE NASUGE NASURE TSAALI TIR TITIR QQU
   );

my $pp = join('|', @p);
my $mp = join('|', @m);
my $ap = join('|', @a);
my $vb = join('|', @v_block);
my $i = 'i';

open2(*OUT, *IN, $t) or die $!;
binmode(OUT, ':encoding(UTF-8)');
binmode(IN, ':encoding(UTF-8)');
autoflush OUT 1;
autoflush IN 1;

my %cache = ();
my $puncts = '\x{2018}-\x{201F}\x{2039}\x{203A}\x{2E42}\x{3008}-\x{300B}\x{301D}-\x{301F}”“"\'´`‘’()»«';
my $ps_double = '[\x{2018}-\x{201B}\'´`‘’]{2,}|[\x{201C}-\x{201F}\x{2E42}\x{301D}-\x{301F}”“"]+';
my $ps_single_r = '\x{2018}-\x{201B}\'´`‘’';
my $ps_single = "[$ps_single_r]+";
my $ps_parens = '[()]';
my $ps_angle = '[\x{2039}\x{203A}\x{3008}-\x{300B}»«]';

sub call_tokenizer {
   my ($in) = @_;

   # Protect tags early, to avoid that the various workarounds mangle them
   if ($in =~ m@^</?\w+.*>@) {
      return $in;
   }

   # Turn very long ALL-UPPER strings to First Upper to avoid geometric slowdown
   while ($in =~ m@(\p{Lu}[\pP\pS\p{Lu}]{14,})@g) {
      my ($w,$t) = ($1, ucfirst(lc($1)));
      if ($w !~ m@\p{Lu}.*?\p{Lu}@) {
         # If there aren't multiple upper-case letters, assume it'll tokenize around symbols
         next;
      }
      $in =~ s@\Q$w\E@ \x{e001} $t@g;
   }

   # Hide extremely long words
   my %long_cache = ();
   while ($in =~ m@(\pL{100,})@) {
      my ($s,$w) = ($1,$1);
      utf8::encode($s); # sha1_base64() can't handle UTF-8 for some reason
      my $hash = sha1_base64($s);
      $hash =~ s/[^a-zA-Z0-9]/x/g;
      $long_cache{$w} = "LONG_$hash";
      $in =~ s@\Q$w\E@LONG_$hash@g;
   }

   # Turn URLs into placeholders to avoid endless loop
   my %url_cache = ();
   while ($in =~ m@((?:(ht|f)tp(s?)://)?[^\s\pZ/]+\.[a-z]{2,}/\S*)@) {
      my ($s,$u) = ($1,$1);
      utf8::encode($s); # sha1_base64() can't handle UTF-8 for some reason
      my $hash = sha1_base64($s);
      $hash =~ s/[^a-zA-Z0-9]/x/g;
      $url_cache{$u} = "URL_$hash";
      $in =~ s@\Q$u\E@URL_$hash@g;
      #print STDERR "URL $u -> $hash\n";
   }

   my @qhyphs = ();

   # Turn "quoted with hyphen suffix"-imi into placeholders
   if (!$opt_noquote_hyph) {
      # Keep suffixes in line with affixes/propernouns.lexc lexicon ZcitationsformZ
      my @qs = split(m@([$puncts]+\p{Pd}i?(?:p|mut|mit|minngaannit|minngaanniit|miit|mi|tut|mik|kkut))\b@, $in);
      if (scalar(@qs) > 1) {
         for (my $i=1 ; $i<scalar(@qs) ; $i+=2) {
            my ($punct,$dash,$suffix) = ($qs[$i] =~ m@([$puncts]+)(\p{Pd})(i?(?:p|mut|mit|minngaannit|minngaanniit|miit|mi|tut|mik|kkut))\b@);

            my @accent = ();
            my $ps = "[$puncts]+";
            if ($punct =~ m@^$ps_double$@) {
               # Match double single-quotes with actual double-quotes
               $ps = $ps_double;
            }
            elsif ($punct =~ m@^$ps_single$@) {
               # Protect things like o'Malley d'Aix l'Étudiant 4'eren and possessive 's
               while ($qs[$i-1] =~ s@(\s[\pL\pN\pM]+)([$ps_single_r])([^$ps_single_r]+)$@$1\x{E000}$3@i) {
                  push(@accent, $2);
               }
               $ps = $ps_single;
            }
            elsif ($punct =~ m@^$ps_parens$@) {
               $ps = $ps_parens;
            }
            elsif ($punct =~ m@^$ps_angle$@) {
               # Various angle-brackets
               $ps = $ps_angle;
            }

            if ($qs[$i-1] =~ m@(.*)(^|[\s.,:;?()])([\pL\pN\pM]*$ps)([\pL\pN\pM\s!].*?)$@) {
               my ($pre,$ppunct,$quote) = ($1.$2, $3, $4);
               # Skip if the quoted part contains what looks like a sentence fragment
               if ($quote !~ m@\s[\p{Ll}\pN\pM]+\.\s\p{Lu}@ && scalar(() = ($quote =~ m@(\s+)@g)) < 20) {
                  my $n = scalar(@qhyphs);
                  $qs[$i-1] =~ s@(\Q$pre\E)\Q$ppunct$quote\E$@$1 QuotedHyphen$n-$suffix @;
                  for my $ip (@accent) {
                     $quote =~ s@\x{E000}@$ip@;
                  }
                  $qs[$i] = '';
                  push(@qhyphs, ["$ppunct$quote$punct", $dash, $suffix]);
                  #print STDERR "QHYPH $qhyphs[$n]->[0] $dash $suffix\n";
               }
               else {
                  #print STDERR "REJECTED $ppunct$quote$qs[$i]\n";
               }
            }
            else {
               #print STDERR "NOMATCH $qs[$i-1] $qs[$i]\n";
            }
            for my $ip (@accent) {
               $qs[$i-1] =~ s@\x{E000}@$ip@;
            }
         }
         $in = join('', @qs);
      }
   }

   if (!$opt_noquote_raw) {
=pod
      # If there is exactly one quote pair of a type, then we dare try handling it

      my $cnt_double = () = ($in =~ m@$ps_double@g);
      if ($cnt_double == 2) {
         # Turn "double-quoted without hyphen suffix" into placeholders
         while ($in =~ m@(^|\s|\pP)($ps_double)((?:\S+?\s+?){0,6}?\S[\pL\pN\pM]*?)($ps_double)(\s|\pP|$)@) {
            my ($pre,$pq,$q,$sq,$suf) = ($1,$2,$3,$4,$5);
            my $n = scalar(@qhyphs);
            $in =~ s@\Q$pq$q$sq\E@ QuotedHyphen$n @g;
            push(@qhyphs, ["$pq$q$sq", '', '']);
            #print STDERR "DQUOTE $qhyphs[$n]->[0]\n";
         }
      }

      my $cnt_single = () = ($in =~ m@[$ps_single_r]@g);
      if ($cnt_single == 2) {
         # Turn 'single-quoted without hyphen suffix' into placeholders
         while ($in =~ m@(^|\s|\pP)($ps_single)((?:\S+?\s+?){0,6}?\S[\pL\pN\pM]*?)($ps_single)(\s|\pP|$)@) {
            my ($pre,$pq,$q,$sq,$suf) = ($1,$2,$3,$4,$5);
            my $n = scalar(@qhyphs);
            $in =~ s@\Q$pq$q$sq\E@ QuotedHyphen$n @g;
            push(@qhyphs, ["$pq$q$sq", '', '']);
            #print STDERR "SQUOTE $qhyphs[$n]->[0]\n";
         }
      }

      # Turn (parentheticals) into placeholders
      while ($in =~ m@(^|\s|\pP)([(])((?:\S+?\s+?){0,6}?\S[\pL\pN\pM]*?)([)])(\s|\pP|$)@) {
         my ($pre,$pq,$q,$sq,$suf) = ($1,$2,$3,$4,$5);
         my $n = scalar(@qhyphs);
         $in =~ s@\Q$pq$q$sq\E@ QuotedHyphen$n @g;
         push(@qhyphs, ["$pq$q$sq", '', '']);
         #print STDERR "PQUOTE $qhyphs[$n]->[0]\n";
      }
=cut
   }

   # If the input consisted only of quotes, undo the placeholders
   if ($in =~ m@^(\s*QuotedHyphen\d+(?:-\S+)?)+\s*$@) {
      for (my $n=0 ; $n<scalar(@qhyphs) ; ++$n) {
         my ($q,$d,$s) = @{$qhyphs[$n]};
         $in =~ s@QuotedHyphen$n-@$q$d@g;
         $in =~ s@QuotedHyphen$n(\s|$)@$q$1@g;
      }
   }

   # Mark that this symbol did not have a space between it and the number
   $in =~ s@(^|\s)([-+]\d+)(?=(?:\s|\pP|$))@$1\x{e003}$2@g;
   $in =~ s@(^|\s)(\d+[:])(?=(?:\s|\pP|$))@$1\x{e003} $2@g;

   my $out = '';
   if (defined $cache{$in}) {
      $out = $cache{$in};
   }
   else {
      my $nonce = '<nonce-'.rand().'>';
      print IN "$in\n$nonce\n";
      while ($out !~ /\Q$nonce\E/) {
         $out .= <OUT>;
      }
      $out =~ s@\Q$nonce\E\s*@@sg;
      $out =~ s@ \?\?+@ ?@g;

      # Move the no-space marker onto the token after it
      $out =~ s@(\n?) *\x{e003} *\n("<.+?>"\n\t[^\n]+)@$1$2 <nospace>@g;
      $out =~ s@"<\x{e003}>"\n\t"\x{e003}" \?\n("<.+?>"\n\t[^\n]+)@$1 <nospace>@g;

      # Restore ALL-UPPER forms
      $out =~ s@\n?\x{e001} *\n"<([^\n]+?)>"@\n"<\U$1>"@g;
      $out =~ s@"<\x{e001}>"\n\t"\x{e001}" \?\n"<(.+?)>"@"<\U$1>"@g;

      # Restore URLs and give them the default analysis
      while (my ($u,$h) = each(%url_cache)) {
         $out =~ s@"<\Q$h\E>"@"<$u>"@g;
         $out =~ s@\t"\Q$h\E" \?@\t"$u" Sem/Url Prop Abs\n\t"$u" URL@g;
      }

      # Restore extremely long words
      while (my ($w,$h) = each(%long_cache)) {
         $out =~ s@\Q$h\E@$w@g;
      }

      for (my $n=0 ; $n<scalar(@qhyphs) ; ++$n) {
         my ($q,$d,$s) = @{$qhyphs[$n]};
         $q =~ s@[\s\pZ]+@\x{E020}@g;
         $out =~ s@"QuotedHyphen$n"( N [^\n]+)@"$q"$1 Heur/Quote@g;
         $out =~ s@<QuotedHyphen$n-@<$q$d@g;
         $out =~ s@<QuotedHyphen$n>@<$q>@g;
      }

      # Move prefixes after the base form
      $out =~ s@^(\s+)($pp) (".+?" )@$1$3Prefix/$2 @mg;

      # Special case +URL
      $out =~ s/"\+URL/" URL/g;

      # Remove empty lines, because they were simply spaces in the input
      # $out =~ s/\n([\s\t]*\n)+/\n/g; # HANDLE BETTER

      if ($in !~ /\s/) {
         $cache{$in} = $out;
      }
   }

   return $out;
}

sub parse_cohorts {
   my ($out) = @_;

   my @cs = ();
   my $in_c = 0;
   my $last = 0;
   # Parse hfst-tokenize's output to array of cohorts
   foreach (split(/\n+/, $out)) {
      my $f = $_;
      s/\s+$//g;

      if (/^"<.*?>"/) {
         # Start of a cohort
         if ($in_c) {
            ++$last;
         }
         $cs[$last][0] = $_;
         $cs[$last][1] = '';
         $in_c = 1;
      }
      elsif (/^\t+/ && $in_c) {
         # A reading
         $cs[$last][1] .= "$_\n";
      }
      else {
         # Neither cohort nor reading
         if ($in_c) {
            ++$last;
         }
         $cs[$last][0] = $_;
         $cs[$last][1] = '';
         if (!/^</ && $_) {
            # Not a tag, so give it a null analysis
            $cs[$last][0] = "\"<$_>\"";
            $cs[$last][1] = "\t\"$_\" ?\n";
         }
         ++$last;
         if ($f =~ /^[.]+([ \t]+)$/) {
            $cs[$last][0] = $1;
            $cs[$last][1] = '';
            ++$last;
         }
         $in_c = 0;
      }
   }

   return @cs;
}

sub analyze {
   my ($in) = @_;
   my $out = call_tokenizer($in);

   my @cs = parse_cohorts($out);

   my $did = 1;
   while ($did) {
      $did = 0;

      for (my $i=0 ; $i<scalar(@cs) ; ++$i) {
         # Skip empty lines and tags
         if (!$cs[$i]->[0] || $cs[$i]->[0] =~ /^</) {
            next;
         }

         # Work around HFST bug where abbreviations without space after are misanalyzed
         if (defined $cs[$i+1] && $cs[$i+1]->[0] =~ /^"<\.>"/ && ($cs[$i]->[1] =~ m@ Gram/Abbr\b@ || $cs[$i]->[1] =~ / \?/)) {
            my ($f) = ($cs[$i]->[0] =~ /^"<(.*)>"/);
            my $m = call_tokenizer("$f.");
            if ($m =~ /\t"/ && $m !~ / \?/ && $m =~ /^("<[^\n]*>"[^\n]*)\n(.+)$/s) {
               ($cs[$i]->[0], $cs[$i]->[1]) = ($1, $2);
               splice(@cs, $i+1, 1);
               $did = 1;
               last;
            }
         }

         # Merge sequential dots
         if ($cs[$i]->[0] =~ /^"<([.]+)>"/) {
            my $dots = $1;
            if (defined $cs[$i+1] && $cs[$i+1]->[0] =~ /^"<([.]+)>"/) {
               $dots .= $1;
               $cs[$i]->[0] =~ s/^"<[.]+>"/"<$dots>"/;
               $cs[$i]->[1] =~ s/(\s)"[.]+"/$1"$dots"/g;
               splice(@cs, $i+1, 1);
               $did = 1;
               last;
            }
         }

         # Try to merge sequential un-analyzed chunks
         if ($cs[$i]->[1] =~ / \?/) {
            my $j = $i;
            my $e = $i;
            my $len = 0;
            for ( ; $j<scalar(@cs) ; ++$j) {
               if ($cs[$j]->[0] !~ /^[\s\pZ]*$/ && $cs[$j]->[1] !~ / \?/) {
                  last;
               }
               # Avoid geometric slowdown
               if ($len + length($cs[$j]->[0]) - 4 >= 50) {
                  last;
               }
               $e = $j;
            }
            $j = $i;
            while ($j != $e) {
               my $combo = '';
               for ( ; $j<=$e ; ++$j) {
                  if ($cs[$j]->[0] =~ /^"<(.+)>"/) {
                     $combo .= $1;
                  }
               }
               my $m = call_tokenizer($combo);
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                  my @ncs = parse_cohorts($m);
                  splice(@cs, $i, $e - $i + 1, @ncs);
                  last;
               }
               $j = $i;
               --$e;
            }
         }

         if ($cs[$i]->[1] !~ / \?/) {
            next;
         }

         # Work around HFST bug by recursing un-analyzed chunks
         my ($f) = ($cs[$i]->[0] =~ /^"<(.*)>"/);
         my %heur = ();
         my $orig = '';

         my $merge_ncs = sub {
            my ($m) = @_;
            my @ncs = parse_cohorts($m);
            my $h = join(' ', sort(keys(%heur)));
            my @origs = split(/\x{e002}/, $orig);
            for (my $ni=0 ; $ni<scalar(@ncs) ; ++$ni) {
               if ($origs[$ni]) {
                  $ncs[$ni][0] =~ s/^("<.*>")/"<$origs[$ni]>"/;
               }
               if ($h) {
                  $ncs[$ni][1] =~ s/(\t".*"[^\n]+)/$1 $h/g;
               }
            }
            splice(@cs, $i, 1, @ncs);
            $did = 1;
         };

         while (42) {
            # If input has non-alphanumerics, try it again as-is - this catches punctuation
            if ($f =~ /[^\pL\pN\pM]/) {
               my $m = call_tokenizer($f);
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                  $merge_ncs->($m);
                  last;
               }
            }

            # Split non-alphanumeric from alphanumeric and retry them separately
            if ($f =~ /^([^\pL\pN\pM]+)(.+)$/ || $f =~ /^(.+?)([^\pL\pN\pM]+)$/) {
               my ($f1,$f2) = ($1, $2);
               my $m = call_tokenizer($f1).call_tokenizer($f2);
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                  $merge_ncs->($m);
                  last;
               }
            }

            # Split numeric from alphabetic and retry them separately
            if ($f =~ /^([^\pL\pM]+)(\pN+)$/ || $f =~ /^(\pN+?)([^\pL\pM]+)$/) {
               my ($f1,$f2) = ($1, $2);
               my $m = call_tokenizer($f1).call_tokenizer($f2);
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                  $merge_ncs->($m);
                  last;
               }
            }

            # If input has Ųų, try them as Øø
            if ($f =~ /[Ųų]/) {
               $orig = $orig || $f;
               $heur{'Heur/Ų'} = 1;
               $f =~ s/Ų/Ø/g;
               $f =~ s/ų/ø/g;
               my $m = call_tokenizer($f);
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                  $merge_ncs->($m);
                  last;
               }
            }

            # If input might be lacking a -, try with one
            if ($f =~ /^([\p{Lu}\pN\pM]+)(\P{Lu}+)$/ && $f !~ /-/) {
               $orig = $orig || $f;
               my $m = call_tokenizer("$1-$2");
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/ && $m !~ /^"<.*?>".*?\n"<.*?>"/s && $m !~ /(?:^|\n)[^\n<>"]+(?:\n|$)/s) {
                  $heur{'Heur/HyphenAdd'} = 1;
                  $merge_ncs->($m);
                  last;
               }
               $orig = undef;
            }

            # Split around hyphens and dashes
            if ($f =~ /^([\pL\pN\pM]+)([-\p{Pd}]+)(.+)$/) {
               $orig = $orig || $f;
               if ($orig =~ /^([\pL\pN\pM]+)([-\p{Pd}]+)(.+)$/) {
                  $orig = "$1\x{e002}$2\x{e002}$3";
               }
               my ($f1,$f2,$f3) = ($1, $2, $3);
               my ($m1,$m2,$m3) = (call_tokenizer($f1), call_tokenizer($f2), call_tokenizer($f3));
               if ($m1 =~ /\t"[^\n]*"[^?\n]+\n/ && $m3 =~ /\t"[^\n]*"[^?\n]+\n/) {
                  $merge_ncs->("$m1$m2$m3");
                  last;
               }
               $orig = undef;
            }

            # If input might be lacking a space, try with one
            if ($f =~ /^([\pL\pN\pM]+?)(\p{Lu}+[\p{Ll}\pN\pM]+)$/) {
               my ($f1,$f2) = ($1, $2);
               #print STDERR "DEBUG: $f1 $f2\n";
               my $m = call_tokenizer($f1).call_tokenizer($f2);
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                  $merge_ncs->($m);
                  last;
               }
            }

            # Try without hyphens or dashes, potentially injecting an 'i' instead
            if ($f =~ /[-\p{Pd}]/) {
               $orig = $orig || $f;
               my $nf = $f;
               $nf =~ s/[-\p{Pd}]+//g;
               my $m = call_tokenizer($nf);
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                  $f = $nf;
                  $heur{'Heur/HyphenRem'} = 1;
                  $merge_ncs->($m);
                  last;
               }
               $nf = $f;
               $nf =~ s/[-\p{Pd}]+/i/g;
               $m = call_tokenizer($nf);
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                  $f = $nf;
                  $heur{'Heur/HyphenRepI'} = 1;
                  $merge_ncs->($m);
                  last;
               }
               $orig = undef;
            }

            # If input doesn't have lower-case letters, try it as first-upper
            if ($f =~ /^[\pL\pN]/ && $f =~ /\p{Lu}/ && $f !~ /\p{Ll}/) {
               $orig = $orig || $f;
               $f = ucfirst(lc($f));
               if ($f ne $orig) {
                  $heur{'Heur/Case'} = 1;
                  my $m = call_tokenizer($f);
                  if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                     $merge_ncs->($m);
                     last;
                  }
                  next;
               }
               $orig = undef;
            }

            # If input is first-upper, try it as lower-case
            if ($f =~ /^\p{Lu}\P{Lu}+/ && $f =~ /\p{Ll}/) {
               $heur{'Heur/Case'} = 1;
               $orig = $orig || $f;
               $f = lc($f);
               my $m = call_tokenizer($f);
               if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                  $merge_ncs->($m);
                  last;
               }
               $orig = undef;
               next;
            }

            # If input doesn't have upper-case letters, try it as first-upper
            if ($f =~ /^[\pL\pN]/ && $f =~ /\p{Ll}/ && $f !~ /\p{Lu}/) {
               $orig = $orig || $f;
               $f = ucfirst(lc($f));
               if ($f ne $orig) {
                  $heur{'Heur/Case'} = 1;
                  my $m = call_tokenizer($f);
                  if ($m =~ /\t"[^\n]*"[^?\n]+\n/) {
                     $merge_ncs->($m);
                     last;
                  }
                  # This one should not recurse
               }
            }

            last;
         }
      }
   }

   $out = '';
   foreach my $c (@cs) {
      $out .= $c->[0]."\n";
      if ($c->[1]) {
         $out .= $c->[1];
      }
   }

   return $out;
}

sub propagate_tags {
   my ($c) = @_;

   if ($c !~ m@^\s+"@ || $c !~ m@Gram/[TI]V@) {
      return $c;
   }

   # Snip duplicate tags
   my $o = $c;
   do {
      $o = $c;
      $c =~ s@ (Gram/[HTI]V) \1@ $1@g;
      $c =~ s@ (Gram/[HTI]V) (Der/[nv][nv]) \1@ $2 $1@g;
   } while ($o ne $c);

   my @ts = split(/ (?=(?:(?:i?(?:$mp))|(?:\p{Lu}[_\p{Lu}]+)|U)(?: |$))/, $c);

   my $gram = '';
   foreach (@ts) {
      # Stop entirely at hybrid marker, to avoid mangling 2nd part of a hybrid
      if (m@ Gram/Hyb@) {
         last;
      }
      # Don't propagate past POS, blocking morphemes, Pass/Refl, or noun morphemes
      if (m/^(?:$mp)(?: |$)/ || m/^(?:$vb)(?: |$)/ || m@ Gram/(Pass|Refl)@ || m@ Der/[nv]n@) {
         $gram = '';
         next;
      }
      if (m@ (Gram/[TI]V)@) {
         $gram = $1;
      }
      elsif ($gram && m@ Der/[nv]v@) {
         $_ .= " $gram";
      }
   }

   return join(' ', @ts);
}

my $out = "\"<¶>\"\n\t\"¶\" CLB\n";
my $flushing = 0;
my $wf = '';
while (<STDIN>) {
   s/\x{feff}//g; # Unicode BOM
   s/\x{ad}//g; # Soft Hyphen

   if (/^<STREAMCMD:FLUSH>/) {
      # If a flush is encountered, prevent outputting ¶ cohorts until non-tag input is seen
      $flushing = 1;
      print;
      next;
   }

   if (scalar(keys(%cache)) > 10000) {
      %cache = ();
   }

   $out = "\"<¶>\"\n\t\"¶\" CLB\n";
   if ($flushing) {
      $out = $_;
   }
   if (!/^\s*$/) {
      if (!/^</) {
         $flushing = 0;
      }
      $out = analyze($_);
   }

   my @ls = split(/\n+/, $out, -1);
   foreach (@ls) {
      $_ = propagate_tags($_);

      if ($opt_fst) {
         if (m@^"<(.+)>"@) {
            # Store wordform for later
            $wf = $1;
            print "\n";
            next;
         }
         elsif (m@^\s+"([^\n]+?"?[^"]*)"@) {
            # Turn CG back into FST, protecting baseforms with spaces
            my $b = $1;
            s/^\s+//g;
            s/"\Q$b\E"/XBASEFORMX/;
            s/\s+/+/g;
            s@^(.+)\+Prefix/($pp)@$2+$1@;
            s/XBASEFORMX/$b/;
            # Prepend previously seen wordform
            $_ = "$wf\t$_";
         }
      }
      elsif (m@^\s+"@) {
         # Mark all internal POS tags so they don't confuse the later steps
         while (s@ ($mp|$ap)( .+ (?:$mp)(?: |$))@ $i$1$2@g) {}
      }
      print "$_\n";
   }
}

if ($out ne "\"<¶>\"\n\t\"¶\" CLB\n") {
   if ($opt_fst) {
      print "¶\t¶+CLB\n";
   }
   else {
      print "\"<¶>\"\n\t\"¶\" CLB\n";
   }
}