#!/usr/bin/perl # -*- mode: cperl; indent-tabs-mode: nil; tab-width: 3; cperl-indent-level: 3; -*- use strict; use warnings; use utf8; use FileHandle; use IPC::Cmd qw(can_run); use IPC::Open2; use Digest::SHA qw(sha1_base64); use FindBin qw($Bin); BEGIN { $| = 1; binmode(STDIN, ':encoding(UTF-8)'); binmode(STDOUT, ':encoding(UTF-8)'); } use open qw( :encoding(UTF-8) :std ); use feature 'unicode_strings'; use Getopt::Long; Getopt::Long::Configure('no_ignore_case'); my $opt_bin = ''; my $opt_fst = 0; my $opt_verbose = 0; my $opt_noquote_hyph = 0; my $opt_noquote_raw = 0; my $rop = GetOptions( 'bin|b=s' => \$opt_bin, 'fst|f' => \$opt_fst, 'verbose|v' => \$opt_verbose, 'no-quote-hyphen|H' => \$opt_noquote_hyph, 'no-quote-raw|R' => \$opt_noquote_raw, ); my @bins = ($opt_bin, can_run('hfst-tokenise'), '/usr/local/bin/hfst-tokenise', '/opt/local/bin/hfst-tokenise', '/usr/bin/hfst-tokenise'); my $bin = ''; foreach my $f (@bins) { if ($f && -x $f) { $bin = $f; last; } } if (!$bin || !-x $bin) { die("No usable hfst-tokenise given or found!\n"); } if ($opt_verbose) { print STDERR "kal-tokenise: Using ${bin}\n"; } my $prefix = '/usr'; my @fsts = ( '', "${prefix}/share/giella/kal/tokeniser-disamb-gt-desc.pmhfst", '/usr/local/share/giella/kal/tokeniser-disamb-gt-desc.pmhfst', '/usr/share/giella/kal/tokeniser-disamb-gt-desc.pmhfst', "$Bin/../tokenisers/tokeniser-disamb-gt-desc.pmhfst", ); if (defined $ARGV[0] && -s $ARGV[0]) { $fsts[0] = $ARGV[0]; } my $fst = ''; foreach my $f (@fsts) { if ($f && -s $f && -r $f) { $fst = $f; last; } } if (!$fst || !-s $fst) { die("No usable tokeniser.pmhfst given or found!\n"); } if ($opt_verbose) { print STDERR "kal-tokenise: Using ${fst}\n"; } my $t = "${bin} -L ${fst} | grep --line-buffered -v -F 'Use/Hybrid'"; # Prefixes from root.lexc ca. lines 14-15 my @p = qw(AA TA); # Main POS tags from root.lexc ca. lines 19-29 my @m = qw(N V Pali Conj Adv Interj Pron Prop Num Symbol); # Other POS tags from root.lexc ca. lines 114-180 my @a = qw( Sg Du Pl Abs Rel Trm Abl Lok Aeq Ins Via Nom Akk Ind Int Imp Opt Cau Con Par Cont ContNeg IteCau 1Sg 2Sg 3Sg 4Sg 1Pl 2Pl 3Pl 4Pl 1Du 2Du 3Du 4Du 1SgO 2SgO 3SgO 4SgO 1PlO 2PlO 3PlO 4PlO 1DuO 2DuO 3DuO 4DuO 1SgPoss 2SgPoss 3SgPoss 4SgPoss 1PlPoss 2PlPoss 3PlPoss 4PlPoss ); # List of tags that should block Gram/TV and Gram/IV propagation my @v_block = qw( NIQAR HTR NAR GUMINAR SIMA TARIAQAR UTE SAAR SAR TIP NIRAR SURE SUGE GASUGE GASURE NASUGE NASURE TSAALI TIR TITIR QQU ); my $pp = join('|', @p); my $mp = join('|', @m); my $ap = join('|', @a); my $vb = join('|', @v_block); my $i = 'i'; open2(*OUT, *IN, $t) or die $!; binmode(OUT, ':encoding(UTF-8)'); binmode(IN, ':encoding(UTF-8)'); autoflush OUT 1; autoflush IN 1; my %cache = (); my $puncts = '\x{2018}-\x{201F}\x{2039}\x{203A}\x{2E42}\x{3008}-\x{300B}\x{301D}-\x{301F}”“"\'´`‘’()»«'; my $ps_double = '[\x{2018}-\x{201B}\'´`‘’]{2,}|[\x{201C}-\x{201F}\x{2E42}\x{301D}-\x{301F}”“"]+'; my $ps_single_r = '\x{2018}-\x{201B}\'´`‘’'; my $ps_single = "[$ps_single_r]+"; my $ps_parens = '[()]'; my $ps_angle = '[\x{2039}\x{203A}\x{3008}-\x{300B}»«]'; sub call_tokenizer { my ($in) = @_; # Protect tags early, to avoid that the various workarounds mangle them if ($in =~ m@^@) { return $in; } # Turn very long ALL-UPPER strings to First Upper to avoid geometric slowdown while ($in =~ m@(\p{Lu}[\pP\pS\p{Lu}]{14,})@g) { my ($w,$t) = ($1, ucfirst(lc($1))); if ($w !~ m@\p{Lu}.*?\p{Lu}@) { # If there aren't multiple upper-case letters, assume it'll tokenize around symbols next; } $in =~ s@\Q$w\E@ \x{e001} $t@g; } # Hide extremely long words my %long_cache = (); while ($in =~ m@(\pL{100,})@) { my ($s,$w) = ($1,$1); utf8::encode($s); # sha1_base64() can't handle UTF-8 for some reason my $hash = sha1_base64($s); $hash =~ s/[^a-zA-Z0-9]/x/g; $long_cache{$w} = "LONG_$hash"; $in =~ s@\Q$w\E@LONG_$hash@g; } # Turn URLs into placeholders to avoid endless loop my %url_cache = (); while ($in =~ m@((?:(ht|f)tp(s?)://)?[^\s\pZ/]+\.[a-z]{2,}/\S*)@) { my ($s,$u) = ($1,$1); utf8::encode($s); # sha1_base64() can't handle UTF-8 for some reason my $hash = sha1_base64($s); $hash =~ s/[^a-zA-Z0-9]/x/g; $url_cache{$u} = "URL_$hash"; $in =~ s@\Q$u\E@URL_$hash@g; #print STDERR "URL $u -> $hash\n"; } my @qhyphs = (); # Turn "quoted with hyphen suffix"-imi into placeholders if (!$opt_noquote_hyph) { # Keep suffixes in line with affixes/propernouns.lexc lexicon ZcitationsformZ my @qs = split(m@([$puncts]+\p{Pd}i?(?:p|mut|mit|minngaannit|minngaanniit|miit|mi|tut|mik|kkut))\b@, $in); if (scalar(@qs) > 1) { for (my $i=1 ; $i[0] $dash $suffix\n"; } else { #print STDERR "REJECTED $ppunct$quote$qs[$i]\n"; } } else { #print STDERR "NOMATCH $qs[$i-1] $qs[$i]\n"; } for my $ip (@accent) { $qs[$i-1] =~ s@\x{E000}@$ip@; } } $in = join('', @qs); } } if (!$opt_noquote_raw) { =pod # If there is exactly one quote pair of a type, then we dare try handling it my $cnt_double = () = ($in =~ m@$ps_double@g); if ($cnt_double == 2) { # Turn "double-quoted without hyphen suffix" into placeholders while ($in =~ m@(^|\s|\pP)($ps_double)((?:\S+?\s+?){0,6}?\S[\pL\pN\pM]*?)($ps_double)(\s|\pP|$)@) { my ($pre,$pq,$q,$sq,$suf) = ($1,$2,$3,$4,$5); my $n = scalar(@qhyphs); $in =~ s@\Q$pq$q$sq\E@ QuotedHyphen$n @g; push(@qhyphs, ["$pq$q$sq", '', '']); #print STDERR "DQUOTE $qhyphs[$n]->[0]\n"; } } my $cnt_single = () = ($in =~ m@[$ps_single_r]@g); if ($cnt_single == 2) { # Turn 'single-quoted without hyphen suffix' into placeholders while ($in =~ m@(^|\s|\pP)($ps_single)((?:\S+?\s+?){0,6}?\S[\pL\pN\pM]*?)($ps_single)(\s|\pP|$)@) { my ($pre,$pq,$q,$sq,$suf) = ($1,$2,$3,$4,$5); my $n = scalar(@qhyphs); $in =~ s@\Q$pq$q$sq\E@ QuotedHyphen$n @g; push(@qhyphs, ["$pq$q$sq", '', '']); #print STDERR "SQUOTE $qhyphs[$n]->[0]\n"; } } # Turn (parentheticals) into placeholders while ($in =~ m@(^|\s|\pP)([(])((?:\S+?\s+?){0,6}?\S[\pL\pN\pM]*?)([)])(\s|\pP|$)@) { my ($pre,$pq,$q,$sq,$suf) = ($1,$2,$3,$4,$5); my $n = scalar(@qhyphs); $in =~ s@\Q$pq$q$sq\E@ QuotedHyphen$n @g; push(@qhyphs, ["$pq$q$sq", '', '']); #print STDERR "PQUOTE $qhyphs[$n]->[0]\n"; } =cut } # If the input consisted only of quotes, undo the placeholders if ($in =~ m@^(\s*QuotedHyphen\d+(?:-\S+)?)+\s*$@) { for (my $n=0 ; $n'; print IN "$in\n$nonce\n"; while ($out !~ /\Q$nonce\E/) { $out .= ; } $out =~ s@\Q$nonce\E\s*@@sg; $out =~ s@ \?\?+@ ?@g; # Move the no-space marker onto the token after it $out =~ s@(\n?) *\x{e003} *\n("<.+?>"\n\t[^\n]+)@$1$2 @g; $out =~ s@"<\x{e003}>"\n\t"\x{e003}" \?\n("<.+?>"\n\t[^\n]+)@$1 @g; # Restore ALL-UPPER forms $out =~ s@\n?\x{e001} *\n"<([^\n]+?)>"@\n"<\U$1>"@g; $out =~ s@"<\x{e001}>"\n\t"\x{e001}" \?\n"<(.+?)>"@"<\U$1>"@g; # Restore URLs and give them the default analysis while (my ($u,$h) = each(%url_cache)) { $out =~ s@"<\Q$h\E>"@"<$u>"@g; $out =~ s@\t"\Q$h\E" \?@\t"$u" Sem/Url Prop Abs\n\t"$u" URL@g; } # Restore extremely long words while (my ($w,$h) = each(%long_cache)) { $out =~ s@\Q$h\E@$w@g; } for (my $n=0 ; $n@<$q>@g; } # Move prefixes after the base form $out =~ s@^(\s+)($pp) (".+?" )@$1$3Prefix/$2 @mg; # Special case +URL $out =~ s/"\+URL/" URL/g; # Remove empty lines, because they were simply spaces in the input # $out =~ s/\n([\s\t]*\n)+/\n/g; # HANDLE BETTER if ($in !~ /\s/) { $cache{$in} = $out; } } return $out; } sub parse_cohorts { my ($out) = @_; my @cs = (); my $in_c = 0; my $last = 0; # Parse hfst-tokenize's output to array of cohorts foreach (split(/\n+/, $out)) { my $f = $_; s/\s+$//g; if (/^"<.*?>"/) { # Start of a cohort if ($in_c) { ++$last; } $cs[$last][0] = $_; $cs[$last][1] = ''; $in_c = 1; } elsif (/^\t+/ && $in_c) { # A reading $cs[$last][1] .= "$_\n"; } else { # Neither cohort nor reading if ($in_c) { ++$last; } $cs[$last][0] = $_; $cs[$last][1] = ''; if (!/^\""; $cs[$last][1] = "\t\"$_\" ?\n"; } ++$last; if ($f =~ /^[.]+([ \t]+)$/) { $cs[$last][0] = $1; $cs[$last][1] = ''; ++$last; } $in_c = 0; } } return @cs; } sub analyze { my ($in) = @_; my $out = call_tokenizer($in); my @cs = parse_cohorts($out); my $did = 1; while ($did) { $did = 0; for (my $i=0 ; $i[0] || $cs[$i]->[0] =~ /^[0] =~ /^"<\.>"/ && ($cs[$i]->[1] =~ m@ Gram/Abbr\b@ || $cs[$i]->[1] =~ / \?/)) { my ($f) = ($cs[$i]->[0] =~ /^"<(.*)>"/); my $m = call_tokenizer("$f."); if ($m =~ /\t"/ && $m !~ / \?/ && $m =~ /^("<[^\n]*>"[^\n]*)\n(.+)$/s) { ($cs[$i]->[0], $cs[$i]->[1]) = ($1, $2); splice(@cs, $i+1, 1); $did = 1; last; } } # Merge sequential dots if ($cs[$i]->[0] =~ /^"<([.]+)>"/) { my $dots = $1; if (defined $cs[$i+1] && $cs[$i+1]->[0] =~ /^"<([.]+)>"/) { $dots .= $1; $cs[$i]->[0] =~ s/^"<[.]+>"/"<$dots>"/; $cs[$i]->[1] =~ s/(\s)"[.]+"/$1"$dots"/g; splice(@cs, $i+1, 1); $did = 1; last; } } # Try to merge sequential un-analyzed chunks if ($cs[$i]->[1] =~ / \?/) { my $j = $i; my $e = $i; my $len = 0; for ( ; $j[0] !~ /^[\s\pZ]*$/ && $cs[$j]->[1] !~ / \?/) { last; } # Avoid geometric slowdown if ($len + length($cs[$j]->[0]) - 4 >= 50) { last; } $e = $j; } $j = $i; while ($j != $e) { my $combo = ''; for ( ; $j<=$e ; ++$j) { if ($cs[$j]->[0] =~ /^"<(.+)>"/) { $combo .= $1; } } my $m = call_tokenizer($combo); if ($m =~ /\t"[^\n]*"[^?\n]+\n/) { my @ncs = parse_cohorts($m); splice(@cs, $i, $e - $i + 1, @ncs); last; } $j = $i; --$e; } } if ($cs[$i]->[1] !~ / \?/) { next; } # Work around HFST bug by recursing un-analyzed chunks my ($f) = ($cs[$i]->[0] =~ /^"<(.*)>"/); my %heur = (); my $orig = ''; my $merge_ncs = sub { my ($m) = @_; my @ncs = parse_cohorts($m); my $h = join(' ', sort(keys(%heur))); my @origs = split(/\x{e002}/, $orig); for (my $ni=0 ; $ni")/"<$origs[$ni]>"/; } if ($h) { $ncs[$ni][1] =~ s/(\t".*"[^\n]+)/$1 $h/g; } } splice(@cs, $i, 1, @ncs); $did = 1; }; while (42) { # If input has non-alphanumerics, try it again as-is - this catches punctuation if ($f =~ /[^\pL\pN\pM]/) { my $m = call_tokenizer($f); if ($m =~ /\t"[^\n]*"[^?\n]+\n/) { $merge_ncs->($m); last; } } # Split non-alphanumeric from alphanumeric and retry them separately if ($f =~ /^([^\pL\pN\pM]+)(.+)$/ || $f =~ /^(.+?)([^\pL\pN\pM]+)$/) { my ($f1,$f2) = ($1, $2); my $m = call_tokenizer($f1).call_tokenizer($f2); if ($m =~ /\t"[^\n]*"[^?\n]+\n/) { $merge_ncs->($m); last; } } # Split numeric from alphabetic and retry them separately if ($f =~ /^([^\pL\pM]+)(\pN+)$/ || $f =~ /^(\pN+?)([^\pL\pM]+)$/) { my ($f1,$f2) = ($1, $2); my $m = call_tokenizer($f1).call_tokenizer($f2); if ($m =~ /\t"[^\n]*"[^?\n]+\n/) { $merge_ncs->($m); last; } } # If input has Ųų, try them as Øø if ($f =~ /[Ųų]/) { $orig = $orig || $f; $heur{'Heur/Ų'} = 1; $f =~ s/Ų/Ø/g; $f =~ s/ų/ø/g; my $m = call_tokenizer($f); if ($m =~ /\t"[^\n]*"[^?\n]+\n/) { $merge_ncs->($m); last; } } # If input might be lacking a -, try with one if ($f =~ /^([\p{Lu}\pN\pM]+)(\P{Lu}+)$/ && $f !~ /-/) { $orig = $orig || $f; my $m = call_tokenizer("$1-$2"); if ($m =~ /\t"[^\n]*"[^?\n]+\n/ && $m !~ /^"<.*?>".*?\n"<.*?>"/s && $m !~ /(?:^|\n)[^\n<>"]+(?:\n|$)/s) { $heur{'Heur/HyphenAdd'} = 1; $merge_ncs->($m); last; } $orig = undef; } # Split around hyphens and dashes if ($f =~ /^([\pL\pN\pM]+)([-\p{Pd}]+)(.+)$/) { $orig = $orig || $f; if ($orig =~ /^([\pL\pN\pM]+)([-\p{Pd}]+)(.+)$/) { $orig = "$1\x{e002}$2\x{e002}$3"; } my ($f1,$f2,$f3) = ($1, $2, $3); my ($m1,$m2,$m3) = (call_tokenizer($f1), call_tokenizer($f2), call_tokenizer($f3)); if ($m1 =~ /\t"[^\n]*"[^?\n]+\n/ && $m3 =~ /\t"[^\n]*"[^?\n]+\n/) { $merge_ncs->("$m1$m2$m3"); last; } $orig = undef; } # If input might be lacking a space, try with one if ($f =~ /^([\pL\pN\pM]+?)(\p{Lu}+[\p{Ll}\pN\pM]+)$/) { my ($f1,$f2) = ($1, $2); #print STDERR "DEBUG: $f1 $f2\n"; my $m = call_tokenizer($f1).call_tokenizer($f2); if ($m =~ /\t"[^\n]*"[^?\n]+\n/) { $merge_ncs->($m); last; } } # Try without hyphens or dashes, potentially injecting an 'i' instead if ($f =~ /[-\p{Pd}]/) { $orig = $orig || $f; my $nf = $f; $nf =~ s/[-\p{Pd}]+//g; my $m = call_tokenizer($nf); if ($m =~ /\t"[^\n]*"[^?\n]+\n/) { $f = $nf; $heur{'Heur/HyphenRem'} = 1; $merge_ncs->($m); last; } $nf = $f; $nf =~ s/[-\p{Pd}]+/i/g; $m = call_tokenizer($nf); if ($m =~ /\t"[^\n]*"[^?\n]+\n/) { $f = $nf; $heur{'Heur/HyphenRepI'} = 1; $merge_ncs->($m); last; } $orig = undef; } # If input doesn't have lower-case letters, try it as first-upper if ($f =~ /^[\pL\pN]/ && $f =~ /\p{Lu}/ && $f !~ /\p{Ll}/) { $orig = $orig || $f; $f = ucfirst(lc($f)); if ($f ne $orig) { $heur{'Heur/Case'} = 1; my $m = call_tokenizer($f); if ($m =~ /\t"[^\n]*"[^?\n]+\n/) { $merge_ncs->($m); last; } next; } $orig = undef; } # If input is first-upper, try it as lower-case if ($f =~ /^\p{Lu}\P{Lu}+/ && $f =~ /\p{Ll}/) { $heur{'Heur/Case'} = 1; $orig = $orig || $f; $f = lc($f); my $m = call_tokenizer($f); if ($m =~ /\t"[^\n]*"[^?\n]+\n/) { $merge_ncs->($m); last; } $orig = undef; next; } # If input doesn't have upper-case letters, try it as first-upper if ($f =~ /^[\pL\pN]/ && $f =~ /\p{Ll}/ && $f !~ /\p{Lu}/) { $orig = $orig || $f; $f = ucfirst(lc($f)); if ($f ne $orig) { $heur{'Heur/Case'} = 1; my $m = call_tokenizer($f); if ($m =~ /\t"[^\n]*"[^?\n]+\n/) { $merge_ncs->($m); last; } # This one should not recurse } } last; } } } $out = ''; foreach my $c (@cs) { $out .= $c->[0]."\n"; if ($c->[1]) { $out .= $c->[1]; } } return $out; } sub propagate_tags { my ($c) = @_; if ($c !~ m@^\s+"@ || $c !~ m@Gram/[TI]V@) { return $c; } # Snip duplicate tags my $o = $c; do { $o = $c; $c =~ s@ (Gram/[HTI]V) \1@ $1@g; $c =~ s@ (Gram/[HTI]V) (Der/[nv][nv]) \1@ $2 $1@g; } while ($o ne $c); my @ts = split(/ (?=(?:(?:i?(?:$mp))|(?:\p{Lu}[_\p{Lu}]+)|U)(?: |$))/, $c); my $gram = ''; foreach (@ts) { # Stop entirely at hybrid marker, to avoid mangling 2nd part of a hybrid if (m@ Gram/Hyb@) { last; } # Don't propagate past POS, blocking morphemes, Pass/Refl, or noun morphemes if (m/^(?:$mp)(?: |$)/ || m/^(?:$vb)(?: |$)/ || m@ Gram/(Pass|Refl)@ || m@ Der/[nv]n@) { $gram = ''; next; } if (m@ (Gram/[TI]V)@) { $gram = $1; } elsif ($gram && m@ Der/[nv]v@) { $_ .= " $gram"; } } return join(' ', @ts); } my $out = "\"<¶>\"\n\t\"¶\" CLB\n"; my $flushing = 0; my $wf = ''; while () { s/\x{feff}//g; # Unicode BOM s/\x{ad}//g; # Soft Hyphen if (/^/) { # If a flush is encountered, prevent outputting ¶ cohorts until non-tag input is seen $flushing = 1; print; next; } if (scalar(keys(%cache)) > 10000) { %cache = (); } $out = "\"<¶>\"\n\t\"¶\" CLB\n"; if ($flushing) { $out = $_; } if (!/^\s*$/) { if (!/^"@) { # Store wordform for later $wf = $1; print "\n"; next; } elsif (m@^\s+"([^\n]+?"?[^"]*)"@) { # Turn CG back into FST, protecting baseforms with spaces my $b = $1; s/^\s+//g; s/"\Q$b\E"/XBASEFORMX/; s/\s+/+/g; s@^(.+)\+Prefix/($pp)@$2+$1@; s/XBASEFORMX/$b/; # Prepend previously seen wordform $_ = "$wf\t$_"; } } elsif (m@^\s+"@) { # Mark all internal POS tags so they don't confuse the later steps while (s@ ($mp|$ap)( .+ (?:$mp)(?: |$))@ $i$1$2@g) {} } print "$_\n"; } } if ($out ne "\"<¶>\"\n\t\"¶\" CLB\n") { if ($opt_fst) { print "¶\t¶+CLB\n"; } else { print "\"<¶>\"\n\t\"¶\" CLB\n"; } }