# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this file. If not, see <http://www.gnu.org/licenses/>.
#
# Copyright © 2012-2023 The University of Tromsø &
# the Norwegian Sámi Parliament
# http://giellatekno.uit.no & http://divvun.no
#
"""This file contains classes fix converted documents."""
import os
import re
from copy import deepcopy
from lxml import etree
from corpustools import decode, util
HERE = os.path.dirname(__file__)
class DocumentFixer:
    """Fix the content of a Giella xml document.

    Receive a stringified etree from one of the raw converters,
    replace ligatures, fix the encoding and return an etree with correct
    characters
    """

    # Lines starting with these markers are news-agency layout tags
    # (logos, picture captions, ingresses …), not body text.
    # NOTE(review): the <pstyle:…> alternatives had been stripped from this
    # file by tag-removal, leaving empty "||" branches that made this regex
    # match *every* line; they are reconstructed here.
    newstags = re.compile(
        r"(@*logo:|[\s+\']*@*\s*ingres+[\.:]*|.*@*.*bilde\s*\d*:|\W*(@|"
        r"LED|bilde)*tekst:|@*foto:|@fotobyline:|@*bildetitt:|"
        r"<pstyle:bilde>|<pstyle:ingress>|<pstyle:tekst>|"
        r"@*Samleingress:*|tekst/ingress:|billedtekst:|.@tekst:)",
        re.IGNORECASE,
    )
    # Markers that flag a line as a (sub)title.
    titletags = re.compile(
        r"\s*@m.titt[\.:]|\s*@*stikk:|Mellomtittel:|@*(stikk\.*|"
        r"under)titt(el)*:|@ttt:|\s*@*[utm]*[:\.]*tit+:|<pstyle:m.titt>|"
        r"undertittel:",
        re.IGNORECASE,
    )
    # Markers that flag a line as the main (header) title.
    headertitletags = re.compile(
        r"(\s*@*(led)*tittel:|\s*@*titt(\s\d)*:|@LEDtitt:|"
        r"<pstyle:tittel>|@*(hoved|over)titt(el)*:)",
        re.IGNORECASE,
    )
    # Markers that flag a line as a byline (author name).
    # NOTE(review): the original pattern was mangled to the unterminated
    # group "(]*\s*(\S+:)*", which does not even compile; rebuilt here to
    # match a <byline …> tag followed by optional "Name:" tokens — confirm
    # against upstream corpustools.
    bylinetags = re.compile(
        r"(<byline[^>]*>\s*(\S+:)*)", re.UNICODE | re.IGNORECASE
    )
    # Marker for bold text.
    boldtags = re.compile(r"@bold\s*:")
    def __init__(self, document):
        """Initialise the DocumentFixer class.

        Args:
            document (etree.Element): the parsed Giella xml document to fix.
        """
        self.root = document
    def get_etree(self):
        """Get the root of the xml document.

        Returns:
            (etree.Element): the document handed to __init__, including any
                fixes applied since.
        """
        return self.root
    def compact_ems(self):
        """Compact consecutive em elements into a single em if possible.

        A run of adjacent <em> siblings whose tails contain no word
        characters is merged: the leading elements are removed and their
        texts joined into the last element of the run.
        """
        # A tail containing any word character keeps neighbouring <em>
        # elements apart.
        word = re.compile(r"\w+", re.UNICODE)
        for element in self.root.iter("p"):
            # Only paragraphs holding more than one <em> need work.
            if len(element.xpath(".//em")) > 1:
                lines = []
                for emphasis in element.iter("em"):
                    next_elt = emphasis.getnext()
                    if (
                        next_elt is not None
                        and next_elt.tag == "em"
                        and (emphasis.tail is None or not word.search(emphasis.tail))
                    ):
                        # Middle of a run: remember the text, drop the element.
                        if emphasis.text is not None:
                            lines.append(emphasis.text.strip())
                        emphasis.getparent().remove(emphasis)
                    else:
                        # End of a run: join all collected texts into this one.
                        if emphasis.text is not None:
                            lines.append(emphasis.text.strip())
                        emphasis.text = " ".join(lines)
                        if emphasis.tail is not None:
                            emphasis.tail = f" {emphasis.tail}"
                        del lines[:]
def soft_hyphen_to_hyph_tag(self):
"""Replace soft hyphen chars with hyphen tags."""
for element in self.root.iter("p"):
self.replace_shy(element)
def replace_shy(self, element):
"""Replace shy with a hyph element.
Args:
element (etree.Element): an etree element
"""
for child in element:
self.replace_shy(child)
text = element.text
if text is not None:
parts = text.split("")
if len(parts) > 1:
element.text = parts[0]
for index, part in enumerate(parts[1:]):
hyph = etree.Element("hyph")
hyph.tail = part
element.insert(index, hyph)
text = element.tail
if text is not None:
parts = text.split("")
if len(parts) > 1:
element.tail = parts[0]
for part in parts[1:]:
hyph = etree.Element("hyph")
hyph.tail = part
element.getparent().append(hyph)
def insert_spaces_after_semicolon(self):
"""Insert space after semicolon where needed."""
irritating_words_regex = re.compile(
"(govv(a|en|ejeaddji):)([^ ])", re.UNICODE | re.IGNORECASE
)
for child in self.root.find(".//body"):
self.insert_space_after_semicolon(child, irritating_words_regex)
def insert_space_after_semicolon(self, element, irritating_words_regex):
"""Insert space after words needing it.
Args:
element (etree.Element): an etree element
irritating_words_regex (re.Pattern): regex
"""
if element.text is not None:
element.text = irritating_words_regex.sub(r"\1 \3", element.text)
for child in element:
self.insert_space_after_semicolon(child, irritating_words_regex)
if element.tail is not None:
element.tail = irritating_words_regex.sub(r"\1 \3", element.tail)
def replace_ligatures(self):
"""Replace unwanted chars."""
replacements = {
"[dstrok]": "đ",
"[Dstrok]": "Đ",
"[tstrok]": "ŧ",
"[Tstrok]": "Ŧ",
"[scaron]": "š",
"[Scaron]": "Š",
"[zcaron]": "ž",
"[Zcaron]": "Ž",
"[ccaron]": "č",
"[Ccaron]": "Č",
"[eng": "ŋ",
" ]": "",
"Ď": "đ", # cough
"ď": "đ", # cough
"\x03": "",
"\x04": "",
"\x07": "",
"\x08": "",
"\x0F": "",
"\x10": "",
"\x11": "",
"\x13": "",
"\x14": "",
"\x15": "",
"\x17": "",
"\x18": "",
"\x1A": "",
"\x1B": "",
"\x1C": "",
"\x1D": "",
"\x1E": "",
"fi": "fi",
"fl": "fl",
"ff": "ff",
"ffi": "ffi",
"ffl": "ffl",
"ſt": "ft",
}
for element in self.root.iter("p"):
if element.text:
for key, value in replacements.items():
element.text = element.text.replace(key + " ", value)
element.text = element.text.replace(key, value)
def replace_bad_unicode(self):
"""Replace some chars in an otherwise 'valid utf-8' document.
These chars e.g. 'valid utf-8' (don't give UnicodeDecodeErrors), but
we still want to replace them to what they most likely were
meant to be.
:param content: a unicode string
:returns: a cleaned up unicode string
"""
# u'š'.encode('windows-1252') gives '\x9a', which sometimes
# appears in otherwise utf-8-encoded documents with the
# meaning 'š'
replacements = [
("\x9a", "š"),
("\x8a", "Š"),
("\x9e", "ž"),
("\x8e", "Ž"),
]
for element in self.root.iter("p"):
if element.text:
element.text = util.replace_all(replacements, element.text)
def fix_lang(self, element, lang):
"""Replace invalid accents with valid ones for the sms language."""
sms_space = re.compile(
r"(?P\s+)"
r"(?P[ʼʹ])", # MODIFIER LETTER APOSTROPHE,
# MODIFIER LETTER PRIME
re.UNICODE,
)
replacement_pairs = {
"sms": [
("\u2019", "\u02BC"), # RIGHT SINGLE QUOTATION MARK,
# MODIFIER LETTER APOSTROPHE
("\u0027", "\u02BC"), # apostrophe,
# MODIFIER LETTER APOSTROPHE
("\u2032", "\u02B9"), # PRIME, MODIFIER LETTER PRIME
("\u00B4", "\u02B9"), # ACUTE ACCENT,
# MODIFIER LETTER PRIME
("\u0301", "\u02BC"), # COMBINING ACUTE ACCENT,
# MODIFIER LETTER PRIME
],
"mns": [
("\uf50e", "А̄"), # CYRILLIC VOWELS WITH LENGTH MARK
("\uf50f", "а̄"),
("\uf510", "Е̄"),
("\uf511", "е̄"),
("\uf512", "Ё̄"), #
("\uf513", "ё̄"),
("\uf517", "О̄"), # 17? Just guessing
("\uf518", "О̄"), # CYRILLIC LONG CAPITAL O
("\uf519", "о̄"), # CYRILLIC LONG SMALL O
("\uf520", "Ы̄"), #
("\uf521", "ы̄"), #
("\uf522", "Э̄"),
("\uf523", "э̄"),
("\uf52c", "Ю̄"), #
("\uf52d", "ю̄"),
("\uf528", "Я̄"),
("\uf529", "я̄"),
],
}
if element.text:
element.text = util.replace_all(replacement_pairs[lang], element.text)
if lang == "sms":
element.text = sms_space.sub(r"\g", element.text)
if element.tail:
element.tail = util.replace_all(replacement_pairs[lang], element.tail)
if lang == "sms":
element.tail = sms_space.sub(r"\g", element.tail)
for child in element:
self.fix_lang(child, lang)
def fix_body_encoding(self, mainlang):
"""Replace wrongly encoded saami chars with proper ones.
Send a stringified version of the body into the EncodingGuesser class.
It returns the same version, but with fixed characters.
Parse the returned string, insert it into the document
"""
self.replace_ligatures()
body = self.root.find("body")
# Weird bug(?) in MacOS, the end tag of document lingers …
body_string = etree.tostring(body, encoding="unicode").replace(
"", ""
)
body.getparent().remove(body)
encoding = decode.guess_body_encoding(body_string, mainlang)
try:
body = etree.fromstring(decode.decode_para(encoding, body_string))
except UnicodeEncodeError as error:
raise UserWarning(str(error)) from error
self.root.append(body)
if mainlang in ["sms", "mns"]:
for paragraph in body.iter("p"):
self.fix_lang(paragraph, lang=mainlang)
def fix_title_person(self, encoding):
"""Fix encoding problems."""
title = self.root.find(".//title")
if title is not None and title.text is not None:
text = title.text
util.print_frame(encoding)
title.text = decode.decode_para(encoding, text)
persons = self.root.findall(".//person")
for person in persons:
if person is not None:
lastname = person.get("lastname")
if encoding == "mac-sami_to_latin1":
lastname = lastname.replace("‡", "á")
lastname = lastname.replace("Œ", "å")
person.set("lastname", decode.decode_para(encoding, lastname))
firstname = person.get("firstname")
if encoding == "mac-sami_to_latin1":
firstname = firstname.replace("‡", "á")
firstname = firstname.replace("Œ", "å")
person.set("firstname", decode.decode_para(encoding, firstname))
@staticmethod
def get_quote_list(text):
"""Get list of quotes from the given text.
Args:
text (str): string
Returns:
(list[tuple[int, int]]): A list of span tuples containing
indexes to quotes found in text.
"""
unwanted = r"[^:,!?.\s]"
quote_regexes = [
re.compile('"{0}.+?{0}"'.format(unwanted)),
re.compile("«.+?»"),
re.compile("“.+?”"),
re.compile("”{0}.+?{0}”".format(unwanted)),
]
quote_list = [
m.span()
for quote_regex in quote_regexes
for m in quote_regex.finditer(text)
]
quote_list.sort()
return quote_list
@staticmethod
def append_quotes(element, text, quote_list):
"""Append quotes to an element.
Args:
text (str): the plain text of the element.
quote_list (list of tuple of int): A list of span tuples containing
indexes to quotes found in text.
"""
for index in range(0, len(quote_list)):
span = etree.Element("span")
span.set("type", "quote")
span.text = text[quote_list[index][0] : quote_list[index][1]]
if index + 1 < len(quote_list):
span.tail = text[quote_list[index][1] : quote_list[index + 1][0]]
else:
span.tail = text[quote_list[index][1] :]
element.append(span)
    def _detect_quote(self, element):
        """Insert span elements around quotes.

        Works on a deep copy: the original element is emptied and rebuilt
        with <span type="quote"> children wrapped around detected quotes.

        Args:
            element (etree.Element): an etree element.

        Returns:
            (etree.Element): the rebuilt element.
        """
        newelement = deepcopy(element)
        # Empty the original element; it is repopulated from the copy below.
        element.text = ""
        for child in element:
            child.getparent().remove(child)
        text = newelement.text
        if text:
            quote_list = self.get_quote_list(text)
            if quote_list:
                # Text before the first quote stays as element text, the
                # quotes themselves become span children.
                element.text = text[0 : quote_list[0][0]]
                self.append_quotes(element, text, quote_list)
            else:
                element.text = text
        for child in newelement:
            if child.tag == "span" and child.get("type") == "quote":
                # Already a quote span; keep it as it is.
                element.append(child)
            else:
                element.append(self._detect_quote(child))
            if child.tail:
                text = child.tail
                quote_list = self.get_quote_list(text)
                if quote_list:
                    child.tail = text[0 : quote_list[0][0]]
                    # NOTE(review): quote spans found in a child's tail are
                    # appended to the parent, i.e. after any later children —
                    # confirm this ordering is intended.
                    self.append_quotes(element, text, quote_list)
        return element
def detect_quotes(self):
"""Detect quotes in all paragraphs."""
for paragraph in self.root.iter("p"):
paragraph = self._detect_quote(paragraph)
def calculate_wordcount(self):
"""Count the words in the file."""
plist = [
etree.tostring(paragraph, method="text", encoding="unicode")
for paragraph in self.root.iter("p")
]
return str(len(re.findall(r"\S+", " ".join(plist))))
@staticmethod
def _make_element(name, text, attributes=None):
"""Make an xml element.
:param name: the name of the element
:param text: the content of the element
:param attributes: the elements attributes
:returns: lxml.etree.Element
"""
attributes = attributes or {}
element = etree.Element(name)
for key in attributes:
element.set(key, attributes[key])
element.text = text
return element
    def _fix_emphasises(self):
        """Convert newstag markers found inside <em> elements.

        Bylines replace the <unknown> author with a <person>, title markers
        turn the containing paragraph into a title, and newstag prefixes
        are stripped from the emphasis text.
        """
        for emphasis in self.root.iter("em"):
            paragraph = emphasis.getparent()
            # Only childless <em> elements with text are considered.
            if not len(emphasis) and emphasis.text:
                if self.bylinetags.match(emphasis.text):
                    line = self.bylinetags.sub("", emphasis.text).strip()
                    unknown = self.root.find(".//unknown")
                    if unknown is not None:
                        person = etree.Element("person")
                        person.set("lastname", line)
                        person.set("firstname", "")
                        unknown.getparent().replace(unknown, person)
                    # NOTE(review): the whole paragraph is removed while
                    # root.iter("em") is still running — presumably safe for
                    # the documents seen in practice; confirm.
                    paragraph.getparent().remove(paragraph)
                elif self.titletags.match(emphasis.text):
                    emphasis.text = self.titletags.sub("", emphasis.text).strip()
                    paragraph.set("type", "title")
                elif self.newstags.match(emphasis.text):
                    emphasis.text = self.newstags.sub("", emphasis.text).strip()
def _add_paragraph(self, line, index, paragraph, attributes):
if line:
index += 1
paragraph.getparent().insert(
index, self._make_element("p", line, attributes=attributes)
)
return index
def _add_emphasis(self, index, line, attributes, paragraph):
index += 1
element = etree.Element("p")
element.append(self._make_element("em", line, attributes))
paragraph.getparent().insert(index, element)
return index
    def _handle_line(self, line, index, lines, paragraph):
        """Dispatch one line of a paragraph to the right xml construct.

        Args:
            line (str): one newline-separated line of the paragraph text.
            index (int): position in the parent where new elements are
                inserted.
            lines (list[str]): accumulator of plain text lines; flushed
                into a <p> whenever a tagged or blank line is seen.
            paragraph (etree.Element): the <p> being split up.

        Returns:
            (int): the updated insertion index.
        """
        if self.newstags.match(line):
            # Flush accumulated plain text, then start collecting again
            # with the newstag marker stripped.
            index = self._add_paragraph(
                " ".join(lines).strip(), index, paragraph, paragraph.attrib
            )
            del lines[:]
            lines.append(self.newstags.sub("", line))
        elif self.bylinetags.match(line):
            # A byline replaces the <unknown> author with a <person>.
            index = self._add_paragraph(
                " ".join(lines).strip(), index, paragraph, paragraph.attrib
            )
            del lines[:]
            unknown = self.root.find(".//unknown")
            if unknown is not None:
                person = etree.Element("person")
                person.set("lastname", self.bylinetags.sub("", line).strip())
                person.set("firstname", "")
                unknown.getparent().replace(unknown, person)
        elif self.boldtags.match(line):
            # Bold lines become <p><em type="bold">…</em></p>.
            index = self._add_paragraph(
                " ".join(lines).strip(), index, paragraph, paragraph.attrib
            )
            index = self._add_emphasis(
                index, self.boldtags.sub("", line).strip(), {"type": "bold"}, paragraph
            )
            del lines[:]
        elif line.startswith("@kursiv:"):
            # Italic lines become <p><em type="italic">…</em></p>.
            index = self._add_paragraph(
                " ".join(lines).strip(), index, paragraph, paragraph.attrib
            )
            index = self._add_emphasis(
                index,
                line.replace("@kursiv:", "").strip(),
                {"type": "italic"},
                paragraph,
            )
            del lines[:]
        elif self.headertitletags.match(line):
            # Header titles also fill the document <title> if it is empty.
            index = self._add_paragraph(
                " ".join(lines).strip(), index, paragraph, paragraph.attrib
            )
            del lines[:]
            header = self.root.find(".//header")
            title = header.find("./title")
            if title is not None and title.text is None:
                title.text = self.headertitletags.sub("", line).strip()
            index = self._add_paragraph(
                self.headertitletags.sub("", line).strip(),
                index,
                paragraph,
                {"type": "title"},
            )
        elif self.titletags.match(line):
            # Subtitles become <p type="title">.
            index = self._add_paragraph(
                " ".join(lines).strip(), index, paragraph, paragraph.attrib
            )
            del lines[:]
            index += 1
            paragraph.getparent().insert(
                index,
                self._make_element(
                    "p", self.titletags.sub("", line).strip(), {"type": "title"}
                ),
            )
        elif line == "" and lines:
            # A blank line ends the current plain-text paragraph.
            index = self._add_paragraph(
                " ".join(lines).strip(), index, paragraph, paragraph.attrib
            )
            del lines[:]
        else:
            # Ordinary text: keep accumulating.
            lines.append(line)
        return index
    def _fix_paragraphs(self):
        """Split plain-text paragraphs that contain tagged lines.

        Every childless <p> with text is processed line by line; the
        elements built from it replace the original paragraph.
        """
        for paragraph in self.root.iter("p"):
            if not len(paragraph) and paragraph.text:
                # New elements are inserted right after the original.
                index = paragraph.getparent().index(paragraph)
                lines = []
                for line in paragraph.text.split("\n"):
                    index = self._handle_line(line, index, lines, paragraph)
                # Flush whatever plain text remains.
                index = self._add_paragraph(
                    " ".join(lines).strip(), index, paragraph, paragraph.attrib
                )
                # NOTE(review): removes elements while root.iter("p") is
                # running — presumably safe because replacements are inserted
                # after the removed node; confirm.
                paragraph.getparent().remove(paragraph)
    def fix_newstags(self):
        """Convert newstags found in text to xml elements.

        First handles markers inside <em> elements, then splits plain
        paragraphs containing tagged lines.
        """
        self._fix_emphasises()
        self._fix_paragraphs()