# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this file. If not, see <http://www.gnu.org/licenses/>.
#
# Copyright © 2012-2023 The University of Tromsø &
# the Norwegian Sámi Parliament
# http://giellatekno.uit.no & http://divvun.no
#
"""Convert plaintext files to the Giella xml format."""
import codecs
import io
import re
from pathlib import Path
from typing import Iterable
from lxml import etree
from corpustools import basicconverter, util
class PlaintextConverter(basicconverter.BasicConverter):
    """Convert plain text files to the Giella xml format."""

    def _read_original(self, encoding: str) -> str:
        """Return the raw text of the original file decoded with encoding.

        Args:
            encoding: name of the codec used to decode the file.

        Returns:
            The decoded content, with line endings left untouched.
        """
        # newline="" switches off newline translation, so "\r\n" and "\r"
        # reach to_unicode()/strip_chars() exactly as they are in the file.
        # The with-statement makes sure the file handle is closed.
        with open(self.orig.as_posix(), encoding=encoding, newline="") as stream:
            return stream.read()

    def to_unicode(self) -> str:
        """Read a file into a unicode string.

        If the content of the file is not utf-8, pretend the encoding is
        latin1. The real encoding will be detected later.

        Returns:
            (str): The decoded string, where "\\r\\n" has been replaced by
                "\\n" and the result has been passed through strip_chars.
        """
        try:
            content = self._read_original("utf8")
        except UnicodeDecodeError:
            # latin1 maps every byte to a character, so this cannot fail
            # on decoding.
            content = self._read_original("latin1")

        return self.strip_chars(content.replace("\r\n", "\n"))

    @staticmethod
    def strip_chars(content: str, extra: str = "") -> str:
        """Remove the characters found in plaintext_oddities from content.

        Args:
            content: a string containing the content of a document.
            extra: a string containing even more characters to remove
                from content. It is inserted verbatim into a regex
                character class, so characters that are special there
                (such as "]", "\\", "^" and "-") must be escaped by the
                caller.

        Returns:
            A string containing the content sans unwanted characters.
        """
        plaintext_oddities = [
            ("ÊÊ", "\n"),
            (r"<\!q>", ""),
            (r"<\!h>", ""),
            ("<*B>", ""),
            ("<*P>", ""),
            ("<*I>", ""),
            ("\r", "\n"),
            # NOTE(review): the next two patterns are empty strings; they
            # look like markers that were lost from this file. Confirm
            # what they are supposed to be.
            ("", ""),
            ("", ""),
            ("<0x010C>", "Č"),
            ("<0x010D>", "č"),
            ("<0x0110>", "Đ"),
            ("<0x0111>", "đ"),
            ("<0x014A>", "Ŋ"),
            ("<0x014B>", "ŋ"),
            ("<0x0160>", "Š"),
            ("<0x0161>", "š"),
            ("<0x0166>", "Ŧ"),
            ("<0x0167>", "ŧ"),
            ("<0x017D>", "Ž"),
            ("<0x017E>", "ž"),
            ("<0x2003>", " "),
            (
                "========================================================"
                "========================",
                "\n",
            ),
        ]
        content = util.replace_all(plaintext_oddities, content)

        # Control characters except tab (\x09), newline (\x0A) and carriage
        # return (\x0D), plus DEL (\x7F) and whatever the caller adds.
        remove_re = re.compile(f"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F{extra}]")

        return remove_re.sub("", content)

    @staticmethod
    def make_element(element_name: str, text: str) -> etree._Element:
        """Make an xml element.

        Every "<hyph/>" marker found in text is turned into a real hyph
        child element; the text between the markers becomes the text of
        the new element and the tails of the hyph elements.

        Args:
            element_name: Name of the xml element
            text: The text the xml should contain

        Returns:
            an etree element
        """
        element = etree.Element(element_name)

        # Splitting on an empty separator raises ValueError, so the marker
        # has to be spelled out. str.split always returns at least one
        # part, so text without markers ends up unchanged in element.text.
        first_part, *hyph_parts = text.split("<hyph/>")
        element.text = first_part
        for hyph_part in hyph_parts:
            hyph = etree.SubElement(element, "hyph")
            hyph.tail = hyph_part

        return element

    def lines2xml(self, content: io.StringIO) -> Iterable[etree._Element]:
        """Transform paragraphs to etree elements.

        A paragraph is a run of consecutive non-blank lines. Blank lines
        only separate paragraphs; they are never part of one, so neither
        leading blank lines nor several blank lines in a row produce
        empty paragraphs.

        A line whose 1-based number is in self.metadata.skip_lines is
        dropped, unless the line starts with "#".

        Args:
            content: the content of the plaintext document.

        Yields:
            An etree element.
        """
        skip_lines = self.metadata.skip_lines
        paragraph: list[str] = []

        for line_no, line in enumerate(content, start=1):
            if line_no in skip_lines and not line.startswith("#"):
                continue

            if line.strip():
                paragraph.append(line)
            elif paragraph:
                yield self.make_element("p", "".join(paragraph))
                paragraph.clear()

        # The last paragraph is not necessarily followed by a blank line.
        if paragraph:
            yield self.make_element("p", "".join(paragraph))

    def content2xml(self, content: io.StringIO) -> etree._Element:
        """Transform plaintext to an intermediate xml document.

        Args:
            content: the content of the plaintext document.

        Returns:
            A document element with an empty header and a body that
            contains one p element per paragraph.
        """
        document = etree.Element("document")
        etree.SubElement(document, "header")
        body = etree.SubElement(document, "body")
        body.extend(self.lines2xml(content))

        return document
def convert2intermediate(filename: Path) -> etree._Element:
    """Convert a plaintext file into an intermediate xml document.

    Args:
        filename: path of the file that should be converted

    Returns:
        The etree element returned by PlaintextConverter.content2xml.
    """
    plaintext_converter = PlaintextConverter(filename)
    decoded = plaintext_converter.to_unicode()

    # content2xml reads its input line by line, so wrap the string in a
    # file-like object.
    return plaintext_converter.content2xml(io.StringIO(decoded))