# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this file. If not, see <http://www.gnu.org/licenses/>.
#
# Copyright © 2012-2023 The University of Tromsø &
# the Norwegian Sámi Parliament
# http://giellatekno.uit.no & http://divvun.no
#
"""Convert html content to the Giella xml format."""
import os
from lxml import etree, html
from lxml.html import clean
from corpustools import (
convert_using_pandoc,
convert_using_soffice,
epubconverter,
htmlconverter,
pdfconverter,
util,
xmlconverter,
)
# Directory part of this module's own file path.
HERE = os.path.dirname(__file__)
def to_html_elt(path):
    """Convert the document at path with the converter for its extension.

    The file extension (as split off by os.path.splitext, including the
    leading dot) selects which converter module's to_html_elt is called.

    Args:
        path (str): path to the document that should be converted.

    Returns:
        Whatever the selected converter's to_html_elt returns for path
        (presumably an html element; the converters live outside this
        file).

    Raises:
        KeyError: if the extension of path has no registered converter.
    """
    via_soffice = convert_using_soffice.to_html_elt
    via_pandoc = convert_using_pandoc.to_html_elt

    converters = {
        ".doc": via_soffice,
        ".docx": via_pandoc,
        ".epub": epubconverter.to_html_elt,
        ".html": htmlconverter.to_html_elt,
        ".odt": via_pandoc,
        ".pdf": pdfconverter.to_html_elt,
        ".rtf": via_pandoc,
        ".tex": via_pandoc,
        ".writenow": via_soffice,
        ".xml": xmlconverter.to_html_elt,
    }

    _, extension = os.path.splitext(path)
    convert = converters[extension]
    return convert(path)
class HTMLBeautifier:
"""Convert html documents to the Giella xml format."""
def __init__(self, html_elt):
    """Strip scripts from html_elt, clean it and store the reparsed tree.

    All script elements are removed from html_elt in place, the result
    is serialised to a string, run through superclean and parsed again.
    The parsed document is stored in self.soup.

    Args:
        html_elt: an lxml element containing the html document.
    """
    # Snapshot the matches first so the tree is not mutated while it is
    # being traversed.
    for script in list(html_elt.iter("script")):
        parent = script.getparent()
        # lxml's remove() discards an element's tail text together with
        # the element. Move the text that follows the script onto the
        # preceding sibling (or the parent) so document text directly
        # after a </script> tag is not lost.
        if script.tail:
            previous = script.getprevious()
            if previous is None:
                parent.text = (parent.text or "") + script.tail
            else:
                previous.tail = (previous.tail or "") + script.tail
        parent.remove(script)

    c_clean = self.superclean(etree.tostring(html_elt, encoding="unicode"))
    self.soup = html.document_fromstring(c_clean)
def superclean(self, content):
    """Strip unwanted markup from an html document.

    The content is first passed through remove_cruft and then through an
    lxml Cleaner configured with the options and kill list below.

    Args:
        content (str): a string containing an html document.

    Returns:
        (str): a string containing the cleaned up html document.
    """
    # Elements handed to the Cleaner as kill_tags.
    unwanted_tags = [
        "img",
        "area",
        "address",
        "hr",
        "cite",
        "footer",
        "figcaption",
        "aside",
        "time",
        "figure",
        "nav",
        "noscript",
        "map",
        "ins",
        "s",
        "colgroup",
    ]
    cleaner_options = {
        "page_structure": False,
        "scripts": True,
        "javascript": True,
        "comments": True,
        "style": True,
        "processing_instructions": True,
        "remove_unknown_tags": True,
        "embedded": True,
        "kill_tags": unwanted_tags,
    }
    html_cleaner = clean.Cleaner(**cleaner_options)

    without_cruft = self.remove_cruft(content)
    return html_cleaner.clean_html(without_cruft)
@staticmethod
def remove_cruft(content):
"""Remove cruft from svenskakyrkan.se documents.
Args:
content (str): the content of a document.
Returns:
(str): The content of the document without the cruft.
"""
replacements = [("//