aboutsummaryrefslogtreecommitdiff
path: root/textproc.py
blob: ef44bf1c25c5ca97e5e145a3b82a676c864e4ce2 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Copyright (C) 2019 GNUnet e.V.
#
# This code is derived from code contributed to GNUnet e.V.
# by nikita <nikita@n0.is>.
#
# Permission to use, copy, modify, and/or distribute this software for
# any purpose with or without fee is hereby granted.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
#
# SPDX-License-Identifier: 0BSD
import html.parser
from bs4 import BeautifulSoup


class extractText(html.parser.HTMLParser):
    """HTML parser that accumulates every text node it encounters."""

    def __init__(self):
        super().__init__()
        # Text fragments collected in document order.
        self.result = []

    def handle_data(self, data):
        # Invoked by HTMLParser for each run of character data.
        self.result.append(data)

    def text_in(self):
        """Return all text collected so far, concatenated."""
        return ''.join(self.result)


def html2text(html):
    """Strip all markup from the HTML string *html*, returning its text.

    Note: the parameter shadows the stdlib ``html`` module, which is not
    used inside this function.
    """
    extractor = extractText()
    extractor.feed(html)
    return extractor.text_in()


def cut_text(filename, count):
    """Return the text of the second ``<p>`` element of *filename*.

    Script and style elements are discarded first.  The extracted text
    has newlines removed and is truncated to *count* characters, with
    an ``' [...]'`` marker appended when truncation occurred.
    """
    with open(filename) as html:
        soup = BeautifulSoup(html, features="lxml")
        # Drop invisible content so only rendered text survives.
        for tag in soup(["script", "style"]):
            tag.extract()
        # NOTE(review): indexing [1] assumes the document has at least
        # two <p> elements — confirm against the templates used.
        fragments = [str(child) for child in soup.findAll('p')[1]]
        text = html2text(''.join(fragments).replace("\n", ""))
        if len(text) > count:
            return text[:count] + ' [...]'
        return text


def cut_news_text(filename, count):
    """Shorthand: run cut_text() on the news template named *filename*."""
    template_path = "template/news/" + filename + ".j2"
    return cut_text(template_path, count)


# TODO: replace id='...' with frontier so that we can
# pass it in cut_article reusable, or merge cut_text and
# cut_by_frontier.
def cut_by_frontier(filename):
    """Return the flattened markup inside the ``id='newspost-content'``
    element of *filename*, with newlines removed.

    Returns an empty string when the marker element is absent.
    """
    with open(filename) as html:
        soup = BeautifulSoup(html, features="lxml")
        container = soup.find(id='newspost-content')
        # Robustness: find() returns None when the id is missing;
        # iterating None would raise TypeError.
        if container is None:
            return ''
        # BUG FIX: the original wrote ''.join(str(e) in k), a boolean
        # membership test that raises NameError on the undefined 'e'.
        # Join the stringified children instead (same pattern as
        # cut_text).
        b = ''.join(str(child) for child in container)
        return b.replace("\n", "")


def cut_article(filename, conf, lang):
    """Shorthand: run cut_all() on the news template named *filename*."""
    template_path = "template/news/" + filename + ".j2"
    return cut_all(template_path, conf, lang)

def cut_all(filename, conf, lang):
    """Turn the Jinja2 news template *filename* into plain markup.

    Strips the template's extends/block scaffolding and the wrapper
    tags lxml adds, then rewrites ``{{ url_localized('X') }}``
    references into absolute URLs built from the site's base URL in
    *conf* and the language code *lang*.
    """
    with open(filename) as html:
        soup = BeautifulSoup(html, features="lxml")
        text = repr(soup)
        # Peel off Jinja2 boilerplate and parser-introduced wrappers.
        boilerplate = (
            '{% extends "common/news.j2" %}\n{% block body_content %}\n',
            '\n{% endblock body_content %}',
            '<html><body><p></p>',
            '</body></html>',
        )
        for piece in boilerplate:
            text = text.replace(piece, "")
        urlstr = "https://" + conf["siteconf"][0]["baseurl"] + "/" + lang + "/"
        # Flatten newlines, then expand url_localized('...') calls into
        # absolute links.
        text = text.replace("\n", "")
        text = text.replace("{{ url_localized('", urlstr)
        text = text.replace("') }}", "")
        return text