# Auberge_Vagabonde/books.py

from os import makedirs, path
from typing import List

from PIL import Image
from bs4 import BeautifulSoup, Tag
from lxml.html import fromstring
from pypub import Chapter, Epub, create_chapter_from_string  # type: ignore
from pypub.const import SUPPORTED_TAGS  # type: ignore
from requests import get


# Tags that MyChapter.parse_etree removes outright (the element is dropped
# from its parent instead of being unwrapped).
UNSUPPORTED_TAGS = ["a", "h3"]
# Directory that receives the cover image and is handed to Epub.create_epub.
OUTPUT_DIR = "output"


class MyChapter(Chapter):
    """Chapter that strips unwanted tags and downsizes its local images."""

    def parse_etree(self):
        """Generate a new, filtered element tree from ``self.content``.

        The tree is narrowed to ``<body>`` and then to ``<article>`` when
        those elements exist. Every descendant is then handled by tag:

        * tags listed in ``UNSUPPORTED_TAGS`` are removed from their parent;
        * tags listed in ``SUPPORTED_TAGS`` keep only ``style`` plus the
          attributes approved for that tag;
        * any other tag is removed and its children are re-attached to the
          end of its former parent.

        Finally every ``<img>`` without a ``src`` attribute is removed.

        Returns:
            The filtered lxml element.
        """
        etree = fromstring(self.content)
        # check if we can minimalize the scope
        body = etree.xpath(".//body")
        etree = body[0] if body else etree
        article = etree.xpath(".//article")
        etree = article[0] if article else etree
        # Snapshot the elements first (skipping the root itself): the tree is
        # mutated while we walk it.
        for elem in list(etree.iter())[1:]:
            if elem.tag in UNSUPPORTED_TAGS:
                # NOTE(review): lxml's remove() appears to take the element's
                # tail text with it, so text that directly follows a removed
                # tag is probably lost too -- confirm this is intended.
                elem.getparent().remove(elem)
            # if element tag is supported
            elif elem.tag in SUPPORTED_TAGS:
                allowed = SUPPORTED_TAGS[elem.tag]
                # Iterate over a copy of the attribute names because entries
                # are popped inside the loop.
                for attr in list(elem.attrib):
                    if attr != "style" and attr not in allowed:
                        elem.attrib.pop(attr)
            # if element is not supported, append children to parent
            else:
                parent = elem.getparent()
                for child in elem.getchildren():
                    parent.append(child)
                parent.remove(elem)
                # NOTE: this is a bug with lxml, some children have
                # text in the parent included in the tail rather
                # than text attribute, so we also append tail to text
                if elem.tail and elem.tail.strip():
                    parent.text = (parent.text or "") + elem.tail.strip()
        # ensure all images with no src are removed
        for img in etree.xpath(".//img"):
            if "src" not in img.attrib:
                img.getparent().remove(img)
        # return new element-tree
        return etree

    def replace_images(self, image_dir: str, timeout: int = 10):
        """Run the parent image replacement, then drop or shrink each image.

        Images whose ``src`` still starts with ``http`` after the parent call
        are removed from the tree. Every other image is opened from disk,
        scaled down to fit within 1000x2000 and saved back to the same path.

        Args:
            image_dir: Passed to the parent implementation; local ``src``
                values are resolved against its parent directory.
            timeout: Forwarded to the parent implementation.
        """
        try:
            super().replace_images(image_dir, timeout)
        except TimeoutError:
            # Deliberate best effort: a timeout must not abort the chapter.
            # Presumably images that were not fetched keep their remote src
            # and are therefore removed by the loop below.
            pass
        base_dir = path.dirname(image_dir)
        for img in self.etree.xpath(".//img"):
            src = img.attrib["src"]
            if src.startswith("http"):
                img.getparent().remove(img)
                continue
            local_path = f"{base_dir}/{src}"
            # The context manager guarantees the file handle is released even
            # if resizing or saving fails.
            with Image.open(local_path) as image:
                image.thumbnail((1000, 2000))
                image.save(local_path)


# Make sure the output directory exists before anything is written to it.
makedirs(name=OUTPUT_DIR, exist_ok=True)
# Download the cover, shrink it to fit within 500x1000 and store it as PNG.
# The timeout keeps the script from hanging on a stalled connection, the
# status check stops an HTTP error page from being handed to PIL, and the
# context manager closes the streamed response once the image is saved.
with get(
    url="https://i0.wp.com/wanderinginn.com/wp-content/uploads/2023/03/Wandering_Inn-Vol1-eCover.jpg",  # noqa: E501
    stream=True,
    timeout=30,
) as cover_req:
    cover_req.raise_for_status()
    cover_image = Image.open(cover_req.raw)
    cover_image.thumbnail((500, 1000))
    cover_image.save(f"{OUTPUT_DIR}/cover.png")


def process_volume(epub: Epub, urls: List[str]):
    """Fetch each chapter page in *urls*, add it to *epub*, then write it.

    For every page the ``div.entry-content`` block becomes the chapter body
    (with ``div.tiled-gallery`` blocks removed) and ``h1.entry-title``
    becomes the chapter title. Pages whose title is empty are skipped.
    After all pages are processed the book is written to ``OUTPUT_DIR``.

    Args:
        epub: Book that receives the chapters.
        urls: Chapter page URLs, in reading order.

    Raises:
        requests.HTTPError: A page answered with an HTTP error status.
        Exception: A page has no ``div.entry-content`` or ``h1.entry-title``.
    """
    for url in urls:
        # Bound the wait and fail loudly on 4xx/5xx so an error page is never
        # parsed as if it were a chapter.
        page_req = get(url, timeout=30)
        page_req.raise_for_status()
        page_html = BeautifulSoup(markup=page_req.text, features="lxml")
        page_content = page_html.select_one("div.entry-content")
        page_title = page_html.select_one("h1.entry-title")

        if not page_content or not page_title:
            raise Exception("Missing title or content")

        title = page_title.get_text().strip()
        if not title:
            # Untitled pages are skipped.
            continue

        # Remove image galleries from the content before building the chapter.
        for gallery in page_content.select("div.tiled-gallery"):
            gallery.decompose()

        chapter = create_chapter_from_string(
            html=page_content.prettify(),
            title=title,
            factory=MyChapter,
        )

        # Progress output: one line per chapter added.
        print(f"{epub.title} - {title}")
        epub.add_chapter(chapter)

    epub.create_epub(OUTPUT_DIR)


def process_book(
    url: str, creator: str, publisher: str, language: str, strip_first: bool = False
):
    """Build one EPUB per volume listed on a table-of-contents page.

    The ``div.entry-content > p`` paragraphs of the page are read in pairs:
    the first paragraph of each pair gives the volume name and the second
    one holds the links to that volume's chapters. A pair whose volume name
    is empty is skipped, as is a trailing paragraph without a partner.

    Args:
        url: Address of the table-of-contents page.
        creator: Passed to ``Epub`` as ``creator``.
        publisher: Passed to ``Epub`` as ``publisher``.
        language: Passed to ``Epub`` as ``language``.
        strip_first: When true, drop the first paragraph before pairing.

    Raises:
        requests.HTTPError: The page answered with an HTTP error status.
        Exception: The site title or the ``article:modified_time`` meta tag
            is missing.
    """
    # Bound the wait and fail loudly on 4xx/5xx instead of parsing an error
    # page as the table of contents.
    toc_req = get(url, timeout=30)
    toc_req.raise_for_status()
    toc_html = BeautifulSoup(markup=toc_req.text, features="lxml")
    toc_content = toc_html.select("div.entry-content > p")
    toc_title = toc_html.select_one("#site-title > span > a")
    toc_date = toc_html.find(name="meta", property="article:modified_time")

    if not toc_title or not isinstance(toc_date, Tag):
        raise Exception("Missing title or date")
    title = toc_title.get_text().strip()

    if strip_first:
        # Slicing (unlike pop(0)) is safe when the page has no paragraphs.
        toc_content = toc_content[1:]

    # Even-indexed paragraphs name a volume, odd-indexed ones list its links.
    for heading, links in zip(toc_content[::2], toc_content[1::2]):
        volume = heading.get_text().strip()
        if not volume:
            continue

        epub = Epub(
            title=f"{title} - {volume}",
            creator=creator,
            language=language,
            publisher=publisher,
            date=toc_date["content"],
            cover=f"{OUTPUT_DIR}/cover.png",
            epub_dir=f"{OUTPUT_DIR}/{title} - {volume}",
        )

        # Only anchors that carry an href can be fetched; named anchors
        # without one would otherwise raise KeyError.
        urls = [link.attrs["href"] for link in links.select("a[href]")]

        process_volume(epub=epub, urls=urls)


# Runs at module level: build the EPUBs from the table-of-contents page at
# this URL, tagging each book with the creator, publisher and language below.
process_book(
    url="https://aubergevagabonde.wordpress.com/sommaire/",
    creator="Pirateaba",
    publisher="ElliVia",
    language="fr",
)