Auberge_Vagabonde/books.py

from os import makedirs, path
from typing import List

from PIL import Image
from bs4 import BeautifulSoup, Tag
from lxml.html import fromstring
from pypub import Chapter, Epub, create_chapter_from_string  # type: ignore
from pypub.const import SUPPORTED_TAGS  # type: ignore
from requests import get


UNSUPPORTED_TAGS = ["a", "h3"]
OUTPUT_DIR = "output"


class MyChapter(Chapter):
    """Chapter subclass that filters the HTML tree and shrinks local images.

    Overrides ``parse_etree`` to drop the tags listed in ``UNSUPPORTED_TAGS``
    and ``replace_images`` to discard images that are still remote and to
    downscale the ones that are available on disk.
    """

    def parse_etree(self):
        """Generate a new, filtered element tree from ``self.content``.

        The tree is narrowed to ``<body>`` and then ``<article>`` when those
        elements exist. Every descendant is then handled in one of three ways:

        * tags in ``UNSUPPORTED_TAGS`` are dropped together with their
          children, while the text that follows them is kept;
        * tags in ``SUPPORTED_TAGS`` keep only ``style`` and the attributes
          listed for that tag;
        * any other node is removed and its children are re-attached to the
          end of its parent.

        Returns:
            The root element of the filtered tree.
        """
        etree = fromstring(self.content)
        # check if we can minimalize the scope
        body = etree.xpath(".//body")
        etree = body[0] if body else etree
        article = etree.xpath(".//article")
        etree = article[0] if article else etree
        # snapshot the descendants (root excluded) because the tree is
        # mutated while we walk it
        for elem in list(etree.iter())[1:]:
            if elem.tag in UNSUPPORTED_TAGS:
                # drop_tree() removes the element and its children but merges
                # its tail text into the preceding node; a plain
                # parent.remove(elem) would discard the text that follows
                # the element (e.g. the rest of a sentence after a link)
                elem.drop_tree()
            # if element tag is supported
            elif elem.tag in SUPPORTED_TAGS:
                # remove attributes not approved for specific tag; iterate a
                # copy of the names since entries are popped inside the loop
                for attr in list(elem.attrib):
                    if attr != "style" and attr not in SUPPORTED_TAGS[elem.tag]:
                        elem.attrib.pop(attr)
            # if element is not supported, append children to parent
            else:
                parent = elem.getparent()
                for child in elem.getchildren():
                    parent.append(child)
                parent.remove(elem)
                # NOTE: this is a bug with lxml, some children have
                # text in the parent included in the tail rather
                # than text attribute, so we also append tail to text
                if elem.tail and elem.tail.strip():
                    parent.text = (parent.text or "") + elem.tail.strip()
        # ensure all images with no src are removed (tail text is preserved)
        for img in etree.xpath(".//img"):
            if "src" not in img.attrib:
                img.drop_tree()
        # return new element-tree
        return etree

    def replace_images(self, image_dir: str, timeout: int = 10):
        """Run the parent image replacement, then clean up and shrink images.

        After ``super().replace_images`` has run, every ``<img>`` whose
        ``src`` still starts with ``http`` is dropped from the tree. Every
        other image is opened from ``dirname(image_dir)/<src>``, reduced in
        place so it fits within 1000x2000 pixels, and saved back to the same
        path.

        Args:
            image_dir: Directory handed to the parent implementation; local
                image paths are resolved relative to its parent directory.
            timeout: Timeout forwarded to the parent implementation.
        """
        try:
            super().replace_images(image_dir, timeout)
        except TimeoutError:
            # deliberate best effort: images that are still remote after a
            # timeout are dropped by the loop below
            pass
        for img in self.etree.xpath(".//img"):
            if img.attrib["src"].startswith("http"):
                # keep the text following the image, only drop the element
                img.drop_tree()
            else:
                src = f"{path.dirname(image_dir)}/{img.attrib['src']}"
                # context manager releases the file handle once saved
                with Image.open(src) as image:
                    image.thumbnail((1000, 2000))
                    image.save(src)


# Download the cover at import time and store a reduced copy at
# f"{OUTPUT_DIR}/cover.png", the path passed as ``cover`` to each Epub.
makedirs(name=OUTPUT_DIR, exist_ok=True)
with get(
    url="https://i0.wp.com/wanderinginn.com/wp-content/uploads/2023/03/Wandering_Inn-Vol1-eCover.jpg",  # noqa: E501
    stream=True,
    timeout=30,  # without a timeout a stalled connection hangs forever
) as cover_req:
    # fail on an HTTP error instead of handing an error page to Pillow
    cover_req.raise_for_status()
    cover_image = Image.open(cover_req.raw)
    # shrink in place so the cover fits within 500x1000 pixels
    cover_image.thumbnail((500, 1000))
    cover_image.save(f"{OUTPUT_DIR}/cover.png")


def process_volume(epub: Epub, urls: List[str]):
    """Fetch every chapter page, add it to *epub*, then write the EPUB.

    For each URL the page is downloaded, its ``div.entry-content`` and
    ``h1.entry-title`` are extracted, ``div.tiled-gallery`` blocks are
    removed, and the remaining HTML becomes a ``MyChapter``. Pages whose
    title is empty are skipped. Once all URLs are processed the EPUB is
    written to ``OUTPUT_DIR``.

    Args:
        epub: The book the chapters are appended to.
        urls: Chapter page URLs, in reading order.

    Raises:
        requests.HTTPError: If a chapter page answers with an error status.
        Exception: If a page lacks the title or the content element.
    """
    for url in urls:
        # a timeout avoids hanging forever on a stalled connection, and the
        # status check stops an error page from being parsed as a chapter
        page_req = get(url, timeout=30)
        page_req.raise_for_status()
        page_html = BeautifulSoup(markup=page_req.text, features="lxml")
        page_content = page_html.select_one("div.entry-content")
        page_title = page_html.select_one("h1.entry-title")

        if not page_content or not page_title:
            raise Exception(f"Missing title or content: {url}")

        title = page_title.get_text().strip()
        if not title:
            continue

        # image galleries are stripped from the chapter body
        for gallery in page_content.select("div.tiled-gallery"):
            gallery.decompose()

        chapter = create_chapter_from_string(
            html=page_content.prettify(),
            title=title,
            factory=MyChapter,
        )

        print(f"{epub.title} - {title}")
        epub.add_chapter(chapter)

    epub.create_epub(OUTPUT_DIR)


def process_book(
    url: str,
    creator: str,
    publisher: str,
    language: str,
    strip_first: bool = False,
) -> None:
    """Build one EPUB per volume listed on a table-of-contents page.

    The ``div.entry-content > p`` paragraphs of the page are read in pairs:
    each even-indexed paragraph supplies a volume name and the following
    odd-indexed paragraph supplies the chapter links of that volume. A pair
    whose volume name is empty is skipped.

    Args:
        url: Address of the table-of-contents page.
        creator: Value passed to ``Epub`` as ``creator``.
        publisher: Value passed to ``Epub`` as ``publisher``.
        language: Value passed to ``Epub`` as ``language``.
        strip_first: When true, the first paragraph is discarded before the
            paragraphs are paired.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        Exception: If the site title or the modification date is missing.
    """
    # a timeout avoids hanging forever on a stalled connection, and the
    # status check stops an error page from being parsed as the contents
    toc_req = get(url, timeout=30)
    toc_req.raise_for_status()
    toc_html = BeautifulSoup(markup=toc_req.text, features="lxml")
    toc_content = toc_html.select("div.entry-content > p")
    toc_title = toc_html.select_one("#site-title > span > a")
    toc_date = toc_html.find(name="meta", property="article:modified_time")

    if not toc_title or not isinstance(toc_date, Tag):
        raise Exception("Missing title or date")
    title = toc_title.get_text().strip()

    # guard against an empty table of contents before dropping the lead-in
    if strip_first and toc_content:
        toc_content.pop(0)

    volume = ""
    for i, toc_line in enumerate(toc_content):
        if i % 2 == 0:
            # even paragraphs carry the volume name
            volume = toc_line.get_text().strip()
        elif volume:
            # odd paragraphs carry the chapter links of the previous name
            epub = Epub(
                title=f"{title} - {volume}",
                creator=creator,
                language=language,
                publisher=publisher,
                date=toc_date["content"],
                cover=f"{OUTPUT_DIR}/cover.png",
                epub_dir=f"{OUTPUT_DIR}/{title} - {volume}",
            )

            # anchors without an href (e.g. named anchors) are ignored
            # instead of raising KeyError
            urls = [link.attrs["href"] for link in toc_line.select("a[href]")]

            process_volume(epub=epub, urls=urls)


# Script entry: build the EPUBs for the table of contents at this address.
process_book(
    "https://aubergevagabonde.wordpress.com/sommaire/",
    language="fr",
    publisher="ElliVia",
    creator="Pirateaba",
)
Add image resizing 2023-01-12 16:31:27 +00:00			`from os import makedirs, path`
Rewrite back to Python 2023-01-11 12:35:37 +00:00			`from typing import List`

			`from PIL import Image`
			`from bs4 import BeautifulSoup, Tag`
			`from lxml.html import fromstring`
			`from pypub import Chapter, Epub, create_chapter_from_string # type: ignore`
			`from pypub.const import SUPPORTED_TAGS # type: ignore`
			`from requests import get`


			`UNSUPPORTED_TAGS = ["a", "h3"]`
			`OUTPUT_DIR = "output"`


			`class MyChapter(Chapter):`
			`def parse_etree(self):`
			`"""generate new filtered element-tree"""`
			`etree = fromstring(self.content)`
			`# check if we can minimalize the scope`
			`body = etree.xpath(".//body")`
			`etree = body[0] if body else etree`
			`article = etree.xpath(".//article")`
			`etree = article[0] if article else etree`
			`# iterate elements in tree and delete/modify them`
			`for elem in [elem for elem in etree.iter()][1:]:`
			`if elem.tag in UNSUPPORTED_TAGS:`
			`parent = elem.getparent()`
			`parent.remove(elem)`
			`# if element tag is supported`
			`elif elem.tag in SUPPORTED_TAGS:`
			`# remove attributes not approved for specific tag`
			`for attr in elem.attrib:`
			`if attr != "style" and attr not in SUPPORTED_TAGS[elem.tag]:`
			`elem.attrib.pop(attr)`
			`# if element is not supported, append children to parent`
			`else:`
			`parent = elem.getparent()`
			`for child in elem.getchildren():`
			`parent.append(child)`
			`parent.remove(elem)`
			`# NOTE: this is a bug with lxml, some children have`
			`# text in the parent included in the tail rather`
			`# than text attribute, so we also append tail to text`
			`if elem.tail and elem.tail.strip():`
			`parent.text = (parent.text or "") + elem.tail.strip()`
			`# ensure all images with no src are removed`
			`for img in etree.xpath(".//img"):`
			`if "src" not in img.attrib:`
			`img.getparent().remove(img)`
			`# return new element-tree`
			`return etree`

Fix volume 7 2023-01-12 13:23:00 +00:00			`def replace_images(self, image_dir: str, timeout: int = 10):`
Fix timeout 2023-02-28 14:41:24 +00:00			`try:`
			`super().replace_images(image_dir, timeout)`
			`except TimeoutError:`
			`pass`
Fix volume 7 2023-01-12 13:23:00 +00:00			`for img in self.etree.xpath(".//img"):`
			`if img.attrib["src"].startswith("http"):`
			`img.getparent().remove(img)`
Add image resizing 2023-01-12 16:31:27 +00:00			`else:`
Forgot to save the image 2023-01-12 16:35:02 +00:00			`src = f"{path.dirname(image_dir)}/{img.attrib['src']}"`
			`image = Image.open(src)`
Add image resizing 2023-01-12 16:31:27 +00:00			`image.thumbnail((1000, 2000))`
Forgot to save the image 2023-01-12 16:35:02 +00:00			`image.save(src)`
Fix volume 7 2023-01-12 13:23:00 +00:00
Rewrite back to Python 2023-01-11 12:35:37 +00:00
			`makedirs(name=OUTPUT_DIR, exist_ok=True)`
			`cover_req = get(`
Adapt for the rewrite of book 1 2023-03-05 15:18:01 +00:00			`url="https://i0.wp.com/wanderinginn.com/wp-content/uploads/2023/03/Wandering_Inn-Vol1-eCover.jpg", # noqa: E501`
Rewrite back to Python 2023-01-11 12:35:37 +00:00			`stream=True,`
			`)`
Compress cover 2023-03-05 15:50:34 +00:00			`cover_image = Image.open(cover_req.raw)`
Reduce image even more 2023-03-05 16:12:54 +00:00			`cover_image.thumbnail((500, 1000))`
Compress cover 2023-03-05 15:50:34 +00:00			`cover_image.save(f"{OUTPUT_DIR}/cover.png")`
Rewrite back to Python 2023-01-11 12:35:37 +00:00

			`def process_volume(epub: Epub, urls: List[str]):`
			`for url in urls:`
			`page_req = get(url)`
			`page_html = BeautifulSoup(markup=page_req.text, features="lxml")`
			`page_content = page_html.select_one("div.entry-content")`
			`page_title = page_html.select_one("h1.entry-title")`

Adapt for the rewrite of book 1 2023-03-05 15:18:01 +00:00			`if not page_content or not page_title:`
			`raise Exception("Missing title or content")`

			`title = page_title.get_text().strip()`
			`if not title:`
			`continue`
Rewrite back to Python 2023-01-11 12:35:37 +00:00
			`galleries = page_content.select("div.tiled-gallery")`

			`for gallery in galleries:`
			`gallery.decompose()`

			`chapter = create_chapter_from_string(`
			`html=page_content.prettify(),`
			`title=title,`
			`factory=MyChapter,`
			`)`

			`print(f"{epub.title} - {title}")`
			`epub.add_chapter(chapter)`

			`epub.create_epub(OUTPUT_DIR)`


Adapt for the rewrite of book 1 2023-03-05 15:18:01 +00:00			`def process_book(`
			`url: str, creator: str, publisher: str, language: str, strip_first=False`
			`):`
Rewrite back to Python 2023-01-11 12:35:37 +00:00			`toc_req = get(url)`
			`toc_html = BeautifulSoup(markup=toc_req.text, features="lxml")`
			`toc_content = toc_html.select("div.entry-content > p")`
			`toc_title = toc_html.select_one("#site-title > span > a")`
Update requests 2023-01-12 18:43:45 +00:00			`toc_date = toc_html.find(name="meta", property="article:modified_time")`
Rewrite back to Python 2023-01-11 12:35:37 +00:00
			`if not toc_title or type(toc_date) is not Tag:`
Fix timeout 2023-02-28 14:41:24 +00:00			`raise Exception("Missing title or date")`
Adapt for the rewrite of book 1 2023-03-05 15:18:01 +00:00			`title = toc_title.get_text().strip()`

			`if strip_first:`
			`toc_content.pop(0)`
Rewrite back to Python 2023-01-11 12:35:37 +00:00
			`for i, toc_line in enumerate(toc_content):`
			`if i % 2 == 0:`
			`volume = toc_line.get_text().strip()`
			`elif volume:`
			`epub = Epub(`
			`title=f"{title} - {volume}",`
			`creator=creator,`
			`language=language,`
			`publisher=publisher,`
			`date=toc_date["content"],`
			`cover=f"{OUTPUT_DIR}/cover.png",`
			`epub_dir=f"{OUTPUT_DIR}/{title} - {volume}",`
			`)`

			`urls = []`
			`for link in toc_line.select("a"):`
			`urls.append(link.attrs["href"])`

			`process_volume(epub=epub, urls=urls)`
Only fr now 2023-05-11 21:12:38 +00:00

			`process_book(`
			`url="https://aubergevagabonde.wordpress.com/sommaire/",`
			`creator="Pirateaba",`
			`publisher="ElliVia",`
			`language="fr",`
			`)`