This repository has been archived on 2024-02-23. You can view files and clone it, but cannot push or open issues or pull requests.
Auberge_Vagabonde/books.py
Michel Roux 1f0112a9a1
All checks were successful
continuous-integration/drone/push Build is passing
Only fr now
2023-05-11 23:12:38 +02:00

153 lines
5.1 KiB
Python

from os import makedirs, path
from typing import List
from PIL import Image
from bs4 import BeautifulSoup, Tag
from lxml.html import fromstring
from pypub import Chapter, Epub, create_chapter_from_string # type: ignore
from pypub.const import SUPPORTED_TAGS # type: ignore
from requests import get
# Tags that MyChapter.parse_etree removes from a chapter together with
# everything nested inside them.
UNSUPPORTED_TAGS = ["a", "h3"]
# Directory created at import time; the cover image is saved here, each
# volume gets its epub_dir below it, and it is passed to Epub.create_epub.
OUTPUT_DIR = "output"
class MyChapter(Chapter):
    """Chapter variant that filters scraped HTML and downsizes its images.

    ``parse_etree`` builds a filtered element tree from ``self.content``.
    ``replace_images`` delegates to the parent implementation, then removes
    images that still point at a remote URL and shrinks the local ones.
    """

    def parse_etree(self):
        """Generate a new, filtered element tree from ``self.content``.

        The tree is narrowed to ``<body>`` and then ``<article>`` when those
        exist. Elements whose tag is in ``UNSUPPORTED_TAGS`` are removed,
        elements whose tag is in ``SUPPORTED_TAGS`` keep only ``style`` and
        the attributes approved for that tag, and every other element is
        replaced by its children. Images without a ``src`` are removed.

        Returns:
            The filtered lxml element.
        """
        etree = fromstring(self.content)
        # check if we can minimalize the scope: <body> first, then <article>
        for scope in (".//body", ".//article"):
            narrowed = etree.xpath(scope)
            if narrowed:
                etree = narrowed[0]
        # Snapshot the elements before walking them because the tree is
        # modified during the walk; [1:] skips the root element itself.
        for elem in list(etree.iter())[1:]:
            if elem.tag in UNSUPPORTED_TAGS:
                # NOTE(review): lxml's remove() appears to take the element's
                # tail text along, so text that directly follows a removed
                # tag is dropped as well -- confirm this is intended.
                elem.getparent().remove(elem)
            # if element tag is supported
            elif elem.tag in SUPPORTED_TAGS:
                # Remove attributes not approved for this specific tag.
                # Iterate over a snapshot of the names: entries are popped
                # from the very mapping being inspected.
                approved = SUPPORTED_TAGS[elem.tag]
                for attr in list(elem.attrib):
                    if attr != "style" and attr not in approved:
                        elem.attrib.pop(attr)
            # if element is not supported, append children to parent
            else:
                parent = elem.getparent()
                for child in elem.getchildren():
                    parent.append(child)
                parent.remove(elem)
                # NOTE: this is a bug with lxml, some children have
                #       text in the parent included in the tail rather
                #       than text attribute, so we also append tail to text
                if elem.tail and elem.tail.strip():
                    parent.text = (parent.text or "") + elem.tail.strip()
        # ensure all images with no src are removed
        for img in etree.xpath(".//img"):
            if "src" not in img.attrib:
                img.getparent().remove(img)
        # return new element-tree
        return etree

    def replace_images(self, image_dir: str, timeout: int = 10):
        """Localise the chapter images, then drop or downsize what remains.

        Args:
            image_dir: Image directory forwarded to the parent
                implementation; local image paths are resolved against
                its parent directory.
            timeout: Timeout forwarded to the parent implementation.
        """
        try:
            super().replace_images(image_dir, timeout)
        except TimeoutError:
            # Deliberate best effort: a timeout must not abort the chapter.
            # Images that still carry a remote src are removed just below.
            pass
        base_dir = path.dirname(image_dir)
        for img in self.etree.xpath(".//img"):
            if img.attrib["src"].startswith("http"):
                # still remote, so it cannot be embedded: remove it
                img.getparent().remove(img)
                continue
            src = f"{base_dir}/{img.attrib['src']}"
            # The context manager releases the file handle once the image
            # has been shrunk in place (bounded to 1000x2000) and saved.
            with Image.open(src) as image:
                image.thumbnail((1000, 2000))
                image.save(src)
# Make sure the output directory exists before anything is written into it.
makedirs(name=OUTPUT_DIR, exist_ok=True)

# Download the cover once at import time and store a downsized PNG copy;
# process_book hands this file to every Epub it creates.
with get(
    url="https://i0.wp.com/wanderinginn.com/wp-content/uploads/2023/03/Wandering_Inn-Vol1-eCover.jpg",  # noqa: E501
    stream=True,
) as cover_req:
    # Fail with the HTTP error itself instead of an obscure image-decoding
    # error when the download did not succeed.
    cover_req.raise_for_status()
    # The raw stream is not content-decoded by default; ask urllib3 to undo
    # any gzip/deflate encoding so Pillow receives the real image bytes.
    cover_req.raw.decode_content = True
    cover_image = Image.open(cover_req.raw)
    # thumbnail() shrinks in place and keeps the aspect ratio
    cover_image.thumbnail((500, 1000))
    cover_image.save(f"{OUTPUT_DIR}/cover.png")
def process_volume(epub: Epub, urls: List[str]):
    """Download each chapter page, add it to ``epub`` and create the EPUB.

    Args:
        epub: Book that receives one chapter per page with a non-empty title.
        urls: Chapter page URLs, in reading order.

    Raises:
        Exception: If a page lacks ``div.entry-content`` or
            ``h1.entry-title``.
    """
    for chapter_url in urls:
        response = get(chapter_url)
        soup = BeautifulSoup(markup=response.text, features="lxml")
        body = soup.select_one("div.entry-content")
        heading = soup.select_one("h1.entry-title")
        if not (body and heading):
            raise Exception("Missing title or content")

        title = heading.get_text().strip()
        # pages whose title is blank are skipped entirely
        if title:
            # tiled galleries are removed from the content before conversion
            for gallery in body.select("div.tiled-gallery"):
                gallery.decompose()
            chapter = create_chapter_from_string(
                html=body.prettify(),
                title=title,
                factory=MyChapter,
            )
            print(f"{epub.title} - {title}")
            epub.add_chapter(chapter)

    # all chapters collected: create the book in the output directory
    epub.create_epub(OUTPUT_DIR)
def process_book(
    url: str, creator: str, publisher: str, language: str, strip_first=False
):
    """Build one EPUB per volume listed on a table-of-contents page.

    The paragraphs directly under ``div.entry-content`` are read in pairs:
    the first paragraph of a pair supplies the volume name, the second one
    holds the links to that volume's chapters. A trailing unpaired paragraph
    and pairs whose volume name is blank are ignored.

    Args:
        url: Address of the table-of-contents page.
        creator: Creator recorded in every generated EPUB.
        publisher: Publisher recorded in every generated EPUB.
        language: Language code recorded in every generated EPUB.
        strip_first: Drop the first paragraph before pairing.

    Raises:
        Exception: If the site title or the ``article:modified_time`` meta
            tag cannot be found.
    """
    toc_req = get(url)
    toc_html = BeautifulSoup(markup=toc_req.text, features="lxml")
    toc_content = toc_html.select("div.entry-content > p")
    toc_title = toc_html.select_one("#site-title > span > a")
    toc_date = toc_html.find(name="meta", property="article:modified_time")
    if toc_title is None or not isinstance(toc_date, Tag):
        raise Exception("Missing title or date")
    title = toc_title.get_text().strip()

    if strip_first:
        # slicing, unlike pop(0), is safe when the page has no paragraphs
        toc_content = toc_content[1:]

    # even positions are volume headings, odd positions their link lists
    for heading, links in zip(toc_content[::2], toc_content[1::2]):
        volume = heading.get_text().strip()
        if not volume:
            continue
        # anchors without an href (e.g. named anchors) carry no chapter URL
        urls = [
            link.attrs["href"]
            for link in links.select("a")
            if "href" in link.attrs
        ]
        epub = Epub(
            title=f"{title} - {volume}",
            creator=creator,
            language=language,
            publisher=publisher,
            date=toc_date["content"],
            cover=f"{OUTPUT_DIR}/cover.png",
            epub_dir=f"{OUTPUT_DIR}/{title} - {volume}",
        )
        process_volume(epub=epub, urls=urls)
# Settings for the French edition; strip_first keeps its default.
french_edition = {
    "url": "https://aubergevagabonde.wordpress.com/sommaire/",
    "creator": "Pirateaba",
    "publisher": "ElliVia",
    "language": "fr",
}
# Runs at import time and builds every volume found on the contents page.
process_book(**french_edition)