# Auberge_Vagabonde/books.py
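"""Build EPUB volumes of "The Wandering Inn" from the French translation
(Auberge Vagabonde) and the original English tables of contents."""
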
from asyncio import gather, get_event_loop, run
from functools import partial, wraps
from os import makedirs, path
from typing import List

from PIL import Image
from bs4 import BeautifulSoup, Tag
from lxml.html import fromstring
from pypub import Chapter, Epub, create_chapter_from_string # type: ignore
from pypub.const import SUPPORTED_TAGS # type: ignore
from requests import get

UNSUPPORTED_TAGS = ["a", "h3"]
OUTPUT_DIR = "output"
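
# Tags in UNSUPPORTED_TAGS are dropped outright, children included; tags in
# SUPPORTED_TAGS are kept with filtered attributes; anything else is
# unwrapped so its children survive (see MyChapter.parse_etree below).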


class MyChapter(Chapter):
    def parse_etree(self):
        """Generate a new, filtered element tree from the chapter HTML."""
        etree = fromstring(self.content)
        # narrow the scope to <body>, then <article>, when present
        body = etree.xpath(".//body")
        etree = body[0] if body else etree
        article = etree.xpath(".//article")
        etree = article[0] if article else etree
        # iterate over a snapshot of the tree so elements can be
        # deleted or modified while looping
        for elem in list(etree.iter())[1:]:
            if elem.tag in UNSUPPORTED_TAGS:
                parent = elem.getparent()
                parent.remove(elem)
            # if the element tag is supported
            elif elem.tag in SUPPORTED_TAGS:
                # remove attributes not approved for this specific tag;
                # copy the keys first since the loop mutates the mapping
                for attr in list(elem.attrib):
                    if attr != "style" and attr not in SUPPORTED_TAGS[elem.tag]:
                        elem.attrib.pop(attr)
            # if the element is not supported, append its children to the
            # parent, then drop the element itself
            else:
                parent = elem.getparent()
                for child in list(elem):
                    parent.append(child)
                parent.remove(elem)
                # NOTE: in lxml's model, text that follows an element is
                # stored in that element's tail rather than in the parent's
                # text, so re-attach the tail before the element is lost
                if elem.tail and elem.tail.strip():
                    parent.text = (parent.text or "") + elem.tail.strip()
        # ensure all images with no src are removed
        for img in etree.xpath(".//img"):
            if "src" not in img.attrib:
                img.getparent().remove(img)
        # return the new element tree
        return etree

    def replace_images(self, image_dir: str, timeout: int = 10):
        # let pypub download the chapter images; ignore downloads
        # that time out
        try:
            super().replace_images(image_dir, timeout)
        except TimeoutError:
            pass
        for img in self.etree.xpath(".//img"):
            # an src still pointing at http(s) means the download failed
            if img.attrib["src"].startswith("http"):
                img.getparent().remove(img)
            else:
                # shrink downloaded images so they fit e-reader screens
                src = f"{path.dirname(image_dir)}/{img.attrib['src']}"
                image = Image.open(src)
                image.thumbnail((1000, 2000))
                image.save(src)


def async_wrap(func):
    """Turn a blocking function into an awaitable by running it in an
    executor on the event loop."""
    @wraps(func)
    async def wrapper(*args, loop=None, executor=None, **kwargs):
        if loop is None:
            loop = get_event_loop()
        pfunc = partial(func, *args, **kwargs)
        return await loop.run_in_executor(executor, pfunc)
    return wrapper
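

# Usage sketch (hypothetical function, not part of this script): any
# blocking callable becomes awaitable once decorated.
#
#     @async_wrap
#     def fetch(url):
#         return get(url).text
#
#     html = await fetch("https://example.com")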

makedirs(name=OUTPUT_DIR, exist_ok=True)

# download the cover art once and store it as PNG for every volume
cover_req = get(
    url="https://i0.wp.com/wanderinginn.com/wp-content/uploads/2023/03/Wandering_Inn-Vol1-eCover.jpg",  # noqa: E501
    stream=True,
)
Image.open(cover_req.raw).save(f"{OUTPUT_DIR}/cover.png")


def process_volume(epub: Epub, urls: List[str]):
    for url in urls:
        page_req = get(url)
        page_html = BeautifulSoup(markup=page_req.text, features="lxml")
        page_content = page_html.select_one("div.entry-content")
        page_title = page_html.select_one("h1.entry-title")
        if not page_content or not page_title:
            raise Exception("Missing title or content")
        title = page_title.get_text().strip()
        if not title:
            continue
        # drop tiled galleries, which do not convert cleanly to EPUB
        galleries = page_content.select("div.tiled-gallery")
        for gallery in galleries:
            gallery.decompose()
        chapter = create_chapter_from_string(
            html=page_content.prettify(),
            title=title,
            factory=MyChapter,
        )
        print(f"{epub.title} - {title}")
        epub.add_chapter(chapter)
    epub.create_epub(OUTPUT_DIR)


@async_wrap
def process_book(
    url: str, creator: str, publisher: str, language: str, strip_first=False
):
    toc_req = get(url)
    toc_html = BeautifulSoup(markup=toc_req.text, features="lxml")
    toc_content = toc_html.select("div.entry-content > p")
    toc_title = toc_html.select_one("#site-title > span > a")
    toc_date = toc_html.find(name="meta", property="article:modified_time")
    if not toc_title or not isinstance(toc_date, Tag):
        raise Exception("Missing title or date")
    title = toc_title.get_text().strip()
    if strip_first:
        toc_content.pop(0)
    # table-of-contents paragraphs alternate between a volume heading
    # and the list of chapter links for that volume
    volume = ""
    for i, toc_line in enumerate(toc_content):
        if i % 2 == 0:
            volume = toc_line.get_text().strip()
        elif volume:
            epub = Epub(
                title=f"{title} - {volume}",
                creator=creator,
                language=language,
                publisher=publisher,
                date=toc_date["content"],
                cover=f"{OUTPUT_DIR}/cover.png",
                epub_dir=f"{OUTPUT_DIR}/{title} - {volume}",
            )
            urls = []
            for link in toc_line.select("a"):
                urls.append(link.attrs["href"])
            process_volume(epub=epub, urls=urls)


async def create_books():
    # build the French and English books concurrently
    await gather(
        process_book(
            url="https://aubergevagabonde.wordpress.com/sommaire/",
            creator="Pirateaba",
            publisher="ElliVia",
            language="fr",
        ),
        process_book(
            url="https://wanderinginn.com/table-of-contents/",
            creator="Pirateaba",
            publisher="Xefir",
            language="en",
            strip_first=True,
        ),
    )


run(create_books(), debug=True)