# Scrape web-novel chapters and assemble them into an EPUB volume.
from argparse import ArgumentParser
from json import load
from os import path
from pathlib import Path
from random import randint
from tempfile import NamedTemporaryFile
from time import sleep
from typing import cast
from bs4 import BeautifulSoup
from curl_cffi import requests
from PIL import Image
from pypub import ( # type: ignore
Chapter,
Epub,
SimpleChapterFactory,
factory,
)
from pyxml.html import HtmlElement, fromstring # type: ignore
# Command-line interface: language, 1-based volume number, and output path.
parser = ArgumentParser()
parser.add_argument(
    "-l",
    "--lang",
    required=True,
    choices=["fr", "en"],
)
parser.add_argument(
    "-v",
    "--volume",
    required=True,
    type=int,
    choices=range(1, 10),
)
parser.add_argument("output", type=Path)
args = parser.parse_args()
class MyChapterFactory(SimpleChapterFactory):  # type: ignore
    """Chapter factory that sanitizes <img> tags and shrinks downloaded images."""

    def cleanup_html(self, content: bytes) -> HtmlElement:
        """
        Cleanup html content to only include supported tags.

        :param content: raw chapter markup
        :return: parsed element-tree with invalid images removed
        """
        etree = fromstring(content)
        # fix and remove invalid images
        for img in etree.xpath(".//img"):
            if "src" not in img.attrib:
                # an image without a source cannot be rendered: drop it
                cast(HtmlElement, img.getparent()).remove(img)
            elif "alt" not in img.attrib:
                # EPUB readers/validators expect an alt attribute; reuse the src
                img.attrib["alt"] = img.attrib["src"]
        # return new element-tree
        return etree

    def hydrate(self, ctx: factory.RenderCtx) -> None:
        """
        Modify chapter element-tree to render images.

        Downloads referenced images, drops the ones that failed to download,
        and downscales the rest in place to keep the EPUB small.
        """
        ctx.timeout = 100  # generous timeout for slow image hosts
        factory.render_images(ctx)
        for img in ctx.etree.xpath(".//img[@src]"):
            # cleanup link: strip any query string from the src
            url = img.attrib["src"].rsplit("?", 1)[0]
            if "://" in url:
                # still an absolute URL: the download failed, remove the element
                cast(HtmlElement, img.getparent()).remove(img)
            else:
                # resolve the relative path inside the image directory and
                # downscale in place; the context manager closes the file handle
                # (the original leaked it and shadowed the loop variable)
                fpath = path.join(ctx.imagedir, url.replace("images/", ""))
                with Image.open(fpath) as picture:
                    picture.thumbnail((1000, 2000))
                    picture.save(fpath)
        if ctx.extern_links and ctx.chapter.url:
            factory.externalize_links(ctx.chapter.url, ctx.etree)
# Load the per-language manifest (list of volumes) sitting next to this script.
# encoding is pinned to UTF-8 so non-ASCII titles decode correctly everywhere.
with open(f"{path.dirname(__file__)}/{args.lang}.json", encoding="utf-8") as f:
    manifest = load(f)
book = manifest[args.volume - 1]  # CLI volumes are 1-indexed
epub = Epub(
    title=book["title"],
    creator=book["creator"],
    language=args.lang,
    publisher="Nanamazon+",
    factory=MyChapterFactory(),
)
# Download every chapter page, extract its paragraphs, and add it to the EPUB.
for url in book["chapters"]:
    # impersonate a real Firefox build to get past bot protection
    markup = requests.get(
        url,
        impersonate=requests.BrowserType.firefox133.value,
        thread=None,
        curl_options=None,
        debug=False,
    )
    soup = BeautifulSoup(markup.text, "html.parser")
    heading = soup.select_one("h1.entry-title")
    if heading is None:
        # fail loudly with the offending URL instead of an opaque AttributeError
        raise RuntimeError(f"no chapter title found at {url}")
    title = heading.text
    content = soup.select("div.entry-content p")
    print(f"Chapter {title}...")
    chapter = Chapter(title, "")
    # paragraphs containing links are navigation/ads — skip them;
    # join once instead of quadratic string concatenation
    chapter.content = "".join(
        elem.prettify() for elem in content if not elem.a
    )
    epub.add_chapter(chapter)
    # randomized delay between requests to stay under the site's rate limits
    delay = randint(1, 29)
    print(f"Wait {delay} seconds...")
    sleep(delay)
# Download the cover into a temp file that outlives epub.create().
with NamedTemporaryFile() as cover:
    response = requests.get(book["cover"], thread=None, curl_options=None, debug=False)
    cover.write(response.content)
    cover.flush()  # ensure bytes are on disk before pypub re-opens cover.name
    epub.cover = cover.name
    epub.create(args.output)