From 9127bdd8ae565e607aa299ce6ad5ab1cd36c0932 Mon Sep 17 00:00:00 2001
From: Michel Roux
Date: Thu, 13 Feb 2025 19:38:48 +0000
Subject: [PATCH] fix: :lipstick: add image support behind cloudflare

---
 auberge_vagabonde/__main__.py | 77 ++++++++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 2 deletions(-)

diff --git a/auberge_vagabonde/__main__.py b/auberge_vagabonde/__main__.py
index e6942dc..799ef40 100644
--- a/auberge_vagabonde/__main__.py
+++ b/auberge_vagabonde/__main__.py
@@ -6,10 +6,17 @@ from random import randint
 from tempfile import NamedTemporaryFile
 from time import sleep
 from typing import cast
+from urllib import error, parse
+from uuid import uuid4
 
 from bs4 import BeautifulSoup
 from curl_cffi import requests
-from pypub import Chapter, Epub, SimpleChapterFactory  # type: ignore
+from pypub import (  # type: ignore
+    Chapter,
+    Epub,
+    SimpleChapterFactory,
+    factory,
+)
 from pyxml.html import HtmlElement, fromstring  # type: ignore
 
 parser = ArgumentParser()
@@ -37,6 +44,72 @@ class MyChapterFactory(SimpleChapterFactory):  # type: ignore
         # return new element-tree
         return etree
 
+    def hydrate(self, ctx: factory.RenderCtx) -> None:
+        """
+        modify chapter element-tree to render images
+
+        each img[src] is fetched through curl_cffi with a browser
+        impersonation, written into ctx.imagedir and its src rewritten to
+        the local images/ path; failed downloads are logged and skipped
+        """
+        downloads: dict[str, str] = {}
+        for image in ctx.etree.xpath(".//img[@src]"):
+            # cleanup link and resolve relative paths
+            url = image.attrib["src"].rsplit("?", 1)[0]
+            fmt = (ctx.chapter.title, url)
+            if "://" not in url:
+                if not ctx.chapter.url:
+                    ctx.logger.warning(
+                        "chapter[{}] cannot render image {!r} w/o chapter-url".format(
+                            *fmt
+                        )
+                    )
+                    continue
+                url = parse.urljoin(ctx.chapter.url, url)
+                fmt = (ctx.chapter.title, url)
+            # skip if url has already been downloaded
+            if url in downloads:
+                image.attrib["src"] = downloads[url]
+                continue
+            # download url into local image folder for epub
+            ctx.logger.debug("chapter[{}] downloading image: {!r}".format(*fmt))
+            try:
+                res = requests.get(
+                    url,
+                    timeout=ctx.timeout,
+                    impersonate=requests.BrowserType.firefox133.value,
+                    thread=None,
+                    curl_options=None,
+                    debug=False,
+                )
+                # ensure status of response is valid
+                # (curl_cffi responses expose `status_code`, not `status`)
+                status = res.status_code
+                if status != 200:
+                    raise error.URLError(f"status: {status}")
+                # whole body is already in memory; sniff it for content-type
+                chunk = res.content
+                mime = factory.mime_type(url, chunk)
+                if not mime:
+                    ctx.logger.warning(
+                        "chapter[{}] cannot identify {!r} mime".format(*fmt)
+                    )
+                    continue
+                fname = f"image-{uuid4()}.{mime}"
+                fpath = path.join(ctx.imagedir, fname)
+                # write the content into associated file
+                with open(fpath, "wb") as f:
+                    f.write(chunk)
+                # save epub-path in downloads cache and update image attribs
+                epub_path = path.join("images/", fname)
+                downloads[url] = epub_path
+                image.attrib["src"] = epub_path
+            except (error.URLError, requests.RequestsError):
+                # curl_cffi raises its own RequestsError on transport failure
+                ctx.logger.error("chapter[{}] failed to download {!r}".format(*fmt))
+        if ctx.extern_links and ctx.chapter.url:
+            self.externalize_links(ctx.chapter.url, ctx.etree)
+
 
 with open(f"{path.dirname(__file__)}/{args.lang}.json") as f:
     manifest = load(f)
@@ -68,7 +141,7 @@ for url in book["chapters"]:
             continue
         chapter.content += elem.prettify()
     epub.add_chapter(chapter)
-    timeout = randint(10, 90)
+    timeout = randint(1, 29)
     print(f"Wait {timeout} seconds...")
     sleep(timeout)
 