fix: 💄 add image support behind cloudflare

2025-02-13 19:38:48 +00:00 · 2025-02-13 19:38:48 +00:00 · 9127bdd8ae
commit 9127bdd8ae
parent 14b12e3c28
1 changed files with 69 additions and 2 deletions
--- a/auberge_vagabonde/main.py
+++ b/auberge_vagabonde/main.py
@ -6,10 +6,17 @@ from random import randint
 from tempfile import NamedTemporaryFile
 from time import sleep
 from typing import cast
+from urllib import error, parse
+from uuid import uuid4

 from bs4 import BeautifulSoup
 from curl_cffi import requests
-from pypub import Chapter, Epub, SimpleChapterFactory  # type: ignore
+from pypub import (  # type: ignore
+    Chapter,
+    Epub,
+    SimpleChapterFactory,
+    factory,
+)
 from pyxml.html import HtmlElement, fromstring  # type: ignore

 parser = ArgumentParser()
@ -37,6 +44,66 @@ class MyChapterFactory(SimpleChapterFactory):  # type: ignore
        # return new element-tree
        return etree

+    def hydrate(self, ctx: factory.RenderCtx) -> None:
+        """
+        download chapter images via curl_cffi and rewrite <img src> to local copies
+
+        :param ctx: render context; reads etree, chapter, logger, timeout,
+                    imagedir and extern_links from it and edits etree in place
+        """
+        downloads: dict[str, str] = {}
+        for image in ctx.etree.xpath(".//img[@src]"):
+            # cleanup link and resolve relative paths
+            url = image.attrib["src"].rsplit("?", 1)[0]
+            fmt = (ctx.chapter.title, url)
+            if "://" not in url:
+                if not ctx.chapter.url:
+                    msg = "chapter[{}] cannot render image {!r} w/o chapter-url"
+                    ctx.logger.warning(msg.format(*fmt))
+                    continue
+                url = parse.urljoin(ctx.chapter.url, url)
+                fmt = (ctx.chapter.title, url)
+            # skip if url has already been downloaded
+            if url in downloads:
+                image.attrib["src"] = downloads[url]
+                continue
+            # download url into local image folder for epub
+            ctx.logger.debug("chapter[{}] downloading image: {!r}".format(*fmt))
+            try:
+                res = requests.get(
+                    url,
+                    timeout=ctx.timeout,
+                    impersonate=requests.BrowserType.firefox133.value,
+                    thread=None,
+                    curl_options=None,
+                    debug=False,
+                )
+                # curl_cffi responses carry `status_code` (there is no `status`)
+                if res.status_code != 200:
+                    raise error.URLError(f"status: {res.status_code}")
+                # body is fully buffered; sniff it to determine the image type
+                chunk = res.content
+                mime = factory.mime_type(url, chunk)
+                if not mime:
+                    ctx.logger.warning(
+                        "chapter[{}] cannot identify {!r} mime".format(*fmt)
+                    )
+                    continue
+                fname = f"image-{uuid4()}.{mime}"
+                fpath = path.join(ctx.imagedir, fname)
+                # write the downloaded bytes into the associated file
+                with open(fpath, "wb") as f:
+                    f.write(chunk)
+                # save epub-path in downloads cache and update image attribs
+                epub_path = path.join("images/", fname)
+                downloads[url] = epub_path
+                image.attrib["src"] = epub_path
+            except (error.URLError, requests.RequestsError):
+                # transport failures raise curl_cffi's RequestsError, not URLError
+                ctx.logger.error("chapter[{}] failed to download {!r}".format(*fmt))
+        if ctx.extern_links and ctx.chapter.url:
+            self.externalize_links(ctx.chapter.url, ctx.etree)
+

 with open(f"{path.dirname(__file__)}/{args.lang}.json") as f:
    manifest = load(f)
@ -68,7 +135,7 @@ for url in book["chapters"]:
            continue
        chapter.content += elem.prettify()
    epub.add_chapter(chapter)
-    timeout = randint(10, 90)
+    timeout = randint(1, 29)
    print(f"Wait {timeout} seconds...")
    sleep(timeout)