From 9127bdd8ae565e607aa299ce6ad5ab1cd36c0932 Mon Sep 17 00:00:00 2001
From: Michel Roux <xefir@crystalyx.net>
Date: Thu, 13 Feb 2025 19:38:48 +0000
Subject: [PATCH] fix: :lipstick: add image support behind cloudflare

---
 auberge_vagabonde/__main__.py | 71 ++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 2 deletions(-)

diff --git a/auberge_vagabonde/__main__.py b/auberge_vagabonde/__main__.py
index e6942dc..799ef40 100644
--- a/auberge_vagabonde/__main__.py
+++ b/auberge_vagabonde/__main__.py
@@ -6,10 +6,17 @@ from random import randint
 from tempfile import NamedTemporaryFile
 from time import sleep
 from typing import cast
+from urllib import error, parse
+from uuid import uuid4
 
 from bs4 import BeautifulSoup
 from curl_cffi import requests
-from pypub import Chapter, Epub, SimpleChapterFactory  # type: ignore
+from pypub import (  # type: ignore
+    Chapter,
+    Epub,
+    SimpleChapterFactory,
+    factory,
+)
 from pyxml.html import HtmlElement, fromstring  # type: ignore
 
 parser = ArgumentParser()
@@ -37,6 +44,66 @@ class MyChapterFactory(SimpleChapterFactory):  # type: ignore
         # return new element-tree
         return etree
 
+    def hydrate(self, ctx: factory.RenderCtx) -> None:
+        """
+        download chapter images with curl_cffi and rewrite each src to the local copy
+        """
+        downloads: dict[str, str] = {}
+        for image in ctx.etree.xpath(".//img[@src]"):
+            # cleanup link and resolve relative paths
+            url = image.attrib["src"].rsplit("?", 1)[0]
+            fmt = (ctx.chapter.title, url)
+            if "://" not in url:
+                if not ctx.chapter.url:
+                    ctx.logger.warning(
+                        "chapter[{}] cannot render image {!r} w/o chapter-url".format(
+                            *fmt
+                        )
+                    )
+                    continue
+                url = parse.urljoin(ctx.chapter.url, url)
+                fmt = (ctx.chapter.title, url)
+            # skip if url has already been downloaded
+            if url in downloads:
+                image.attrib["src"] = downloads[url]
+                continue
+            # download url into local image folder for epub
+            ctx.logger.debug("chapter[{}] downloading image: {!r}".format(*fmt))
+            try:
+                res = requests.get(
+                    url,
+                    timeout=ctx.timeout,
+                    impersonate=requests.BrowserType.firefox133.value,
+                    thread=None,
+                    curl_options=None,
+                    debug=False,
+                )
+                # curl_cffi exposes `status_code`; non-200 goes to the except below
+                status = res.status_code
+                if status != 200:
+                    raise error.URLError(f"status: {status}")
+                # whole body is already in memory; sniff its content-type
+                chunk = res.content
+                mime = factory.mime_type(url, chunk)
+                if not mime:
+                    ctx.logger.warning(
+                        "chapter[{}] cannot identify {!r} mime".format(*fmt)
+                    )
+                    continue
+                fname = f"image-{uuid4()}.{mime}"
+                fpath = path.join(ctx.imagedir, fname)
+                # write the downloaded body into the epub image folder
+                with open(fpath, "wb") as f:
+                    f.write(chunk)
+                # save epub-path in downloads cache and update image attribs
+                epub_path = path.join("images/", fname)
+                downloads[url] = epub_path
+                image.attrib["src"] = epub_path
+            except (error.URLError, requests.RequestsError):
+                ctx.logger.error("chapter[{}] failed to download {!r}".format(*fmt))
+        if ctx.extern_links and ctx.chapter.url:
+            self.externalize_links(ctx.chapter.url, ctx.etree)
+
 
 with open(f"{path.dirname(__file__)}/{args.lang}.json") as f:
     manifest = load(f)
@@ -68,7 +135,7 @@ for url in book["chapters"]:
             continue
         chapter.content += elem.prettify()
     epub.add_chapter(chapter)
-    timeout = randint(10, 90)
+    timeout = randint(1, 29)
     print(f"Wait {timeout} seconds...")
     sleep(timeout)