fix: 💄 add image support behind cloudflare
This commit is contained in:
parent
14b12e3c28
commit
9127bdd8ae
@ -6,10 +6,17 @@ from random import randint
|
||||
from tempfile import NamedTemporaryFile
|
||||
from time import sleep
|
||||
from typing import cast
|
||||
from urllib import error, parse
|
||||
from uuid import uuid4
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from curl_cffi import requests
|
||||
from pypub import Chapter, Epub, SimpleChapterFactory # type: ignore
|
||||
from pypub import ( # type: ignore
|
||||
Chapter,
|
||||
Epub,
|
||||
SimpleChapterFactory,
|
||||
factory,
|
||||
)
|
||||
from pyxml.html import HtmlElement, fromstring # type: ignore
|
||||
|
||||
parser = ArgumentParser()
|
||||
@ -37,6 +44,66 @@ class MyChapterFactory(SimpleChapterFactory): # type: ignore
|
||||
# return new element-tree
|
||||
return etree
|
||||
|
||||
def hydrate(self, ctx: factory.RenderCtx) -> None:
    """
    modify chapter element-tree to render images

    Every ``<img src=...>`` found in ``ctx.etree`` is fetched through
    curl_cffi (impersonating Firefox), written into ``ctx.imagedir`` and its
    ``src`` attribute rewritten to the epub-relative ``images/<file>`` path.
    Images that cannot be resolved, downloaded or identified are logged and
    left untouched so that a single bad image never aborts the chapter.

    :param ctx: render context; this method reads its ``etree``, ``chapter``,
                ``logger``, ``timeout``, ``imagedir`` and ``extern_links``
    """
    # absolute url -> epub path, so a repeated image is fetched only once
    downloads: dict[str, str] = {}
    for image in ctx.etree.xpath(".//img[@src]"):
        # cleanup link (drop query-string) and resolve relative paths
        url = image.attrib["src"].rsplit("?", 1)[0]
        fmt = (ctx.chapter.title, url)
        if "://" not in url:
            if not ctx.chapter.url:
                ctx.logger.warning(
                    "chapter[{}] cannot render image {!r} w/o chapter-url".format(
                        *fmt
                    )
                )
                continue
            url = parse.urljoin(ctx.chapter.url, url)
            fmt = (ctx.chapter.title, url)
        # skip if url has already been downloaded
        if url in downloads:
            image.attrib["src"] = downloads[url]
            continue
        # download url into local image folder for epub
        ctx.logger.debug("chapter[{}] downloading image: {!r}".format(*fmt))
        try:
            res = requests.get(
                url,
                timeout=ctx.timeout,
                impersonate=requests.BrowserType.firefox133.value,
                thread=None,
                curl_options=None,
                debug=False,
            )
            # ensure status of response is valid; curl_cffi exposes the code
            # as `status_code` (not urllib's `status`), and without this
            # check an error/challenge page would be stored as an image
            if res.status_code != 200:
                raise error.URLError(f"status: {res.status_code}")
            # `content` is the complete response body, already in memory
            content = res.content
            mime = factory.mime_type(url, content)
            if not mime:
                ctx.logger.warning(
                    "chapter[{}] cannot identify {!r} mime".format(*fmt)
                )
                continue
            fname = f"image-{uuid4()}.{mime}"
            fpath = path.join(ctx.imagedir, fname)
            # write the downloaded body into the associated file
            with open(fpath, "wb") as f:
                f.write(content)
            # save epub-path in downloads cache and update image attribs
            epub_path = path.join("images/", fname)
            downloads[url] = epub_path
            image.attrib["src"] = epub_path
        except (error.URLError, requests.RequestsError):
            # URLError: bad status raised above; RequestsError: transport
            # failures (dns, timeout, tls, ...) raised by curl_cffi itself
            ctx.logger.error("chapter[{}] failed to download {!r}".format(*fmt))
    if ctx.extern_links and ctx.chapter.url:
        self.externalize_links(ctx.chapter.url, ctx.etree)
|
||||
|
||||
|
||||
# Load the manifest selected by `args.lang` from the directory of this script.
# The encoding is pinned because JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding.
with open(f"{path.dirname(__file__)}/{args.lang}.json", encoding="utf-8") as f:
    manifest = load(f)
|
||||
@ -68,7 +135,7 @@ for url in book["chapters"]:
|
||||
continue
|
||||
chapter.content += elem.prettify()
|
||||
epub.add_chapter(chapter)
|
||||
timeout = randint(10, 90)
|
||||
timeout = randint(1, 29)
|
||||
print(f"Wait {timeout} seconds...")
|
||||
sleep(timeout)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user