From 3f94553e7cd2a94d88bd3f2f46c7fd49957766c5 Mon Sep 17 00:00:00 2001 From: Michel Roux Date: Fri, 14 Feb 2025 08:59:37 +0000 Subject: [PATCH] fix: :zap: reduce image size --- auberge_vagabonde/__main__.py | 75 +++++++---------------------------- poetry.lock | 2 +- pyproject.toml | 1 + 3 files changed, 17 insertions(+), 61 deletions(-) diff --git a/auberge_vagabonde/__main__.py b/auberge_vagabonde/__main__.py index 4c501d1..1693e82 100644 --- a/auberge_vagabonde/__main__.py +++ b/auberge_vagabonde/__main__.py @@ -6,11 +6,10 @@ from random import randint from tempfile import NamedTemporaryFile from time import sleep from typing import cast -from urllib import error, parse -from uuid import uuid4 from bs4 import BeautifulSoup from curl_cffi import requests +from PIL import Image from pypub import ( # type: ignore Chapter, Epub, @@ -44,68 +43,24 @@ class MyChapterFactory(SimpleChapterFactory): # type: ignore # return new element-tree return etree - def render_images(self, ctx: factory.RenderCtx) -> None: - """ - replace global image references w/ local downloaded ones - """ - downloads: dict[str, str] = {} - for image in ctx.etree.xpath(".//img[@src]"): - # cleanup link and resolve relative paths - url = image.attrib["src"].rsplit("?", 1)[0] - fmt = (ctx.chapter.title, url) - if "://" not in url: - if not ctx.chapter.url: - ctx.logger.warning( - "chapter[{}] cannot render image {!r} w/o chapter-url".format( - *fmt - ) - ) - continue - url = parse.urljoin(ctx.chapter.url, url) - fmt = (ctx.chapter.title, url) - # skip if url has already been downloaded - if url in downloads: - image.attrib["src"] = downloads[url] - continue - # download url into local image folder for epub - ctx.logger.debug("chapter[{}] downloading image: {!r}".format(*fmt)) - try: - res = requests.get( - url, - impersonate=requests.BrowserType.firefox133.value, - thread=None, - curl_options=None, - debug=False, - ) - # ensure status of response is valid - status = getattr(res, "status", None) - if status and status != 200: - raise error.URLError(f"status: {status}") - # read first chunk to determine content-type - chunk = res.content - mime = factory.mime_type(url, chunk) - if not mime: - ctx.logger.warning( - "chapter[{}] cannot identify {!r} mime".format(*fmt) - ) - continue - fname = f"image-{uuid4()}.{mime}" - fpath = path.join(ctx.imagedir, fname) - # read rest of the content into associated file - with open(fpath, "wb") as f: - f.write(chunk) - # save epub-path in downloads cache and update image attribs - epub_path = path.join("images/", fname) - downloads[url] = epub_path - image.attrib["src"] = epub_path - except error.URLError: - ctx.logger.error("chapter[{}] failed to download {!r}".format(*fmt)) - def hydrate(self, ctx: factory.RenderCtx) -> None: """ modify chapter element-tree to render images """ - self.render_images(ctx) + ctx.timeout = 100 + factory.render_images(ctx) + for image in ctx.etree.xpath(".//img[@src]"): + # cleanup link and resolve relative paths + url = image.attrib["src"].rsplit("?", 1)[0] + if "://" in url: + # retrieve parent + parent = cast(HtmlElement, image.getparent()) + parent.remove(image) + else: + fpath = path.join(ctx.imagedir, url.replace("images/", "")) + image = Image.open(fpath) + image.thumbnail((1000, 2000)) + image.save(fpath) if ctx.extern_links and ctx.chapter.url: factory.externalize_links(ctx.chapter.url, ctx.etree) diff --git a/poetry.lock b/poetry.lock index c9d1ddd..1f361f1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -556,4 +556,4 @@ files = [ [metadata] lock-version = "2.1" python-versions = "^3.9" -content-hash = "84a41aac6241a5e1eec3635df8c2e51d27506f1098f0e3c45dbed90aa26e5c60" +content-hash = "f6160acfe46ff419f788eda7f0d7ff8bd5ab69b2a6bc188331e2501cb6bd41a8" diff --git a/pyproject.toml b/pyproject.toml index dd294f2..2d1ab4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ license = "WTFPL" python = "^3.9" beautifulsoup4 = "^4.13.3" curl_cffi = "^0.9.0b2" +pillow = "^11.1.0" pypub3 = "^2.0.7" pyxml3 = "^0.0.4"