from argparse import ArgumentParser
from json import load
from os import path
from pathlib import Path
from random import randint
from tempfile import NamedTemporaryFile
from time import sleep
from typing import cast
from urllib import error, parse
from uuid import uuid4

from bs4 import BeautifulSoup
from curl_cffi import requests
from pypub import (  # type: ignore
    Chapter,
    Epub,
    SimpleChapterFactory,
    factory,
)
from pyxml.html import HtmlElement, fromstring  # type: ignore

parser = ArgumentParser()
parser.add_argument("-l", "--lang", choices=["fr", "en"], required=True)
parser.add_argument("-v", "--volume", type=int, choices=range(1, 10), required=True)
parser.add_argument("output", type=Path)
args = parser.parse_args()
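
# example invocation (the script filename here is hypothetical):
#   python build_epub.py -l en -v 3 output.epub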


class MyChapterFactory(SimpleChapterFactory):  # type: ignore
    def cleanup_html(self, content: bytes) -> HtmlElement:
        """
        cleanup html content to only include supported tags
        """
        # parse the raw bytes into an element-tree
        etree = fromstring(content)
        # fix and remove invalid images
        for img in etree.xpath(".//img"):
            # ensure all images with no src are removed
            if "src" not in img.attrib:
                cast(HtmlElement, img.getparent()).remove(img)
            # ensure they also have an alt
            elif "alt" not in img.attrib:
                img.attrib["alt"] = img.attrib["src"]
        # return new element-tree
        return etree
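
    # e.g. an <img> with no src is dropped entirely, while <img src="x.png">
    # becomes <img src="x.png" alt="x.png">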

    def hydrate(self, ctx: factory.RenderCtx) -> None:
        """
        modify chapter element-tree to render images
        """
        downloads: dict[str, str] = {}
        for image in ctx.etree.xpath(".//img[@src]"):
            # strip any query-string and resolve relative paths
            url = image.attrib["src"].rsplit("?", 1)[0]
            fmt = (ctx.chapter.title, url)
            if "://" not in url:
                if not ctx.chapter.url:
                    ctx.logger.warning(
                        "chapter[{}] cannot render image {!r} w/o chapter-url".format(
                            *fmt
                        )
                    )
                    continue
                url = parse.urljoin(ctx.chapter.url, url)
                fmt = (ctx.chapter.title, url)
            # reuse the cached copy if the url has already been downloaded
            if url in downloads:
                image.attrib["src"] = downloads[url]
                continue
            # download url into local image folder for epub
            ctx.logger.debug("chapter[{}] downloading image: {!r}".format(*fmt))
            try:
                res = requests.get(
                    url,
                    timeout=ctx.timeout,
                    impersonate=requests.BrowserType.firefox133.value,
                    thread=None,
                    curl_options=None,
                    debug=False,
                )
                # ensure status of response is valid (curl_cffi mirrors the
                # requests API, so the attribute is status_code, not status)
                status = getattr(res, "status_code", None)
                if status and status != 200:
                    raise error.URLError(f"status: {status}")
                # read the whole body and use it to determine the mime-type
                data = res.content
                mime = factory.mime_type(url, data)
                if not mime:
                    ctx.logger.warning(
                        "chapter[{}] cannot identify {!r} mime".format(*fmt)
                    )
                    continue
                fname = f"image-{uuid4()}.{mime}"
                fpath = path.join(ctx.imagedir, fname)
                # write the downloaded bytes into the image file
                with open(fpath, "wb") as f:
                    f.write(data)
                # save epub-path in downloads cache and update image attribs
                epub_path = path.join("images/", fname)
                downloads[url] = epub_path
                image.attrib["src"] = epub_path
            except error.URLError:
                ctx.logger.error("chapter[{}] failed to download {!r}".format(*fmt))
        if ctx.extern_links and ctx.chapter.url:
            self.externalize_links(ctx.chapter.url, ctx.etree)


with open(f"{path.dirname(__file__)}/{args.lang}.json") as f:
    manifest = load(f)
book = manifest[args.volume - 1]
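# the manifest is assumed (from the fields used below) to be a list of
# volume objects, e.g.:
#   [{"title": "...", "creator": "...", "cover": "https://...",
#     "chapters": ["https://...", "https://..."]}]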

epub = Epub(
    title=book["title"],
    creator=book["creator"],
    language=args.lang,
    publisher="Nanamazon+",
    factory=MyChapterFactory(),
)
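# the custom factory above is what makes pypub run cleanup_html/hydrate on
# every chapter added below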

for url in book["chapters"]:
    markup = requests.get(
        url,
        impersonate=requests.BrowserType.firefox133.value,
        thread=None,
        curl_options=None,
        debug=False,
    )
    soup = BeautifulSoup(markup.text, "html.parser")
    title = soup.select_one("h1.entry-title").text  # type: ignore
    content = soup.select("div.entry-content p")
    print(f"Chapter {title}...")
    chapter = Chapter(title, "")
    for elem in content:
        # skip paragraphs that contain links (e.g. previous/next navigation)
        if elem.a:
            continue
        chapter.content += elem.prettify()
    epub.add_chapter(chapter)
    # wait a random delay between chapters to avoid hammering the site
    delay = randint(1, 29)
    print(f"Wait {delay} seconds...")
    sleep(delay)

with NamedTemporaryFile() as cover:
    response = requests.get(book["cover"], thread=None, curl_options=None, debug=False)
    cover.write(response.content)
    # flush so the bytes are on disk before pypub reads the file back by name
    cover.flush()
    epub.cover = cover.name
    epub.create(args.output)