from argparse import ArgumentParser
from json import load
from os import path
from pathlib import Path
from random import randint
from tempfile import NamedTemporaryFile
from time import sleep
from typing import cast
from urllib import error, parse
from uuid import uuid4

from bs4 import BeautifulSoup
from curl_cffi import requests
from pypub import (  # type: ignore
    Chapter,
    Epub,
    SimpleChapterFactory,
    factory,
)
from pyxml.html import HtmlElement, fromstring  # type: ignore

parser = ArgumentParser()
parser.add_argument("-l", "--lang", choices=["fr", "en"], required=True)
parser.add_argument("-v", "--volume", type=int, choices=range(1, 10), required=True)
parser.add_argument("output", type=Path)
args = parser.parse_args()
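
# example invocation (the script filename here is hypothetical):
#   python build_epub.py -l en -v 3 output.epub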


class MyChapterFactory(SimpleChapterFactory):  # type: ignore
    def cleanup_html(self, content: bytes) -> HtmlElement:
        """
        cleanup html content to only include supported tags
        """
        # parse the raw bytes into an element-tree
        etree = fromstring(content)
        # fix and remove invalid images
        for img in etree.xpath(".//img"):
            # ensure all images with no src are removed
            if "src" not in img.attrib:
                cast(HtmlElement, img.getparent()).remove(img)
            # ensure they also have an alt
            elif "alt" not in img.attrib:
                img.attrib["alt"] = img.attrib["src"]
        # return new element-tree
        return etree
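
    # e.g. an <img> with no src is dropped entirely, while <img src="x.png">
    # becomes <img src="x.png" alt="x.png">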

    def hydrate(self, ctx: factory.RenderCtx) -> None:
        """
        modify chapter element-tree to render images
        """
        downloads: dict[str, str] = {}
        for image in ctx.etree.xpath(".//img[@src]"):
            # strip any query-string and resolve relative paths
            url = image.attrib["src"].rsplit("?", 1)[0]
            fmt = (ctx.chapter.title, url)
            if "://" not in url:
                if not ctx.chapter.url:
                    ctx.logger.warning(
                        "chapter[{}] cannot render image {!r} w/o chapter-url".format(
                            *fmt
                        )
                    )
                    continue
                url = parse.urljoin(ctx.chapter.url, url)
                fmt = (ctx.chapter.title, url)
            # reuse the cached copy if the url has already been downloaded
            if url in downloads:
                image.attrib["src"] = downloads[url]
                continue
            # download url into local image folder for epub
            ctx.logger.debug("chapter[{}] downloading image: {!r}".format(*fmt))
            try:
                res = requests.get(
                    url,
                    timeout=ctx.timeout,
                    impersonate=requests.BrowserType.firefox133.value,
                    thread=None,
                    curl_options=None,
                    debug=False,
                )
                # ensure status of response is valid (curl_cffi mirrors the
                # requests API, so the attribute is status_code, not status)
                status = getattr(res, "status_code", None)
                if status and status != 200:
                    raise error.URLError(f"status: {status}")
                # read the whole body and use it to determine the mime-type
                data = res.content
                mime = factory.mime_type(url, data)
                if not mime:
                    ctx.logger.warning(
                        "chapter[{}] cannot identify {!r} mime".format(*fmt)
                    )
                    continue
                fname = f"image-{uuid4()}.{mime}"
                fpath = path.join(ctx.imagedir, fname)
                # write the downloaded bytes into the image file
                with open(fpath, "wb") as f:
                    f.write(data)
                # save epub-path in downloads cache and update image attribs
                epub_path = path.join("images/", fname)
                downloads[url] = epub_path
                image.attrib["src"] = epub_path
            except error.URLError:
                ctx.logger.error("chapter[{}] failed to download {!r}".format(*fmt))
        if ctx.extern_links and ctx.chapter.url:
            self.externalize_links(ctx.chapter.url, ctx.etree)


with open(f"{path.dirname(__file__)}/{args.lang}.json") as f:
    manifest = load(f)
book = manifest[args.volume - 1]
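# the manifest is assumed (from the fields used below) to be a list of
# volume objects, e.g.:
#   [{"title": "...", "creator": "...", "cover": "https://...",
#     "chapters": ["https://...", "https://..."]}]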

epub = Epub(
    title=book["title"],
    creator=book["creator"],
    language=args.lang,
    publisher="Nanamazon+",
    factory=MyChapterFactory(),
)
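# the custom factory above is what makes pypub run cleanup_html/hydrate on
# every chapter added below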

for url in book["chapters"]:
    markup = requests.get(
        url,
        impersonate=requests.BrowserType.firefox133.value,
        thread=None,
        curl_options=None,
        debug=False,
    )
    soup = BeautifulSoup(markup.text, "html.parser")
    title = soup.select_one("h1.entry-title").text  # type: ignore
    content = soup.select("div.entry-content p")
    print(f"Chapter {title}...")
    chapter = Chapter(title, "")
    for elem in content:
        # skip paragraphs that contain links (e.g. previous/next navigation)
        if elem.a:
            continue
        chapter.content += elem.prettify()
    epub.add_chapter(chapter)
    # wait a random delay between chapters to avoid hammering the site
    delay = randint(1, 29)
    print(f"Wait {delay} seconds...")
    sleep(delay)

with NamedTemporaryFile() as cover:
    response = requests.get(book["cover"], thread=None, curl_options=None, debug=False)
    cover.write(response.content)
    # flush so the bytes are on disk before pypub reads the file back by name
    cover.flush()
    epub.cover = cover.name
    epub.create(args.output)