# Auberge_Vagabonde/twi.py

import logging

import bs4
import requests
from ebooklib import epub  # type: ignore

import utils
def fetchVolume(title, description, metadatas, volume_title, output_dir, links):
    # Build a single EPUB volume from the given list of chapter links.
    book = epub.EpubBook()
    book_items = []
    book.set_identifier(title.replace(" ", "").lower())
    book.set_title(title)
    book.set_language(metadatas["language"])
    with open(metadatas["cover"], "rb") as cover_file:
        book.set_cover("cover.jpg", cover_file.read())
    book.add_metadata("DC", "description", description)
    for author in metadatas["authors"]:
        book.add_author(author)

    chapter_index = 1
    for link in links:
        chapter_response = requests.get(link["href"])
        chapter_html = bs4.BeautifulSoup(chapter_response.content, "html.parser")
        chapter_content = chapter_html.select_one("div.entry-content")
        chapter_title = chapter_html.select_one("h1.entry-title").get_text().strip()

        # Strip navigation links, sub-headings, separators, embeds, scripts
        # and tiled galleries from the chapter body before packing it.
        to_remove = [
            chapter_content.find_all("a"),
            chapter_content.find_all("h3"),
            chapter_content.find_all("hr"),
            chapter_content.find_all("iframe"),
            chapter_content.find_all("script"),
            chapter_content.select("div.tiled-gallery"),
        ]
        for dataset in to_remove:
            for removed in dataset:
                removed.decompose()

        logging.log(logging.INFO, f"{title} - {chapter_title}")
        book_items.append(
            epub.EpubHtml(
                title=chapter_title,
                content=utils.generate_title_html(
                    chapter_title, chapter_content.prettify()
                ),
                file_name=f"{chapter_index}.xhtml",
                lang=metadatas["language"],
            )
        )
        chapter_index += 1

    logging.log(logging.INFO, f"{title} - {volume_title}")
    for book_item in book_items:
        book.add_item(book_item)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.toc = book_items
    book.spine = ["cover", "nav", *book_items]
    epub.write_epub(f"{output_dir}/{title} - {volume_title}.epub", book)

@utils.executor
def process(metadatas, output_dir, url):
    # Scrape the table-of-contents page: a paragraph containing a <strong>
    # tag names the next volume, and the link-only paragraphs that follow it
    # hold that volume's chapter links.
    response = requests.get(url)
    html = bs4.BeautifulSoup(response.content, "html.parser")
    content = html.select("div.entry-content > p")
    title = html.select_one("#site-title > span > a").get_text().strip()
    description = html.select_one("#site-description").get_text().strip()

    volume_title = None
    for paragraph in content:
        if paragraph.strong is not None:
            volume_title = paragraph.strong.get_text().strip()
        else:
            links = paragraph.find_all("a")
            if links:
                fetchVolume(
                    title,
                    description,
                    metadatas,
                    volume_title,
                    output_dir,
                    links,
                )
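

# Minimal usage sketch, not part of the original module: it assumes that
# utils.executor leaves process callable with (metadatas, output_dir, url)
# positional arguments, and every value below is a hypothetical placeholder.
# The metadata keys shown are the ones fetchVolume actually reads.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    example_metadatas = {
        "language": "en",
        "cover": "cover.jpg",          # hypothetical path to a local cover image
        "authors": ["Author Name"],    # hypothetical author list
    }
    process(example_metadatas, "output", "https://example.wordpress.com/table-of-contents/")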