This repository has been archived on 2024-02-23. You can view files and clone it, but cannot push or open issues or pull requests.
Auberge_Vagabonde/twi.py
Michel Roux 59c9f8710c
All checks were successful
continuous-integration/drone/push Build is passing
Remove iframes and chapter index
2022-08-18 13:43:30 +02:00

88 lines
2.8 KiB
Python

import bs4
import logging
import requests
import utils
from ebooklib import epub
def fetchVolume(title, description, metadatas, volume_title, output_dir, links):
    """Build and write a single EPUB volume from a list of chapter links.

    Args:
        title: Series title; also used (lowercased, spaces removed) as the
            EPUB identifier and in the output file name.
        description: Series description stored as DC metadata.
        metadatas: Dict providing at least "language", "cover" (path to a
            cover image file) and "authors" (iterable of author names).
        volume_title: Title of this volume, used in the output file name.
        output_dir: Directory where the .epub file is written.
        links: Iterable of <a> tags whose "href" points to a chapter page.
    """
    book = epub.EpubBook()
    book_items = []
    book.set_identifier(title.replace(" ", "").lower())
    book.set_title(title)
    book.set_language(metadatas["language"])
    # Context manager so the cover file handle is closed deterministically
    # (the original bare open(...).read() leaked the handle).
    with open(metadatas["cover"], "rb") as cover_file:
        book.set_cover("cover.jpg", cover_file.read())
    book.add_metadata("DC", "description", description)
    for author in metadatas["authors"]:
        book.add_author(author)
    # enumerate replaces the hand-rolled chapter_index counter.
    for chapter_index, link in enumerate(links, start=1):
        chapter_response = requests.get(link["href"])
        chapter_html = bs4.BeautifulSoup(chapter_response.content, "html.parser")
        chapter_content = chapter_html.select_one("div.entry-content")
        chapter_title = chapter_html.select_one("h1.entry-title").get_text().strip()
        # Strip navigation links, headings, separators, embeds and galleries
        # that do not belong in the e-book body.
        to_remove = [
            chapter_content.find_all("a"),
            chapter_content.find_all("h3"),
            chapter_content.find_all("hr"),
            chapter_content.find_all("iframe"),
            chapter_content.find_all("script"),
            chapter_content.select("div.tiled-gallery"),
        ]
        for dataset in to_remove:
            for removed in dataset:
                removed.decompose()
        # Lazy %-style args avoid string formatting when INFO is disabled.
        logging.info("%s - %s", title, chapter_title)
        book_items.append(
            epub.EpubHtml(
                title=chapter_title,
                content=utils.generate_title_html(
                    chapter_title, chapter_content.prettify()
                ),
                file_name=f"{chapter_index}.xhtml",
                lang=metadatas["language"],
            )
        )
    logging.info("%s - %s", title, volume_title)
    for book_item in book_items:
        book.add_item(book_item)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.toc = book_items
    book.spine = ["cover", "nav", *book_items]
    epub.write_epub(f"{output_dir}/{title} - {volume_title}.epub", book)
@utils.executor
def process(metadatas, output_dir, url):
    """Scrape the table-of-contents page at *url* and emit one EPUB per volume.

    Paragraphs containing a <strong> tag introduce a new volume title; every
    other paragraph carries the chapter links belonging to the most recently
    seen volume, which are handed to fetchVolume.
    """
    page = bs4.BeautifulSoup(requests.get(url).content, "html.parser")
    title = page.select_one("#site-title > span > a").get_text().strip()
    description = page.select_one("#site-description").get_text().strip()
    current_volume = None
    for paragraph in page.select("div.entry-content > p"):
        heading = paragraph.strong
        if heading is not None:
            # A bold paragraph names the next volume; remember it and move on.
            current_volume = heading.get_text().strip()
            continue
        fetchVolume(
            title,
            description,
            metadatas,
            current_volume,
            output_dir,
            paragraph.find_all("a"),
        )