import logging

import bs4
import requests
from ebooklib import epub

import utils


def _fetch_chapter(href, book_title, language):
    """Download one chapter page and wrap its cleaned body in an EpubHtml item.

    Args:
        href: URL of the chapter page.
        book_title: Title of the book, used only in the log message.
        language: Language code stored on the resulting chapter item.

    Returns:
        An ``epub.EpubHtml`` whose title is the text of the page's
        ``h1.entry-title`` and whose content is produced by
        ``utils.generate_title_html`` from the cleaned ``div.entry-content``.
    """
    chapter_response = requests.get(href)
    chapter_html = bs4.BeautifulSoup(chapter_response.content, "html.parser")
    chapter_content = chapter_html.select_one("div.entry-content")
    chapter_title = chapter_html.select_one("h1.entry-title").get_text().strip()

    # Strip links, h3 headings, horizontal rules and tiled galleries from the
    # chapter body. All matches are collected first, then removed.
    to_remove = [
        chapter_content.find_all("a"),
        chapter_content.find_all("h3"),
        chapter_content.find_all("hr"),
        chapter_content.select("div.tiled-gallery"),
    ]
    for dataset in to_remove:
        for removed in dataset:
            removed.decompose()

    logging.info("%s - %s", book_title, chapter_title)

    return epub.EpubHtml(
        title=chapter_title,
        content=utils.generate_title_html(
            chapter_title, chapter_content.prettify()
        ),
        file_name=f"{chapter_title}.xhtml",
        lang=language,
    )


def fetchVolume(title, description, metadatas, volume_title, output_dir, links):
    """Build one EPUB volume from a list of chapter links and write it to disk.

    Args:
        title: Book title; also used (spaces removed, lower-cased) as the
            EPUB identifier and as part of the output file name.
        description: Text stored as the Dublin Core ``description`` metadata.
        metadatas: Mapping providing ``"language"``, ``"cover"`` (path of the
            cover image, read in binary mode) and ``"authors"`` (iterable of
            author names).
        volume_title: Title of the volume; part of the output file name.
        output_dir: Directory in which the ``.epub`` file is written.
        links: Iterable of items whose ``["href"]`` is a chapter URL.

    The result is written to ``{output_dir}/{title} - {volume_title}.epub``.
    """
    book = epub.EpubBook()

    book.set_identifier(title.replace(" ", "").lower())
    book.set_title(title)
    book.set_language(metadatas["language"])
    # Read the cover inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(metadatas["cover"], "rb") as cover_file:
        book.set_cover("cover.jpg", cover_file.read())
    book.add_metadata("DC", "description", description)

    for author in metadatas["authors"]:
        book.add_author(author)

    book_items = [
        _fetch_chapter(link["href"], title, metadatas["language"])
        for link in links
    ]

    logging.info("%s - %s", title, volume_title)

    for book_item in book_items:
        book.add_item(book_item)

    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    book.toc = book_items
    book.spine = ["cover", "nav", *book_items]

    epub.write_epub(f"{output_dir}/{title} - {volume_title}.epub", book)


@utils.executor
def process(metadatas, output_dir, url):
    """Scrape the index page at *url* and generate an EPUB per link paragraph.

    The page's ``div.entry-content > p`` paragraphs are walked in order. A
    paragraph containing a ``<strong>`` element sets the current volume title;
    any other paragraph that contains links is passed to ``fetchVolume``
    together with the most recently seen volume title (``None`` if no title
    paragraph has been seen yet).

    Args:
        metadatas: Mapping forwarded unchanged to ``fetchVolume``.
        output_dir: Directory forwarded unchanged to ``fetchVolume``.
        url: URL of the index page listing the volumes.
    """
    response = requests.get(url)
    html = bs4.BeautifulSoup(response.content, "html.parser")
    content = html.select("div.entry-content > p")
    title = html.select_one("#site-title > span > a").get_text().strip()
    description = html.select_one("#site-description").get_text().strip()

    volume_title = None

    for paragraph in content:
        if paragraph.strong is not None:
            volume_title = paragraph.strong.get_text().strip()
            continue

        # Only anchors that actually carry an href can be fetched.
        links = paragraph.find_all("a", href=True)
        if not links:
            # A paragraph with no links would produce a chapter-less book and
            # could overwrite the EPUB already written for this volume title.
            continue

        fetchVolume(
            title,
            description,
            metadatas,
            volume_title,
            output_dir,
            links,
        )