import logging

import bs4
import executor
import pypub
import requests

# Seconds to wait on any single HTTP request; without a timeout a stalled
# server would block the whole run indefinitely.
REQUEST_TIMEOUT = 30

# Tag names stripped from every chapter body before it is added to the book.
_UNWANTED_TAGS = ("a", "h3", "hr", "img", "iframe", "script")


def _fetch_html(url):
    """Download *url* and return its body parsed with ``html.parser``.

    Raises:
        requests.RequestException: on connection problems, on timeout, or
            when the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of parsing an error page as if it were content.
    response.raise_for_status()
    return bs4.BeautifulSoup(response.content, "html.parser")


def fetchVolume(title, metadata, volume_title, output_dir, links):
    """Download the chapters behind *links* and bundle them into one EPUB.

    Args:
        title: Overall series/site title; first part of the book title.
        metadata: Mapping expanded as keyword arguments to ``pypub.Epub``.
        volume_title: Title of this volume. When empty or ``None`` the book
            is named after *title* alone.
        output_dir: Passed unchanged to ``book.create_epub``.
        links: Iterable of objects exposing ``.get("href")`` (``bs4`` anchor
            tags in practice). Entries without an ``href`` are skipped.

    Raises:
        requests.RequestException: if a chapter page cannot be downloaded.
        AttributeError: if a chapter page lacks ``div.entry-content`` or
            ``h1.entry-title`` (``select_one`` returns ``None`` then).
    """
    # Avoid a literal "None" in the book title when no volume heading was seen.
    book_title = f"{title} - {volume_title}" if volume_title else title
    book = pypub.Epub(book_title, **metadata)

    for link in links:
        href = link.get("href")
        if not href:
            # Named anchors and similar carry no target to download.
            logging.warning("%s - skipping link without href", book_title)
            continue

        chapter_html = _fetch_html(href)
        chapter_content = chapter_html.select_one("div.entry-content")
        chapter_title = (
            chapter_html.select_one("h1.entry-title").get_text().strip()
        )

        # Collect every unwanted node first, then remove them, so that
        # removing a parent does not disturb the search for other nodes.
        to_remove = [chapter_content.find_all(tag) for tag in _UNWANTED_TAGS]
        to_remove.append(chapter_content.select("div.tiled-gallery"))
        for dataset in to_remove:
            for removed in dataset:
                removed.decompose()

        logging.info("%s - %s", title, chapter_title)
        book.add_chapter(
            pypub.create_chapter_from_string(
                chapter_content.prettify(), chapter_title
            )
        )

    logging.info("%s", book_title)
    book.create_epub(output_dir)


@executor.executor
def process(metadata, output_dir, url):
    """Scrape the index page at *url* and build one EPUB per link paragraph.

    The direct ``<p>`` children of ``div.entry-content`` are walked in order.
    A paragraph containing ``<strong>`` sets the current volume title; any
    other paragraph that contains links is handed to ``fetchVolume`` under
    the most recently seen volume title.

    Args:
        metadata: Forwarded to ``fetchVolume``.
        output_dir: Forwarded to ``fetchVolume``.
        url: Address of the index (table of contents) page.

    Raises:
        requests.RequestException: if the index page cannot be downloaded.
        AttributeError: if the page lacks ``#site-title > span > a``.
    """
    html = _fetch_html(url)
    title = html.select_one("#site-title > span > a").get_text().strip()

    volume_title = None
    for paragraph in html.select("div.entry-content > p"):
        if paragraph.strong is not None:
            volume_title = paragraph.strong.get_text().strip()
            continue

        # Only anchors that actually point somewhere count as chapters.
        links = paragraph.find_all("a", href=True)
        if not links:
            # Plain text paragraphs would otherwise yield an empty EPUB.
            continue

        fetchVolume(title, metadata, volume_title, output_dir, links)