"""Scrape chapter pages linked from an index page into EPUB volumes."""

import asyncio
import executor

import bs4
import pypub
import requests


def fetchVolume(title, metadata, volume_title, output_dir, links):
    """Download every chapter of one volume and write it out as an EPUB.

    Args:
        title: Series title; combined with ``volume_title`` for the book
            title and used as the prefix of the progress messages.
        metadata: Iterable of extra positional arguments forwarded to
            ``pypub.Epub`` after the book title.
        volume_title: Name of the volume being assembled.
        output_dir: Directory handed to ``book.create_epub``.
        links: Iterable of anchor elements; each must support
            ``link["href"]`` and yield the chapter URL.

    Raises:
        AttributeError: If a chapter page lacks ``div.entry-content`` or
            ``h1.entry-title`` (``select_one`` returns ``None``).
    """
    book = pypub.Epub(f"{title} - {volume_title}", *metadata)

    for link in links:
        chapter_response = requests.get(link["href"])
        chapter_html = bs4.BeautifulSoup(chapter_response.content, "html.parser")
        chapter_content = chapter_html.select_one("div.entry-content")
        chapter_title = chapter_html.select_one("h1.entry-title").get_text()

        # Collect everything to strip before mutating the tree.
        # ``find_all`` matches tag *names*, so the class-qualified gallery
        # selector must go through ``select`` (CSS) to match anything.
        anchors = chapter_content.find_all("a")
        rules = chapter_content.find_all("hr")
        galleries = chapter_content.select("div.tiled-gallery")
        for unwanted in (anchors, rules, galleries):
            for element in unwanted:
                element.decompose()

        print(f"{title} - {chapter_title}")
        book.add_chapter(
            pypub.create_chapter_from_string(chapter_content.prettify(), chapter_title)
        )

    print(f"{title} - {volume_title}")
    book.create_epub(output_dir)


async def _fetch_volumes(jobs):
    """Run one ``fetchVolume`` call per argument tuple on the default executor."""
    loop = asyncio.get_running_loop()
    await asyncio.gather(
        *(loop.run_in_executor(None, fetchVolume, *job) for job in jobs)
    )


@executor.executor
def process(metadata, output_dir, url):
    """Build one EPUB per volume listed on the index page at ``url``.

    The index page's ``div.entry-content > p`` paragraphs are walked in
    order: a paragraph containing a ``<strong>`` element sets the current
    volume title, and any other paragraph containing links is treated as
    that volume's chapter list.

    Args:
        metadata: Forwarded unchanged to ``fetchVolume``.
        output_dir: Forwarded unchanged to ``fetchVolume``.
        url: Address of the index page.

    NOTE(review): ``executor.executor`` is defined outside this file; this
    function calls ``asyncio.run`` and therefore assumes it is not invoked
    on a thread that already has a running event loop -- confirm.
    """
    response = requests.get(url)
    html = bs4.BeautifulSoup(response.content, "html.parser")
    content = html.select("div.entry-content > p")
    title = html.select_one("#site-title > span > a").get_text()

    volume_title = None
    jobs = []
    for paragraph in content:
        if paragraph.strong is not None:
            volume_title = paragraph.strong.get_text()
            continue
        links = paragraph.find_all("a")
        if not links:
            # A paragraph without links would only produce an empty book.
            continue
        jobs.append((title, metadata, volume_title, output_dir, links))

    # ``fetchVolume`` is blocking, so each call is dispatched to a worker
    # thread; ``asyncio.run`` needs a real coroutine, not a bare future.
    asyncio.run(_fetch_volumes(jobs))