This repository has been archived on 2024-02-23. You can view files and clone it, but cannot push or open issues or pull requests.
Auberge_Vagabonde/twi.py

56 lines
1.8 KiB
Python
Raw Normal View History

2022-01-18 20:43:45 +00:00
import bs4
2022-01-18 21:05:00 +00:00
import executor
import logging
2022-01-18 20:43:45 +00:00
import pypub
import requests
def fetchVolume(title, metadata, volume_title, output_dir, links, *, timeout=30):
    """Download every chapter of one volume and write the volume as an EPUB.

    Each link is fetched, the chapter body (``div.entry-content``) and
    heading (``h1.entry-title``) are extracted, navigation/media elements
    are stripped from the body, and the result is appended to the book.

    Args:
        title: Title of the work; combined with ``volume_title`` to name
            the book.
        metadata: Mapping of extra keyword arguments forwarded to
            ``pypub.Epub``.
        volume_title: Name of the volume being built.
        output_dir: Destination passed to ``book.create_epub``.
        links: Iterable of anchor tags whose ``href`` is a chapter URL.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.HTTPError: If a chapter URL answers with an error status.
    """
    # Everything that should not end up in the e-book: links, rulers,
    # images, embeds, scripts and image galleries.
    unwanted_selector = "a, hr, img, iframe, script, div.tiled-gallery"

    book = pypub.Epub(f"{title} - {volume_title}", **metadata)

    for link in links:
        href = link.get("href")
        if not href:
            # An anchor without a target cannot be a chapter.
            continue

        chapter_response = requests.get(href, timeout=timeout)
        # Fail loudly instead of turning an error page into a chapter.
        chapter_response.raise_for_status()

        chapter_html = bs4.BeautifulSoup(chapter_response.content, "html.parser")
        chapter_content = chapter_html.select_one("div.entry-content")
        chapter_heading = chapter_html.select_one("h1.entry-title")
        if chapter_content is None or chapter_heading is None:
            # The link led to a page that is not laid out like a chapter.
            logging.warning("%s - skipping %s: no chapter found", title, href)
            continue
        chapter_title = chapter_heading.get_text().strip()

        for removed in chapter_content.select(unwanted_selector):
            # A node nested in an already removed parent is gone already.
            if not getattr(removed, "decomposed", False):
                removed.decompose()

        logging.info("%s - %s", title, chapter_title)
        book.add_chapter(
            pypub.create_chapter_from_string(chapter_content.prettify(), chapter_title)
        )

    logging.info("%s - %s", title, volume_title)
    book.create_epub(output_dir)
@executor.executor
def process(metadata, output_dir, url):
    """Build one EPUB per volume listed on a table-of-contents page.

    The page at ``url`` is scanned paragraph by paragraph (direct ``<p>``
    children of ``div.entry-content``). A paragraph containing ``<strong>``
    sets the current volume title; any other paragraph is treated as that
    volume's list of chapter links and handed to ``fetchVolume``.

    Args:
        metadata: Mapping of extra keyword arguments forwarded to
            ``fetchVolume``.
        output_dir: Destination forwarded to ``fetchVolume``.
        url: Address of the table-of-contents page.

    Raises:
        requests.HTTPError: If ``url`` answers with an error status.
    """
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page as the index.
    response.raise_for_status()

    html = bs4.BeautifulSoup(response.content, "html.parser")
    title = html.select_one("#site-title > span > a").get_text().strip()

    volume_title = None
    for paragraph in html.select("div.entry-content > p"):
        if paragraph.strong is not None:
            volume_title = paragraph.strong.get_text().strip()
            continue

        links = paragraph.find_all("a")
        # Skip paragraphs that cannot form a volume: those appearing before
        # the first heading (the book would be named "... - None") and those
        # without links (an empty book would be written, possibly over the
        # volume just built under the same name).
        if volume_title is None or not links:
            continue

        fetchVolume(title, metadata, volume_title, output_dir, links)