import logging

import bs4
import requests
from ebooklib import epub

import utils
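
# NOTE: `utils` is a local helper module that is not shown here. From the call
# sites below it is assumed to provide:
#   - utils.generate_title_html(title, body_html): wraps a chapter body in
#     markup that prepends a heading for the chapter title, returning a string
#     suitable as EpubHtml content.
#   - utils.executor: a decorator that runs the wrapped function, presumably on
#     some executor (e.g. a thread pool) so several URLs can be processed
#     concurrently.
# These descriptions are inferred from usage, not confirmed by the module.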


def fetchVolume(title, description, metadatas, volume_title, output_dir, links):
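    """Build one EPUB volume from a list of chapter links.

    Fetches every chapter page, strips navigation and embedded extras from
    the content, and writes `<output_dir>/<title> - <volume_title>.epub`.
    """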
    book = epub.EpubBook()
    book_items = []

    # Basic EPUB metadata: identifier, title, language, cover, and description.
    book.set_identifier(title.replace(" ", "").lower())
    book.set_title(title)
    book.set_language(metadatas["language"])
    with open(metadatas["cover"], "rb") as cover_file:
        book.set_cover("cover.jpg", cover_file.read())
    book.add_metadata("DC", "description", description)

    for author in metadatas["authors"]:
        book.add_author(author)
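
    # Download each chapter and turn it into an EPUB HTML item.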
    for chapter_index, link in enumerate(links, start=1):
        chapter_response = requests.get(link["href"])
        chapter_html = bs4.BeautifulSoup(chapter_response.content, "html.parser")
        chapter_content = chapter_html.select_one("div.entry-content")
        chapter_title = chapter_html.select_one("h1.entry-title").get_text().strip()
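
        # Strip elements that don't belong in the e-book: links, sub-headings,
        # separators, embeds, scripts, and WordPress tiled galleries.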
        to_remove = [
            chapter_content.find_all("a"),
            chapter_content.find_all("h3"),
            chapter_content.find_all("hr"),
            chapter_content.find_all("iframe"),
            chapter_content.find_all("script"),
            chapter_content.select("div.tiled-gallery"),
        ]

        for dataset in to_remove:
            for removed in dataset:
                removed.decompose()

        logging.info(f"{title} - {chapter_title}")
        book_items.append(
            epub.EpubHtml(
                title=chapter_title,
                content=utils.generate_title_html(
                    chapter_title, chapter_content.prettify()
                ),
                file_name=f"{chapter_index}.xhtml",
                lang=metadatas["language"],
            )
        )

    logging.info(f"{title} - {volume_title}")
    for book_item in book_items:
        book.add_item(book_item)

    # Navigation files (NCX and nav document), table of contents, and spine.
    # "cover" and "nav" are the ids of the pages ebooklib generates for
    # set_cover() and EpubNav().
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.toc = book_items
    book.spine = ["cover", "nav", *book_items]

    epub.write_epub(f"{output_dir}/{title} - {volume_title}.epub", book)


@utils.executor
def process(metadatas, output_dir, url):
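    """Scrape a table-of-contents page and emit one EPUB per volume.

    The selectors below assume a WordPress-style layout: the page lists
    volumes as paragraphs whose <strong> text is the volume title, followed
    by paragraphs of chapter links.
    """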
    response = requests.get(url)
    html = bs4.BeautifulSoup(response.content, "html.parser")

    content = html.select("div.entry-content > p")
    title = html.select_one("#site-title > span > a").get_text().strip()
    description = html.select_one("#site-description").get_text().strip()
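
    # Paragraphs containing a <strong> element mark a new volume title; the
    # paragraphs that follow hold that volume's chapter links.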
    volume_title = None
    for paragraph in content:
        if paragraph.strong is not None:
            volume_title = paragraph.strong.get_text().strip()
        else:
            links = paragraph.find_all("a")
            if not links:
                # Skip spacer paragraphs that carry no chapter links, so we
                # don't write empty volumes.
                continue
            fetchVolume(
                title,
                description,
                metadatas,
                volume_title,
                output_dir,
                links,
            )
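

# A minimal, hypothetical invocation sketch. The metadata keys match what
# fetchVolume() reads above; the URL and paths are placeholders, and it
# assumes utils.executor leaves the call signature of process() unchanged.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    process(
        {
            "language": "en",
            "cover": "cover.jpg",
            "authors": ["Author Name"],
        },
        "output",
        "https://example.com/table-of-contents/",
    )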