Auberge_Vagabonde/twi.py

import logging

import bs4
from ebooklib import epub  # type: ignore
import requests
import utils


def fetchVolume(title, description, metadatas, volume_title, output_dir, links):
    """Download every chapter of one volume and write the volume as an EPUB.

    Each chapter page is fetched, stripped of unwanted markup and appended
    to the book in the order the links are given. The resulting file is
    written to ``{output_dir}/{title} - {volume_title}.epub``.

    Args:
        title: Book title. Also used, with spaces removed and lower-cased,
            as the EPUB identifier, and as the first part of the file name.
        description: Text stored as the Dublin Core ``description`` metadata.
        metadatas: Mapping that provides ``"language"`` (book and chapter
            language), ``"cover"`` (path of the cover image file) and
            ``"authors"`` (iterable of author names).
        volume_title: Volume name, used in the log line and the file name.
        output_dir: Directory the ``.epub`` file is written into.
        links: Iterable of anchor elements (or mappings) whose ``"href"``
            value is the URL of a chapter page.

    Links without an ``href`` and pages that lack the expected title or
    content element are skipped with a warning instead of aborting the
    whole volume. Chapter files are numbered ``1.xhtml``, ``2.xhtml``, ...
    in the order chapters are actually added, so skipped pages leave no gap.
    """
    book = epub.EpubBook()
    # The identifier is derived from the title only, so every volume of the
    # same book shares it.
    book.set_identifier(title.replace(" ", "").lower())
    book.set_title(title)
    book.set_language(metadatas["language"])
    # Read the cover through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(metadatas["cover"], "rb") as cover_file:
        book.set_cover("cover.jpg", cover_file.read())
    book.add_metadata("DC", "description", description)

    for author in metadatas["authors"]:
        book.add_author(author)

    book_items = []
    for link in links:
        chapter_url = link.get("href")
        if not chapter_url:
            logging.warning("%s - skipping link without href: %r", title, link)
            continue

        # NOTE(review): no timeout is passed, so a stalled connection blocks
        # indefinitely - consider adding one once a suitable value is agreed.
        chapter_response = requests.get(chapter_url)
        chapter_html = bs4.BeautifulSoup(chapter_response.content, "html.parser")
        chapter_content = chapter_html.select_one("div.entry-content")
        title_node = chapter_html.select_one("h1.entry-title")
        if chapter_content is None or title_node is None:
            logging.warning(
                "%s - skipping %s: chapter title or content not found",
                title,
                chapter_url,
            )
            continue
        chapter_title = title_node.get_text().strip()

        # Collect every unwanted element first, then remove them group by
        # group: links, sub-headings, rules, embedded frames, scripts and
        # image galleries. Removing a node also removes the text inside it.
        to_remove = [
            chapter_content.find_all(tag_name)
            for tag_name in ("a", "h3", "hr", "iframe", "script")
        ]
        to_remove.append(chapter_content.select("div.tiled-gallery"))

        for dataset in to_remove:
            for removed in dataset:
                removed.decompose()

        logging.info("%s - %s", title, chapter_title)
        book_items.append(
            epub.EpubHtml(
                title=chapter_title,
                content=utils.generate_title_html(
                    chapter_title, chapter_content.prettify()
                ),
                # 1-based position among the chapters kept so far.
                file_name=f"{len(book_items) + 1}.xhtml",
                lang=metadatas["language"],
            )
        )

    logging.info("%s - %s", title, volume_title)
    for book_item in book_items:
        book.add_item(book_item)

    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.toc = book_items
    book.spine = ["cover", "nav", *book_items]
    epub.write_epub(f"{output_dir}/{title} - {volume_title}.epub", book)


@utils.executor
def process(metadatas, output_dir, url):
    """Scrape the table-of-contents page at ``url`` and build one EPUB per volume.

    The paragraphs directly under ``div.entry-content`` are walked in
    document order. A paragraph containing a ``<strong>`` element sets the
    current volume title; any other paragraph that contains links is handed
    to ``fetchVolume`` together with the most recent volume title (``None``
    if no titled paragraph has been seen yet).

    Args:
        metadatas: Book metadata mapping, passed through to ``fetchVolume``.
        output_dir: Output directory, passed through to ``fetchVolume``.
        url: Address of the table-of-contents page.
    """
    page = bs4.BeautifulSoup(requests.get(url).content, "html.parser")
    title = page.select_one("#site-title > span > a").get_text().strip()
    description = page.select_one("#site-description").get_text().strip()

    volume_title = None
    for paragraph in page.select("div.entry-content > p"):
        heading = paragraph.strong
        if heading is not None:
            # A bold paragraph names the volume for the link lists after it.
            volume_title = heading.get_text().strip()
            continue

        chapter_links = paragraph.find_all("a")
        if not chapter_links:
            # Plain paragraph without links: nothing to download.
            continue

        fetchVolume(
            title,
            description,
            metadatas,
            volume_title,
            output_dir,
            chapter_links,
        )
Fix logging 2022-01-18 21:05:00 +00:00			`import logging`
Back to python + Gravesong 2022-01-18 20:43:45 +00:00
Fix VF parsing and use poetry 2022-09-18 23:00:12 +00:00			`import bs4`
Fix mypy 2022-09-18 23:07:52 +00:00			`from ebooklib import epub # type: ignore`
Fix VF parsing and use poetry 2022-09-18 23:00:12 +00:00			`import requests`
			`import utils`
Back to python + Gravesong 2022-01-18 20:43:45 +00:00
New way of generating epubs 2022-08-18 10:59:02 +00:00
			`def fetchVolume(title, description, metadatas, volume_title, output_dir, links):`
			`book = epub.EpubBook()`
			`book_items = []`
			`book.set_identifier(title.replace(" ", "").lower())`
			`book.set_title(title)`
			`book.set_language(metadatas["language"])`
			`book.set_cover("cover.jpg", open(metadatas["cover"], "rb").read())`
			`book.add_metadata("DC", "description", description)`

			`for author in metadatas["authors"]:`
			`book.add_author(author)`
Back to python + Gravesong 2022-01-18 20:43:45 +00:00
Remove iframes and chapter index 2022-08-18 11:43:30 +00:00			`chapter_index = 1`
Back to python + Gravesong 2022-01-18 20:43:45 +00:00			`for link in links:`
			`chapter_response = requests.get(link["href"])`
			`chapter_html = bs4.BeautifulSoup(chapter_response.content, "html.parser")`
			`chapter_content = chapter_html.select_one("div.entry-content")`
Fix async + drone 2022-01-18 21:53:50 +00:00			`chapter_title = chapter_html.select_one("h1.entry-title").get_text().strip()`
Back to python + Gravesong 2022-01-18 20:43:45 +00:00
Remove malicious script and frames 2022-01-19 15:00:03 +00:00			`to_remove = [`
			`chapter_content.find_all("a"),`
Remove h3 on twi 2022-01-19 15:42:41 +00:00			`chapter_content.find_all("h3"),`
Remove malicious script and frames 2022-01-19 15:00:03 +00:00			`chapter_content.find_all("hr"),`
Remove iframes and chapter index 2022-08-18 11:43:30 +00:00			`chapter_content.find_all("iframe"),`
			`chapter_content.find_all("script"),`
Remove malicious script and frames 2022-01-19 15:00:03 +00:00			`chapter_content.select("div.tiled-gallery"),`
			`]`

			`for dataset in to_remove:`
			`for removed in dataset:`
			`removed.decompose()`
Back to python + Gravesong 2022-01-18 20:43:45 +00:00
Fix logging 2022-01-18 21:05:00 +00:00			`logging.log(logging.INFO, f"{title} - {chapter_title}")`
New way of generating epubs 2022-08-18 10:59:02 +00:00			`book_items.append(`
			`epub.EpubHtml(`
			`title=chapter_title,`
			`content=utils.generate_title_html(`
			`chapter_title, chapter_content.prettify()`
			`),`
Remove iframes and chapter index 2022-08-18 11:43:30 +00:00			`file_name=f"{chapter_index}.xhtml",`
New way of generating epubs 2022-08-18 10:59:02 +00:00			`lang=metadatas["language"],`
			`)`
Back to python + Gravesong 2022-01-18 20:43:45 +00:00			`)`
Remove iframes and chapter index 2022-08-18 11:43:30 +00:00			`chapter_index += 1`
Back to python + Gravesong 2022-01-18 20:43:45 +00:00
Fix logging 2022-01-18 21:05:00 +00:00			`logging.log(logging.INFO, f"{title} - {volume_title}")`
New way of generating epubs 2022-08-18 10:59:02 +00:00			`for book_item in book_items:`
			`book.add_item(book_item)`

			`book.add_item(epub.EpubNcx())`
			`book.add_item(epub.EpubNav())`
			`book.toc = book_items`
			`book.spine = ["cover", "nav", *book_items]`
			`epub.write_epub(f"{output_dir}/{title} - {volume_title}.epub", book)`
Back to python + Gravesong 2022-01-18 20:43:45 +00:00

New way of generating epubs 2022-08-18 10:59:02 +00:00			`@utils.executor`
			`def process(metadatas, output_dir, url):`
Back to python + Gravesong 2022-01-18 20:43:45 +00:00			`response = requests.get(url)`

			`html = bs4.BeautifulSoup(response.content, "html.parser")`
			`content = html.select("div.entry-content > p")`
Fix async + drone 2022-01-18 21:53:50 +00:00			`title = html.select_one("#site-title > span > a").get_text().strip()`
New way of generating epubs 2022-08-18 10:59:02 +00:00			`description = html.select_one("#site-description").get_text().strip()`
Back to python + Gravesong 2022-01-18 20:43:45 +00:00
			`volume_title = None`

			`for paragraph in content:`
			`if paragraph.strong is not None:`
Fix async + drone 2022-01-18 21:53:50 +00:00			`volume_title = paragraph.strong.get_text().strip()`
Back to python + Gravesong 2022-01-18 20:43:45 +00:00			`else:`
Fix VF parsing and use poetry 2022-09-18 23:00:12 +00:00			`links = paragraph.find_all("a")`
			`if links:`
			`fetchVolume(`
			`title,`
			`description,`
			`metadatas,`
			`volume_title,`
			`output_dir,`
			`links,`
			`)`