import bs4
import executor
import logging
import pypub
import requests


def fetchVolume(title, metadata, volume_title, output_dir, links):
    # Build a single EPUB for one volume from the given list of chapter links.
    book = pypub.Epub(f"{title} - {volume_title}", **metadata)

    for link in links:
        # Download and parse each chapter page.
        chapter_response = requests.get(link["href"])
        chapter_html = bs4.BeautifulSoup(chapter_response.content, "html.parser")
        chapter_content = chapter_html.select_one("div.entry-content")
        chapter_title = chapter_html.select_one("h1.entry-title").get_text().strip()

        # Strip elements that should not end up in the EPUB: in-text links,
        # headings, separators, media, scripts, and gallery wrappers.
        to_remove = [
            chapter_content.find_all("a"),
            chapter_content.find_all("h3"),
            chapter_content.find_all("hr"),
            chapter_content.find_all("img"),
            chapter_content.find_all("iframe"),
            chapter_content.find_all("script"),
            chapter_content.select("div.tiled-gallery"),
        ]

        for dataset in to_remove:
            for removed in dataset:
                removed.decompose()

        logging.info(f"{title} - {chapter_title}")
        book.add_chapter(
            pypub.create_chapter_from_string(chapter_content.prettify(), chapter_title)
        )

    logging.info(f"{title} - {volume_title}")
    book.create_epub(output_dir)


@executor.executor
def process(metadata, output_dir, url):
    # Scrape the series index page, split it into volumes, and fetch each one.
    response = requests.get(url)

    html = bs4.BeautifulSoup(response.content, "html.parser")
    content = html.select("div.entry-content > p")
    title = html.select_one("#site-title > span > a").get_text().strip()

    volume_title = None

    # A paragraph with a <strong> tag announces the next volume title; the
    # paragraphs that follow it hold that volume's chapter links.
    for paragraph in content:
        if paragraph.strong is not None:
            volume_title = paragraph.strong.get_text().strip()
        else:
            fetchVolume(
                title, metadata, volume_title, output_dir, paragraph.find_all("a")
            )
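

# Hedged usage sketch, not part of the original script: it assumes the local
# executor.executor decorator leaves process() callable with its plain
# signature, that the metadata dict holds keyword arguments accepted by the
# installed pypub.Epub, and that the URL below stands in for the real series
# index page.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    process(
        {},  # optional pypub.Epub metadata keywords could be supplied here
        "./epubs",  # directory where the generated .epub files are written
        "https://example.com/table-of-contents/",  # placeholder index URL
    )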