From 23f9ce0f07bcb8834761aba307c9b08b9414e35c Mon Sep 17 00:00:00 2001 From: Michel Roux Date: Wed, 19 Jan 2022 15:00:03 +0000 Subject: [PATCH] Remove malicious script and frames --- twi.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/twi.py b/twi.py index 620fd44..49f0d76 100644 --- a/twi.py +++ b/twi.py @@ -14,19 +14,18 @@ def fetchVolume(title, metadata, volume_title, output_dir, links): chapter_content = chapter_html.select_one("div.entry-content") chapter_title = chapter_html.select_one("h1.entry-title").get_text().strip() - a_remove = chapter_content.find_all("a") - hr_remove = chapter_content.find_all("hr") - img_remove = chapter_content.find_all("img") - div_remove = chapter_content.select("div.tiled-gallery") + to_remove = [ + chapter_content.find_all("a"), + chapter_content.find_all("hr"), + chapter_content.find_all("img"), + chapter_content.find_all("iframe"), + chapter_content.find_all("script"), + chapter_content.select("div.tiled-gallery"), + ] - for removed in a_remove: - removed.decompose() - for removed in hr_remove: - removed.decompose() - for removed in img_remove: - removed.decompose() - for removed in div_remove: - removed.decompose() + for dataset in to_remove: + for removed in dataset: + removed.decompose() logging.log(logging.INFO, f"{title} - {chapter_title}") book.add_chapter(