From 9aa5b46a72b0782d25856fdb68421c50e31b37b6 Mon Sep 17 00:00:00 2001
From: Michel Roux
Date: Sun, 11 Apr 2021 18:02:29 +0200
Subject: [PATCH] Add utils and multiple modifications on pypub

---
Reviewer notes. This text sits between the "---" marker and the first
"diff --git" line, so it is neither commit message nor patch payload.

* Formatting: this patch had been flattened onto a single line. The line
  structure below was restored so the headers, diffstat and hunks can be
  parsed again. No payload token was changed, so the hunk line counts
  (-44,6 +44,8 and +1,12) still match the content.
* NOTE(review): the flattened text did not preserve leading whitespace.
  The indentation of the livres_en.py lines was reconstructed from the
  hunk context ("for volume_tag in tags:") and must be confirmed against
  the real file, or the patch applied with --ignore-whitespace.
* livres_en.py: after the existing loop that decomposes every 'img'
  element found in `text`, a second loop decomposes every 'hr' element.
* utils.py (new): strip_content(tag)
    - Collects the children of `tag` that are not NavigableString
      instances, calling strip_content on each of them first, so the same
      treatment is applied recursively at every depth.
    - Calls tag.clear() and then appends the collected children back in
      their original order. The NavigableString children are therefore
      dropped and only the nested element structure is kept.
    - Mutates `tag` in place and returns that same object.
    - The list of children is fully built before tag.clear() runs, so the
      parent is not modified while its `children` are being iterated.

 livres_en.py |  2 ++
 utils.py     | 12 ++++++++++++
 2 files changed, 14 insertions(+)
 create mode 100644 utils.py

diff --git a/livres_en.py b/livres_en.py
index 7b04f2d..d0140fa 100644
--- a/livres_en.py
+++ b/livres_en.py
@@ -44,6 +44,8 @@ for volume_tag in tags:
         pagination.decompose()
     for image in text.find_all('img'):
         image.decompose()
+    for cut in text.find_all('hr'):
+        cut.decompose()
 
     print(title)
 
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..6a97635
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,12 @@
+from bs4.element import NavigableString
+
+
+def strip_content(tag):
+    # strip content from all children
+    children = [strip_content(child) for child in tag.children if not isinstance(child, NavigableString)]
+    # remove everything from the tag
+    tag.clear()
+    for child in children:
+        # Add back stripped children
+        tag.append(child)
+    return tag