from bs4.element import NavigableString


def strip_content(tag):
    """Strip every string child from ``tag``, recursively and in place.

    Each direct child that is not a ``NavigableString`` is stripped the
    same way first. The tag is then emptied and those stripped children
    are appended back in their original order, so only the nested
    element structure remains.

    Returns the same ``tag`` object that was passed in.
    """
    # Process nested elements before touching the tag itself; string
    # children are left out here so they vanish when the tag is cleared.
    kept = []
    for node in tag.children:
        if isinstance(node, NavigableString):
            continue
        kept.append(strip_content(node))

    # Empty the tag, then restore only the stripped elements.
    tag.clear()
    for node in kept:
        tag.append(node)
    return tag