fix: 🐛 use beautifulsoup to extract bad html
All checks were successful
books / lint (push) Successful in 1m25s
This commit is contained in:
parent
21ebf7ed13
commit
a778a6b947
@@ -7,9 +7,10 @@ from tempfile import NamedTemporaryFile
 from time import sleep
 from typing import cast
 
+from bs4 import BeautifulSoup
 from curl_cffi import requests
 from pypub import Chapter, Epub, SimpleChapterFactory # type: ignore
-from pyxml.html import HtmlElement, fromstring, tostring # type: ignore
+from pyxml.html import HtmlElement, fromstring # type: ignore
 
 parser = ArgumentParser()
 parser.add_argument("-l", "--lang", choices=["fr", "en"], required=True)
@@ -75,15 +76,15 @@ epub = Epub(
 
 for url in book["chapters"]:
     text = download(url)
-    etree = fromstring(text)
-    title = etree.xpath(".//h1[@class='entry-title']")[0].text
-    content = etree.xpath(".//div[@class='entry-content']//p")
+    etree = BeautifulSoup(text, "html.parser")
+    title = etree.select_one("h1.entry-title").text # type: ignore
+    content = etree.select("div.entry-content p")
     print(f"Chapter {title}...")
     chapter = Chapter(title, "")
     for elem in content:
-        if len(elem.children) > 0 and elem.children[0].tag == "a":
+        if elem.a:
             continue
-        chapter.content += tostring(elem).decode()
+        chapter.content += elem.prettify()
     epub.add_chapter(chapter)
 
 with NamedTemporaryFile() as cover:
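For reference, a minimal standalone sketch of the parsing approach the new code takes. The sample markup, class names, and link-skipping rule mirror the diff above; they are illustrative only, not taken from the scraped site.

from bs4 import BeautifulSoup

# Sloppy markup: unquoted attributes and a missing </div>. html.parser is
# lenient about this, unlike a strict XML-style parse via pyxml's fromstring().
html = """
<h1 class=entry-title>Chapter 1</h1>
<div class=entry-content>
  <p><a href="/next">Next chapter</a></p>
  <p>First paragraph of the chapter.</p>
  <p>Second paragraph of the chapter.</p>
"""

soup = BeautifulSoup(html, "html.parser")

# CSS selectors (resolved by soupsieve) replace the old XPath queries.
title = soup.select_one("h1.entry-title").text
paragraphs = soup.select("div.entry-content p")

body = ""
for elem in paragraphs:
    if elem.a:  # skip navigation paragraphs that contain a link
        continue
    body += elem.prettify()  # serialize the element back to HTML

print(title)  # Chapter 1
print(body)   # the two content paragraphs, pretty-printed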
poetry.lock (generated): 35 changes
@@ -1,5 +1,27 @@
 # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.13.3"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.7.0"
+files = [
+    {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
+    {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
+]
+
+[package.dependencies]
+soupsieve = ">1.2"
+typing-extensions = ">=4.0.0"
+
+[package.extras]
+cchardet = ["cchardet"]
+chardet = ["chardet"]
+charset-normalizer = ["charset-normalizer"]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 name = "certifi"
 version = "2025.1.31"
@@ -449,6 +471,17 @@ files = [
     {file = "ruff-0.9.6.tar.gz", hash = "sha256:81761592f72b620ec8fa1068a6fd00e98a5ebee342a3642efd84454f3031dca9"},
 ]
 
+[[package]]
+name = "soupsieve"
+version = "2.6"
+description = "A modern CSS selector implementation for Beautiful Soup."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"},
+    {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"},
+]
+
 [[package]]
 name = "tomli"
 version = "2.2.1"
@@ -504,4 +537,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "c0d1ee756a561834a3d8e62d8b9c75f72521446cf5f288b4f7de710203cd5c38"
+content-hash = "84a41aac6241a5e1eec3635df8c2e51d27506f1098f0e3c45dbed90aa26e5c60"
@@ -7,6 +7,7 @@ license = "WTFPL"
 
 [tool.poetry.dependencies]
 python = "^3.9"
+beautifulsoup4 = "^4.13.3"
 curl_cffi = "^0.9.0b2"
 pypub3 = "^2.0.7"
 pyxml3 = "^0.0.4"
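A quick sanity check of the dependency change (assumed workflow, not part of the commit) after running poetry install against the updated lockfile:

# Confirms the new packages resolve inside the Poetry-managed environment.
import bs4
import soupsieve

print(bs4.__version__)        # expected 4.13.3 per poetry.lock
print(soupsieve.__version__)  # expected 2.6, pulled in as a transitive dependency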