diff --git a/auberge_vagabonde/__main__.py b/auberge_vagabonde/__main__.py index 17ff306..9c5f2f9 100644 --- a/auberge_vagabonde/__main__.py +++ b/auberge_vagabonde/__main__.py @@ -7,9 +7,10 @@ from tempfile import NamedTemporaryFile from time import sleep from typing import cast +from bs4 import BeautifulSoup from curl_cffi import requests from pypub import Chapter, Epub, SimpleChapterFactory # type: ignore -from pyxml.html import HtmlElement, fromstring, tostring # type: ignore +from pyxml.html import HtmlElement, fromstring # type: ignore parser = ArgumentParser() parser.add_argument("-l", "--lang", choices=["fr", "en"], required=True) @@ -75,15 +76,15 @@ epub = Epub( for url in book["chapters"]: text = download(url) - etree = fromstring(text) - title = etree.xpath(".//h1[@class='entry-title']")[0].text - content = etree.xpath(".//div[@class='entry-content']//p") + etree = BeautifulSoup(text, "html.parser") + title = etree.select_one("h1.entry-title").text # type: ignore + content = etree.select("div.entry-content p") print(f"Chapter {title}...") chapter = Chapter(title, "") for elem in content: - if len(elem.children) > 0 and elem.children[0].tag == "a": + if elem.a: continue - chapter.content += tostring(elem).decode() + chapter.content += elem.prettify() epub.add_chapter(chapter) with NamedTemporaryFile() as cover: diff --git a/poetry.lock b/poetry.lock index 50c64f6..d7fa3ac 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,27 @@ # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +[[package]] +name = "beautifulsoup4" +version = "4.13.3" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"}, + {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"}, +] + +[package.dependencies] +soupsieve = ">1.2" +typing-extensions = ">=4.0.0" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "certifi" version = "2025.1.31" @@ -449,6 +471,17 @@ files = [ {file = "ruff-0.9.6.tar.gz", hash = "sha256:81761592f72b620ec8fa1068a6fd00e98a5ebee342a3642efd84454f3031dca9"}, ] +[[package]] +name = "soupsieve" +version = "2.6" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"}, + {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"}, +] + [[package]] name = "tomli" version = "2.2.1" @@ -504,4 +537,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c0d1ee756a561834a3d8e62d8b9c75f72521446cf5f288b4f7de710203cd5c38" +content-hash = "84a41aac6241a5e1eec3635df8c2e51d27506f1098f0e3c45dbed90aa26e5c60" diff --git a/pyproject.toml b/pyproject.toml index 9988315..dd294f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ license = "WTFPL" [tool.poetry.dependencies] python = "^3.9" +beautifulsoup4 = "^4.13.3" curl_cffi = "^0.9.0b2" pypub3 = "^2.0.7" pyxml3 = "^0.0.4"