fix: 🐛 use BeautifulSoup to extract bad HTML
All checks were successful
books / lint (push) Successful in 1m25s

This commit is contained in:
Michel Roux 2025-02-11 23:25:14 +01:00
parent 21ebf7ed13
commit a778a6b947
3 changed files with 42 additions and 7 deletions

View File

@ -7,9 +7,10 @@ from tempfile import NamedTemporaryFile
from time import sleep from time import sleep
from typing import cast from typing import cast
from bs4 import BeautifulSoup
from curl_cffi import requests from curl_cffi import requests
from pypub import Chapter, Epub, SimpleChapterFactory # type: ignore from pypub import Chapter, Epub, SimpleChapterFactory # type: ignore
from pyxml.html import HtmlElement, fromstring, tostring # type: ignore from pyxml.html import HtmlElement, fromstring # type: ignore
parser = ArgumentParser() parser = ArgumentParser()
parser.add_argument("-l", "--lang", choices=["fr", "en"], required=True) parser.add_argument("-l", "--lang", choices=["fr", "en"], required=True)
@ -75,15 +76,15 @@ epub = Epub(
for url in book["chapters"]: for url in book["chapters"]:
text = download(url) text = download(url)
etree = fromstring(text) etree = BeautifulSoup(text, "html.parser")
title = etree.xpath(".//h1[@class='entry-title']")[0].text title = etree.select_one("h1.entry-title").text # type: ignore
content = etree.xpath(".//div[@class='entry-content']//p") content = etree.select("div.entry-content p")
print(f"Chapter {title}...") print(f"Chapter {title}...")
chapter = Chapter(title, "") chapter = Chapter(title, "")
for elem in content: for elem in content:
if len(elem.children) > 0 and elem.children[0].tag == "a": if elem.a:
continue continue
chapter.content += tostring(elem).decode() chapter.content += elem.prettify()
epub.add_chapter(chapter) epub.add_chapter(chapter)
with NamedTemporaryFile() as cover: with NamedTemporaryFile() as cover:

35
poetry.lock generated
View File

@ -1,5 +1,27 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]]
name = "beautifulsoup4"
version = "4.13.3"
description = "Screen-scraping library"
optional = false
python-versions = ">=3.7.0"
files = [
{file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
{file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
]
[package.dependencies]
soupsieve = ">1.2"
typing-extensions = ">=4.0.0"
[package.extras]
cchardet = ["cchardet"]
chardet = ["chardet"]
charset-normalizer = ["charset-normalizer"]
html5lib = ["html5lib"]
lxml = ["lxml"]
[[package]] [[package]]
name = "certifi" name = "certifi"
version = "2025.1.31" version = "2025.1.31"
@ -449,6 +471,17 @@ files = [
{file = "ruff-0.9.6.tar.gz", hash = "sha256:81761592f72b620ec8fa1068a6fd00e98a5ebee342a3642efd84454f3031dca9"}, {file = "ruff-0.9.6.tar.gz", hash = "sha256:81761592f72b620ec8fa1068a6fd00e98a5ebee342a3642efd84454f3031dca9"},
] ]
[[package]]
name = "soupsieve"
version = "2.6"
description = "A modern CSS selector implementation for Beautiful Soup."
optional = false
python-versions = ">=3.8"
files = [
{file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"},
{file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"},
]
[[package]] [[package]]
name = "tomli" name = "tomli"
version = "2.2.1" version = "2.2.1"
@ -504,4 +537,4 @@ files = [
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.9" python-versions = "^3.9"
content-hash = "c0d1ee756a561834a3d8e62d8b9c75f72521446cf5f288b4f7de710203cd5c38" content-hash = "84a41aac6241a5e1eec3635df8c2e51d27506f1098f0e3c45dbed90aa26e5c60"

View File

@ -7,6 +7,7 @@ license = "WTFPL"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.9" python = "^3.9"
beautifulsoup4 = "^4.13.3"
curl_cffi = "^0.9.0b2" curl_cffi = "^0.9.0b2"
pypub3 = "^2.0.7" pypub3 = "^2.0.7"
pyxml3 = "^0.0.4" pyxml3 = "^0.0.4"