fix: 🐛 use beautifulsoup to extract bad html
All checks were successful
books / lint (push) Successful in 1m25s
This commit is contained in:
parent
21ebf7ed13
commit
a778a6b947
@@ -7,9 +7,10 @@ from tempfile import NamedTemporaryFile
 from time import sleep
 from typing import cast
 
+from bs4 import BeautifulSoup
 from curl_cffi import requests
 from pypub import Chapter, Epub, SimpleChapterFactory # type: ignore
-from pyxml.html import HtmlElement, fromstring, tostring # type: ignore
+from pyxml.html import HtmlElement, fromstring # type: ignore
 
 parser = ArgumentParser()
 parser.add_argument("-l", "--lang", choices=["fr", "en"], required=True)
@@ -75,15 +76,15 @@ epub = Epub(
 
 for url in book["chapters"]:
     text = download(url)
-    etree = fromstring(text)
-    title = etree.xpath(".//h1[@class='entry-title']")[0].text
-    content = etree.xpath(".//div[@class='entry-content']//p")
+    etree = BeautifulSoup(text, "html.parser")
+    title = etree.select_one("h1.entry-title").text # type: ignore
+    content = etree.select("div.entry-content p")
     print(f"Chapter {title}...")
     chapter = Chapter(title, "")
     for elem in content:
-        if len(elem.children) > 0 and elem.children[0].tag == "a":
+        if elem.a:
             continue
-        chapter.content += tostring(elem).decode()
+        chapter.content += elem.prettify()
     epub.add_chapter(chapter)
 
 with NamedTemporaryFile() as cover:
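For reference, a minimal standalone sketch of the parsing approach the new code takes. The sample markup, class names, and link-skipping rule mirror the diff above; they are illustrative only, not taken from the scraped site.

from bs4 import BeautifulSoup

# Sloppy markup: unquoted attributes and a missing </div>. html.parser is
# lenient about this, unlike a strict XML-style parse via pyxml's fromstring().
html = """
<h1 class=entry-title>Chapter 1</h1>
<div class=entry-content>
  <p><a href="/next">Next chapter</a></p>
  <p>First paragraph of the chapter.</p>
  <p>Second paragraph of the chapter.</p>
"""

soup = BeautifulSoup(html, "html.parser")

# CSS selectors (resolved by soupsieve) replace the old XPath queries.
title = soup.select_one("h1.entry-title").text
paragraphs = soup.select("div.entry-content p")

body = ""
for elem in paragraphs:
    if elem.a:  # skip navigation paragraphs that contain a link
        continue
    body += elem.prettify()  # serialize the element back to HTML

print(title)  # Chapter 1
print(body)   # the two content paragraphs, pretty-printed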
poetry.lock (generated): 35 changes
@@ -1,5 +1,27 @@
 # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.13.3"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.7.0"
+files = [
+    {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
+    {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
+]
+
+[package.dependencies]
+soupsieve = ">1.2"
+typing-extensions = ">=4.0.0"
+
+[package.extras]
+cchardet = ["cchardet"]
+chardet = ["chardet"]
+charset-normalizer = ["charset-normalizer"]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 name = "certifi"
 version = "2025.1.31"
@@ -449,6 +471,17 @@ files = [
     {file = "ruff-0.9.6.tar.gz", hash = "sha256:81761592f72b620ec8fa1068a6fd00e98a5ebee342a3642efd84454f3031dca9"},
 ]
 
+[[package]]
+name = "soupsieve"
+version = "2.6"
+description = "A modern CSS selector implementation for Beautiful Soup."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"},
+    {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"},
+]
+
 [[package]]
 name = "tomli"
 version = "2.2.1"
@@ -504,4 +537,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "c0d1ee756a561834a3d8e62d8b9c75f72521446cf5f288b4f7de710203cd5c38"
+content-hash = "84a41aac6241a5e1eec3635df8c2e51d27506f1098f0e3c45dbed90aa26e5c60"
@@ -7,6 +7,7 @@ license = "WTFPL"
 
 [tool.poetry.dependencies]
 python = "^3.9"
+beautifulsoup4 = "^4.13.3"
 curl_cffi = "^0.9.0b2"
 pypub3 = "^2.0.7"
 pyxml3 = "^0.0.4"
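A quick sanity check of the dependency change (assumed workflow, not part of the commit) after running poetry install against the updated lockfile:

# Confirms the new packages resolve inside the Poetry-managed environment.
import bs4
import soupsieve

print(bs4.__version__)        # expected 4.13.3 per poetry.lock
print(soupsieve.__version__)  # expected 2.6, pulled in as a transitive dependency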