From 41cbe3640eeeffed6d3832b1449e9e3e68545691 Mon Sep 17 00:00:00 2001 From: Michel Roux Date: Sun, 5 Mar 2023 16:18:01 +0100 Subject: [PATCH] Adapt for the rewrite of book 1 --- books.py | 22 +++++++++++++++------- poetry.lock | 6 +++--- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/books.py b/books.py index 25cc0a7..8c8760d 100644 --- a/books.py +++ b/books.py @@ -82,7 +82,7 @@ def async_wrap(func): makedirs(name=OUTPUT_DIR, exist_ok=True) cover_req = get( - url="https://i0.wp.com/thefantasyinn.com/wp-content/uploads/2018/08/twi.jpg", + url="https://i0.wp.com/wanderinginn.com/wp-content/uploads/2023/03/Wandering_Inn-Vol1-eCover.jpg", # noqa: E501 stream=True, ) Image.open(cover_req.raw).save(f"{OUTPUT_DIR}/cover.png") @@ -94,17 +94,19 @@ def process_volume(epub: Epub, urls: List[str]): page_html = BeautifulSoup(markup=page_req.text, features="lxml") page_content = page_html.select_one("div.entry-content") page_title = page_html.select_one("h1.entry-title") - page_date = page_html.find(name="meta", property="article:modified_time") - if not page_content or not page_title or type(page_date) is not Tag: - raise Exception() + if not page_content or not page_title: + raise Exception("Missing title or content") + + title = page_title.get_text().strip() + if not title: + continue galleries = page_content.select("div.tiled-gallery") for gallery in galleries: gallery.decompose() - title = page_title.get_text().strip() chapter = create_chapter_from_string( html=page_content.prettify(), title=title, @@ -118,7 +120,9 @@ def process_volume(epub: Epub, urls: List[str]): @async_wrap -def process_book(url: str, creator: str, publisher: str, language: str): +def process_book( + url: str, creator: str, publisher: str, language: str, strip_first=False +): toc_req = get(url) toc_html = BeautifulSoup(markup=toc_req.text, features="lxml") toc_content = toc_html.select("div.entry-content > p") @@ -127,10 +131,13 @@ def process_book(url: str, creator: str, publisher: str, language: str): if not toc_title or type(toc_date) is not Tag: raise Exception("Missing title or date") + title = toc_title.get_text().strip() + + if strip_first: + toc_content.pop(0) for i, toc_line in enumerate(toc_content): if i % 2 == 0: - title = toc_title.get_text().strip() volume = toc_line.get_text().strip() elif volume: epub = Epub( @@ -163,6 +170,7 @@ async def create_books(): creator="Pirateaba", publisher="Xefir", language="en", + strip_first=True, ), ) diff --git a/poetry.lock b/poetry.lock index 1fdbb70..36ddcae 100644 --- a/poetry.lock +++ b/poetry.lock @@ -630,14 +630,14 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa [[package]] name = "platformdirs" -version = "3.0.0" +version = "3.1.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.0.0-py3-none-any.whl", hash = "sha256:b1d5eb14f221506f50d6604a561f4c5786d9e80355219694a1b244bcd96f4567"}, - {file = "platformdirs-3.0.0.tar.gz", hash = "sha256:8a1228abb1ef82d788f74139988b137e78692984ec7b08eaa6c65f1723af28f9"}, + {file = "platformdirs-3.1.0-py3-none-any.whl", hash = "sha256:13b08a53ed71021350c9e300d4ea8668438fb0046ab3937ac9a29913a1a1350a"}, + {file = "platformdirs-3.1.0.tar.gz", hash = "sha256:accc3665857288317f32c7bebb5a8e482ba717b474f3fc1d18ca7f9214be0cef"}, ] [package.extras]