Back to python + Gravesong

2022-01-18 20:43:45 +00:00 · 2022-01-18 20:43:45 +00:00 · d3a7ac0fc2
commit d3a7ac0fc2
parent a3e84310ff
11 changed files with 190 additions and 1861 deletions
--- a/.drone.yml
+++ b/.drone.yml
@ -4,11 +4,13 @@ type: docker
 steps:
  - name: epub
-    image: node
+    image: python:slim
    commands:
      # Lint first: flake8 codes need their letter prefix (E501 = line too
      # long) and black needs an explicit path, otherwise it checks nothing.
-      - yarn install
+      - pip install flake8 black
-      - yarn lint
+      - flake8 --ignore=E501
-      - yarn start
+      - black --check .
      - pip install -r requirements.txt
      - python run.py
  - name: pdf
    image: linuxserver/calibre
    commands:
--- a/.eslintrc.yml
+++ b/.eslintrc.yml
@ -1,8 +0,0 @@
 env:
  es2021: true
  node: true
 extends: 'eslint:recommended'
 parserOptions:
  ecmaVersion: 12
  sourceType: module
 rules: {}
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,4 @@
 __pycache__
 .venv
 .vscode
 output
 node_modules
--- a/executor.py
+++ b/executor.py
@ -0,0 +1,10 @@
 import asyncio
 def executor(func):
     """Wrap a blocking callable so that it can be awaited.

     The returned coroutine function runs ``func`` in the running event
     loop's default executor and resolves to whatever ``func`` returns. An
     exception raised by ``func`` propagates to the awaiter.

     Args:
         func: The synchronous callable to run off the event loop.

     Returns:
         A coroutine function accepting the same arguments as ``func``.
     """
     import functools

     @functools.wraps(func)
     async def wrapper(*args, **kwargs):
         loop = asyncio.get_running_loop()
         # run_in_executor() only forwards positional arguments, so bind the
         # keyword arguments up front.
         call = functools.partial(func, *args, **kwargs)
         # Hand the result back to the awaiter; previously the future was
         # awaited but its value was dropped, so callers always got None.
         return await loop.run_in_executor(None, call)

     return wrapper
--- a/gravesong.py
+++ b/gravesong.py
@ -0,0 +1,41 @@
 import bs4
 import executor
 import pypub
 import requests
@executor.executor
def process(metadata, output_dir):
    """Download the Gravesong post and write it out as an EPUB.

    The post is unlocked by submitting its password to ``wp-pass.php``.
    Images are stripped from the post body, which is then split into
    chapters at every ``<h1>``; paragraphs that come before the first
    heading are collected under the title "Introduction".

    Args:
        metadata: Mapping of keyword arguments forwarded to ``pypub.Epub``.
        output_dir: Directory passed to ``Epub.create_epub``.
    """
    # Unpack as keywords: ``*metadata`` would pass the dict's keys as
    # positional values instead of the values themselves.
    book = pypub.Epub("Gravesong", **metadata)

    response = requests.post(
        "https://wanderinginn.com/wp-pass.php",
        data={"post_password": "Iwalkedameadowweary", "Submit": "Enter"},
        headers={"Referer": "https://wanderinginn.com/2022/01/11/gravesong/"},
    )

    html = bs4.BeautifulSoup(response.content, "html.parser")
    entry_content = html.select_one("div.entry-content")

    for content_img in entry_content.find_all("img"):
        content_img.decompose()

    chapter_title = "Introduction"
    chapter_parts = []

    for paragraph in entry_content.children:
        if paragraph.name == "h1":
            # A heading closes the chapter collected so far.
            print(f"Gravesong - {chapter_title}")
            book.add_chapter(
                pypub.create_chapter_from_string(
                    "".join(chapter_parts), chapter_title
                )
            )
            chapter_title = paragraph.get_text()
            # Start the next chapter empty; without this reset every chapter
            # would also contain the text of all the chapters before it.
            chapter_parts = []
        elif paragraph.name == "p":
            chapter_parts.append(paragraph.prettify())

    # Flush the chapter that follows the last heading.
    print(f"Gravesong - {chapter_title}")
    book.add_chapter(
        pypub.create_chapter_from_string("".join(chapter_parts), chapter_title)
    )

    print("Gravesong - Book")
    book.create_epub(output_dir)
--- a/index.js
+++ b/index.js
@ -1,86 +0,0 @@
 import fs from 'fs'
 import cheerio from 'cheerio'
 import fetch from 'node-fetch'
 import Epub from 'epub-gen-funstory'
 // Make sure the output directory exists before anything is generated.
 fs.existsSync('output') || fs.mkdirSync('output')
 /**
  * Build the option object describing one volume, as later passed to Epub.
  *
  * @param volume      Volume number, used in the title and the file name.
  * @param title       Series title.
  * @param author      Author or list of authors.
  * @param tocTitle    Heading used for the table of contents.
  * @param description Book description.
  * @returns A fresh object with empty `content` and `links` arrays.
  */
 const metadata = (volume, title, author, tocTitle, description) => {
     const volumeName = `${title} - Volume ${volume}`
     return {
         title: volumeName,
         author: author,
         cover: 'https://i.pinimg.com/originals/0b/fd/cf/0bfdcfb42ba3ff0a22f4a7bc52928af4.png',
         output: `output/${volumeName}.epub`,
         version: 3,
         lang: 'fr',
         tocTitle: tocTitle,
         appendChapterTitles: true,
         content: [],
         links: [],
         verbose: true,
         description: description,
     }
 }
 /**
  * Download one chapter page and reduce it to its title and body HTML.
  *
  * Links, h3 headings, horizontal rules and tiled galleries are removed
  * from the body before it is serialized. The title is logged.
  *
  * @param url Address of the chapter page.
  * @returns {Promise<{title: string, data: string}>}
  */
 const fetchPage = async (url) => {
     const response = await fetch(url)
     const page = cheerio.load(await response.text())
     const title = page('h1.entry-title').text()
     const body = page('div.entry-content')
     for (const selector of ['a', 'h3', 'hr', 'div.tiled-gallery']) {
         body.find(selector).remove()
     }
     const data = body.html()
     console.log(title)
     return { title, data }
 }
 /**
  * Read a table-of-contents page and generate one e-book per volume.
  *
  * The paragraphs of the page are read in pairs: an even-indexed paragraph
  * holds the "Volume N" heading, the following one holds that volume's
  * chapter links.
  *
  * @param url     Address of the table-of-contents page.
  * @param authors Author list stored in every generated book.
  * @returns A promise that settles once every volume has been processed.
  */
 const run = async (url, authors) => {
     const books = []
     const response = await fetch(url)
     const html = cheerio.load(await response.text())
     const content = html('div.entry-content > p')
     const title = html('#site-title > span > a').text()
     const summary = html('h1.entry-title').text()
     const description = html('#site-description').text()
     let volume = 0
     content.each((i, el) => {
         if (i % 2 === 0) {
             // Explicit radix so the heading is always read as decimal.
             volume = parseInt(html(el).text().replace(/Volume /, '').trim(), 10)
             if (isNaN(volume)) return
             books.push(metadata(volume, title, authors, summary, description))
         } else {
             // A heading that was not "Volume N" leaves no book to attach
             // links to; skip instead of dereferencing undefined.
             const book = books[volume - 1]
             if (!book) return
             html('a', el).each((_, anchor) => {
                 book.links.push(html(anchor).attr('href'))
             })
         }
     })
     // Await the per-volume work so that failures reject run() itself
     // instead of being lost in discarded promises.
     await Promise.all(books.map(async (book) => {
         for (const link of book.links) {
             book.content.push(await fetchPage(link))
         }
         new Epub(book)
     }))
 }
 // Start one generation run per table-of-contents page; the runs are
 // started back to back and are not awaited.
 const sources = [
     ['https://aubergevagabonde.wordpress.com/sommaire/', ['Maroti', 'ElliVia', 'Pirateaba']],
     ['https://wanderinginn.com/table-of-contents/', ['Pirateaba']],
 ]
 for (const [url, authors] of sources) {
     run(url, authors)
 }
--- a/package.json
+++ b/package.json
@ -1,26 +0,0 @@
 {
  "name": "auberge_vagabonde_js",
  "version": "1.0.0",
  "description": "Convert The Wandering Inn Into Ebooks",
  "repository": {
    "type": "git",
    "url": "https://git.crystalyx.net/Xefir/Auberge_Vagabonde_JS"
  },
  "author": "Xéfir Destiny",
  "license": "ISC",
  "main": "index.js",
  "scripts": {
    "start": "node .",
    "lint": "eslint ."
  },
  "dependencies": {
    "cheerio": "1.0.0-rc.10",
    "epub-gen-funstory": "0.1.3",
    "node-fetch": "2"
  },
  "devDependencies": {
    "@types/node-fetch": "2",
    "eslint": "8.7.0"
  },
  "type": "module"
 }
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,3 @@
 requests==2.27.1
 beautifulsoup4==4.10.0
 git+https://git.crystalyx.net/Xefir/pypub@fix/py3#pypub
--- a/run.py
+++ b/run.py
@ -0,0 +1,69 @@
 import asyncio
 import gravesong
 import logging
 import os
 import requests
 import twi
 # Set the root logger and every logger present in the logging manager's
 # registry to INFO.
 loggers = [logging.getLogger()]
 loggers.extend(map(logging.getLogger, logging.root.manager.loggerDict))
 for logger in loggers:
     logger.setLevel(logging.INFO)
 # Directories that receive the generated e-books and the downloaded covers.
 output_epubs = "output/epubs"
 output_imgs = "output/imgs"

 # exist_ok makes creation idempotent without a separate isdir() check.
 os.makedirs(output_epubs, exist_ok=True)
 os.makedirs(output_imgs, exist_ok=True)

 # Download the cover images. Fail on an HTTP error rather than saving the
 # error page's body under a .jpg name.
 gravesong_img = requests.get(
     "https://wanderinginn.files.wordpress.com/2021/12/gravesong-by-boboplushie.jpg"
 )
 gravesong_img.raise_for_status()
 with open(f"{output_imgs}/gravesong-by-boboplushie.jpg", "wb") as f:
     f.write(gravesong_img.content)

 twi_img = requests.get(
     "https://i0.wp.com/thefantasyinn.com/wp-content/uploads/2018/08/twi.jpg"
 )
 twi_img.raise_for_status()
 with open(f"{output_imgs}/twi.jpg", "wb") as f:
     f.write(twi_img.content)
 async def main():
     """Run the three book builds concurrently and wait for all of them.

     One Gravesong build and two ``twi.process`` builds (one per table of
     contents URL) are awaited together with ``asyncio.gather``. Each call
     receives the metadata mapping for its book and the e-book output
     directory.
     """
     await asyncio.gather(
         gravesong.process(
             {
                 "creator": "Pirateaba",
                 "language": "en",
                 "publisher": "Xefir",
                 "cover": f"{output_imgs}/gravesong-by-boboplushie.jpg",
             },
             output_epubs,
         ),
         twi.process(
             {
                 "creator": "Pirateaba",
                 "language": "en",
                 "publisher": "Xefir",
                 "cover": f"{output_imgs}/twi.jpg",
             },
             output_epubs,
             "https://wanderinginn.com/table-of-contents/",
         ),
         twi.process(
             {
                 "creator": "Pirateaba",
                 "language": "fr",
                 "publisher": "Maroti, ElliVia",
                 "cover": f"{output_imgs}/twi.jpg",
             },
             output_epubs,
             "https://aubergevagabonde.wordpress.com/sommaire/",
         ),
     )


 # Only start the builds when executed as a script, not when imported.
 if __name__ == "__main__":
     asyncio.run(main())
--- a/twi.py
+++ b/twi.py
@ -0,0 +1,58 @@
 import asyncio
 import executor
 import bs4
 import pypub
 import requests
 def fetchVolume(title, metadata, volume_title, output_dir, links):
     """Download every chapter of one volume and write the volume's EPUB.

     Each linked page contributes one chapter, made from its
     ``div.entry-content`` with links, horizontal rules and tiled galleries
     removed, and titled after its ``h1.entry-title``.

     Args:
         title: Series title, used as the first part of the book title.
         metadata: Mapping of keyword arguments forwarded to ``pypub.Epub``.
         volume_title: Volume name, used as the second part of the book title.
         output_dir: Directory passed to ``Epub.create_epub``.
         links: Iterable of anchor elements whose ``href`` is a chapter URL.
     """
     # Unpack as keywords: ``*metadata`` would pass the dict's keys as
     # positional values instead of the values themselves.
     book = pypub.Epub(f"{title} - {volume_title}", **metadata)

     for link in links:
         chapter_response = requests.get(link["href"])
         chapter_html = bs4.BeautifulSoup(chapter_response.content, "html.parser")
         chapter_content = chapter_html.select_one("div.entry-content")
         chapter_title = chapter_html.select_one("h1.entry-title").get_text()

         # find_all() matches tag names, so "div.tiled-gallery" never matched
         # anything; select() is the call that understands CSS selectors.
         # Galleries come last so links nested in them are removed first.
         unwanted = (
             chapter_content.find_all("a")
             + chapter_content.find_all("hr")
             + chapter_content.select("div.tiled-gallery")
         )
         for removed in unwanted:
             removed.decompose()

         print(f"{title} - {chapter_title}")
         book.add_chapter(
             pypub.create_chapter_from_string(chapter_content.prettify(), chapter_title)
         )

     print(f"{title} - {volume_title}")
     book.create_epub(output_dir)
@executor.executor
def process(metadata, output_dir, url):
    """Build one EPUB per volume listed on a table-of-contents page.

    Paragraphs of the page containing a ``<strong>`` element name a volume;
    the links of the paragraphs that follow are that volume's chapters.
    Every volume is handed to ``fetchVolume`` and the volumes are fetched
    concurrently on worker threads.

    Args:
        metadata: Mapping forwarded unchanged to ``fetchVolume``.
        output_dir: Directory forwarded unchanged to ``fetchVolume``.
        url: Address of the table-of-contents page.
    """
    response = requests.get(url)
    html = bs4.BeautifulSoup(response.content, "html.parser")
    content = html.select("div.entry-content > p")
    title = html.select_one("#site-title > span > a").get_text()

    volume_title = None
    volumes = []

    for paragraph in content:
        if paragraph.strong is not None:
            volume_title = paragraph.strong.get_text()
            continue
        links = paragraph.find_all("a")
        # A paragraph without links has no chapters; skip it instead of
        # producing an empty book.
        if links:
            volumes.append((volume_title, links))

    async def fetch_all():
        # fetchVolume is a plain blocking function, so it has to be pushed
        # onto executor threads to become awaitable. Calling it directly
        # yields None, which asyncio.gather() rejects.
        loop = asyncio.get_running_loop()
        await asyncio.gather(
            *(
                loop.run_in_executor(
                    None, fetchVolume, title, metadata, name, output_dir, links
                )
                for name, links in volumes
            )
        )

    # asyncio.run() needs a coroutine and gather() needs a running loop, so
    # the gather call lives inside the coroutine above.
    asyncio.run(fetch_all())
--- a/yarn.lock
+++ b/yarn.lock