From 6da2c52a06d06780b04e4b417e4c58c1656ad543 Mon Sep 17 00:00:00 2001 From: Michel Roux Date: Sat, 21 Dec 2024 21:58:33 +0100 Subject: [PATCH] refactor: :recycle: refacto 2hdp parser --- Dockerfile | 2 +- commands/2hdp.py | 162 ++++++++++++++++++++++++++++++----------------- poetry.lock | 59 ++++++++++++++++- pyproject.toml | 2 + 4 files changed, 166 insertions(+), 59 deletions(-) diff --git a/Dockerfile b/Dockerfile index cd97d7e..3e3b636 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ RUN apt-get update && \ apt-get install -y \ vim p7zip* git mc lftp rename wget curl procps psmisc \ openssh-client transmission-cli speedtest-cli \ - python3-requests python3-transmissionrpc && \ + python3-bs4 python3-requests python3-transmissionrpc && \ apt-get install -y --no-install-recommends mame-tools yt-dlp && \ rm -rf /var/lib/apt/lists/* && \ curl -sSL https://raw.githubusercontent.com/MatanZ/tremc/main/tremc -o /usr/local/bin/tremc && \ diff --git a/commands/2hdp.py b/commands/2hdp.py index c2d87a3..a415969 100755 --- a/commands/2hdp.py +++ b/commands/2hdp.py @@ -1,85 +1,133 @@ #!/usr/bin/env python3 from argparse import ArgumentParser from csv import DictWriter +from datetime import datetime from sys import stdout +from time import sleep from xml.etree import ElementTree +from bs4 import BeautifulSoup from requests import get -from transmissionrpc import Client, DefaultHTTPHandler # type: ignore parser = ArgumentParser() -parser.add_argument("-u", "--username", required=True) -parser.add_argument("-p", "--password", required=True) +parser.add_argument( + "--keyword", action="append", default=["mhdgz"], help="Prefered words to search on names" +) +parser.add_argument("--cookie", required=True, help="Cookies to bypass CloudFlare") +parser.add_argument("--user-agent", required=True, help="User Agent to bypass CloudFlare") +parser.add_argument( + "--feed", required=True, action="append", help="RSS feed to search for new movies" +) args = parser.parse_args() -class CustomHTTPHandler(DefaultHTTPHandler): - def request(self, url, query, headers, timeout): - headers["User-Agent"] = ( - "Mozilla/5.0 (X11; Linux x86_64; rv:111.0) Gecko/20100101 Firefox/111.0" - ) +def nb_files(id: int): + response = get( + f"https://www.ygg.re/engine/get_files?torrent={id}", + headers={"Cookie": args.cookie, "User-Agent": args.user_agent}, + ) + sleep(1) - return super().request(url, query, headers, timeout) + json = response.json() + soup = BeautifulSoup(json["html"], "html.parser") + rows = soup.find_all("tr") + length = 0 + + for row in rows: + columns = row.find_all("td") + + if columns[1].text.strip()[-3:] == "mkv": + length += 1 + + return length -client = Client( - "https://torrent.crystalyx.net/transmission/rpc", - port=443, - user=args.username, - password=args.password, - http_handler=CustomHTTPHandler(), -) +def parse_size(size): + # https://stackoverflow.com/a/42865957 + units = {"Ko": 10**3, "Mo": 10**6, "Go": 10**9, "To": 10**12} + number = size[:-2] + unit = size[-2:] -movies = [] -torrents = client.get_torrents() -writer = DictWriter(stdout, fieldnames=["season", "title", "hash", "url"]) + return int(float(number) * units[unit]) + + +def is_valid(id: int, size: int, completed: int): + return size < parse_size("10Go") and completed > 10 and nb_files(id) == 1 + + +writer = DictWriter(stdout, fieldnames=["title", "season", "episode", "name", "link"]) writer.writeheader() -tree = ElementTree.fromstring(get("https://feed.ausha.co/Loa7srdWGm1b").text) +for feed in args.feed: + tree = ElementTree.fromstring(get(feed).text) + loop = 0 -for item in tree.findall(".//item"): - title = item.find("title") - season = item.find("itunes:season", {"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"}) + for item in tree.findall(".//item")[::-1]: + loop += 1 + title = str(item.findtext("title")).strip() + season = int( + item.findtext( + "itunes:season", namespaces={"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"} + ) + or datetime.strptime(item.findtext("pubDate") or "", "%a, %d %b %Y %H:%M:%S %z").year + ) + episode = int( + item.findtext( + "itunes:episode", + namespaces={"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"}, + ) + or loop + ) - if season is None or title is None or title.text is None: - continue + response = get( + f"https://www.ygg.re/engine/search?name={title}" + "&description=&file=&uploader=&category=2145&sub_category=2183" + "&do=search&order=asc&sort=publish_date", + headers={"Cookie": args.cookie, "User-Agent": args.user_agent}, + ) + sleep(1) - row = { - "title": title.text, - "season": f"Saison {season.text}", - "hash": "", - "url": f"https://www.ygg.re/engine/search?name={title.text}&description=&file=&uploader=&category=2145&sub_category=2183&option_langue:multiple[0]=4&do=search&order=asc&sort=publish_date", - } + soup = BeautifulSoup(response.text, "html.parser") + rows = soup.select("table.table tr") + column = None - for torrent in torrents: - if title.text.lower() in torrent.name.lower(): - row["hash"] = torrent.hashString - break + for row in rows: + columns = row.find_all("td") - movies.append(title.text.lower()) - writer.writerow(row) + if len(columns) == 0: + continue -tree = ElementTree.fromstring( - get("https://www.calvinballconsortium.fr/podcasts/leretourdujeudi/feed.xml").text -) + if any(keyword.lower() in columns[1].text.strip().lower() for keyword in args.keyword): + if is_valid( + int(columns[2].a["target"]), + parse_size(columns[5].text.strip()), + int(columns[6].text.strip()), + ): + column = columns + break -for item in tree.findall(".//item"): - title = item.find("title") + if column is None: + for row in rows: + columns = row.find_all("td") - if title is None or title.text is None or title.text.lower() in movies: - continue + if len(columns) == 0: + continue - row = { - "title": title.text, - "season": "Jeudi", - "hash": "", - "url": f"https://www.ygg.re/engine/search?name={title.text}&description=&file=&uploader=&category=2145&sub_category=2183&option_langue:multiple[0]=4&do=search&order=asc&sort=publish_date", - } + if "multi" in columns[1].text.strip().lower(): + if is_valid( + int(columns[2].a["target"]), + parse_size(columns[5].text.strip()), + int(columns[6].text.strip()), + ): + column = columns + break - for torrent in torrents: - if title.text.lower() in torrent.name.lower(): - row["hash"] = torrent.hashString - break - - movies.append(title.text.lower()) - writer.writerow(row) + writer.writerow( + { + "title": title, + "season": season, + "episode": episode, + "name": column[1].text.strip() if column else "", + "link": column[1].a["href"] if column else "", + } + ) diff --git a/poetry.lock b/poetry.lock index a50105f..447a987 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,26 @@ # This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +[[package]] +name = "beautifulsoup4" +version = "4.12.3" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, + {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "certifi" version = "2024.12.14" @@ -251,6 +272,17 @@ files = [ {file = "ruff-0.8.4.tar.gz", hash = "sha256:0d5f89f254836799af1615798caa5f80b7f935d7a670fad66c5007928e57ace8"}, ] +[[package]] +name = "soupsieve" +version = "2.6" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"}, + {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"}, +] + [[package]] name = "tomli" version = "2.2.1" @@ -303,6 +335,31 @@ files = [ {file = "transmissionrpc_ng-0.13.1-py3-none-any.whl", hash = "sha256:092954f0571392cd6ff498ea9b5a55de3b20f68fd919d6fe94e4f5e112a750bc"}, ] +[[package]] +name = "types-beautifulsoup4" +version = "4.12.0.20241020" +description = "Typing stubs for beautifulsoup4" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-beautifulsoup4-4.12.0.20241020.tar.gz", hash = "sha256:158370d08d0cd448bd11b132a50ff5279237a5d4b5837beba074de152a513059"}, + {file = "types_beautifulsoup4-4.12.0.20241020-py3-none-any.whl", hash = "sha256:c95e66ce15a4f5f0835f7fbc5cd886321ae8294f977c495424eaf4225307fd30"}, +] + +[package.dependencies] +types-html5lib = "*" + +[[package]] +name = "types-html5lib" +version = "1.1.11.20241018" +description = "Typing stubs for html5lib" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-html5lib-1.1.11.20241018.tar.gz", hash = "sha256:98042555ff78d9e3a51c77c918b1041acbb7eb6c405408d8a9e150ff5beccafa"}, + {file = "types_html5lib-1.1.11.20241018-py3-none-any.whl", hash = "sha256:3f1e064d9ed2c289001ae6392c84c93833abb0816165c6ff0abfc304a779f403"}, +] + [[package]] name = "types-requests" version = "2.32.0.20241016" @@ -348,4 +405,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "b1f91153e63e166bbdef0bab71c32f2b9033b0162b752b2ae569e73f6949ec87" +content-hash = "5f000dd0b221e422f68ef075abe504a8ed6776ab8065ff8c173a072e95ad7307" diff --git a/pyproject.toml b/pyproject.toml index c3c751e..735af7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,12 +7,14 @@ package-mode = false [tool.poetry.dependencies] python = "^3.10" +beautifulsoup4 = "^4.12.3" transmissionrpc-ng = "~0.13.1" requests = "^2.32.3" [tool.poetry.group.dev.dependencies] ruff = "^0.8.4" mypy = "^1.14.0" +types-beautifulsoup4 = "^4.12.0" types-requests = "^2.32.0" [tool.ruff]