refactor: ♻️ refacto 2hdp parser
All checks were successful
dl / lint (push) Successful in 1m34s
dl / docker (push) Successful in 2m39s

Michel Roux 2024-12-21 21:58:33 +01:00
parent 6308e6790c
commit 6da2c52a06
4 changed files with 166 additions and 59 deletions


@@ -9,7 +9,7 @@ RUN apt-get update && \
     apt-get install -y \
    vim p7zip* git mc lftp rename wget curl procps psmisc \
    openssh-client transmission-cli speedtest-cli \
-   python3-requests python3-transmissionrpc && \
+   python3-bs4 python3-requests python3-transmissionrpc && \
    apt-get install -y --no-install-recommends mame-tools yt-dlp && \
    rm -rf /var/lib/apt/lists/* && \
    curl -sSL https://raw.githubusercontent.com/MatanZ/tremc/main/tremc -o /usr/local/bin/tremc && \
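The only change in this layer is the new python3-bs4 package, Debian's build of BeautifulSoup, which the refactored parser below imports to scrape search-result HTML; the rest of the RUN instruction is untouched.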


@@ -1,85 +1,133 @@
 #!/usr/bin/env python3
 from argparse import ArgumentParser
 from csv import DictWriter
+from datetime import datetime
 from sys import stdout
 from time import sleep
 from xml.etree import ElementTree
 
+from bs4 import BeautifulSoup
 from requests import get
-from transmissionrpc import Client, DefaultHTTPHandler # type: ignore
 
 parser = ArgumentParser()
-parser.add_argument("-u", "--username", required=True)
-parser.add_argument("-p", "--password", required=True)
+parser.add_argument(
+    "--keyword", action="append", default=["mhdgz"], help="Prefered words to search on names"
+)
+parser.add_argument("--cookie", required=True, help="Cookies to bypass CloudFlare")
+parser.add_argument("--user-agent", required=True, help="User Agent to bypass CloudFlare")
+parser.add_argument(
+    "--feed", required=True, action="append", help="RSS feed to search for new movies"
+)
 args = parser.parse_args()
 
-class CustomHTTPHandler(DefaultHTTPHandler):
-    def request(self, url, query, headers, timeout):
-        headers["User-Agent"] = (
-            "Mozilla/5.0 (X11; Linux x86_64; rv:111.0) Gecko/20100101 Firefox/111.0"
-        )
-        sleep(1)
-        return super().request(url, query, headers, timeout)
+def nb_files(id: int):
+    response = get(
+        f"https://www.ygg.re/engine/get_files?torrent={id}",
+        headers={"Cookie": args.cookie, "User-Agent": args.user_agent},
+    )
+    sleep(1)
+    json = response.json()
+    soup = BeautifulSoup(json["html"], "html.parser")
+    rows = soup.find_all("tr")
+    length = 0
+    for row in rows:
+        columns = row.find_all("td")
+        if columns[1].text.strip()[-3:] == "mkv":
+            length += 1
+    return length
 
-client = Client(
-    "https://torrent.crystalyx.net/transmission/rpc",
-    port=443,
-    user=args.username,
-    password=args.password,
-    http_handler=CustomHTTPHandler(),
-)
+def parse_size(size):
+    # https://stackoverflow.com/a/42865957
+    units = {"Ko": 10**3, "Mo": 10**6, "Go": 10**9, "To": 10**12}
+    number = size[:-2]
+    unit = size[-2:]
+    return int(float(number) * units[unit])
 
-movies = []
-torrents = client.get_torrents()
-writer = DictWriter(stdout, fieldnames=["season", "title", "hash", "url"])
+def is_valid(id: int, size: int, completed: int):
+    return size < parse_size("10Go") and completed > 10 and nb_files(id) == 1
+
+writer = DictWriter(stdout, fieldnames=["title", "season", "episode", "name", "link"])
 writer.writeheader()
 
-tree = ElementTree.fromstring(get("https://feed.ausha.co/Loa7srdWGm1b").text)
+for feed in args.feed:
+    tree = ElementTree.fromstring(get(feed).text)
+    loop = 0
 
-for item in tree.findall(".//item"):
-    title = item.find("title")
-    season = item.find("itunes:season", {"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"})
+    for item in tree.findall(".//item")[::-1]:
+        loop += 1
+        title = str(item.findtext("title")).strip()
+        season = int(
+            item.findtext(
+                "itunes:season", namespaces={"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"}
+            )
+            or datetime.strptime(item.findtext("pubDate") or "", "%a, %d %b %Y %H:%M:%S %z").year
+        )
+        episode = int(
+            item.findtext(
+                "itunes:episode",
+                namespaces={"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"},
+            )
+            or loop
+        )
 
-    if season is None or title is None or title.text is None:
-        continue
+        response = get(
+            f"https://www.ygg.re/engine/search?name={title}"
+            "&description=&file=&uploader=&category=2145&sub_category=2183"
+            "&do=search&order=asc&sort=publish_date",
+            headers={"Cookie": args.cookie, "User-Agent": args.user_agent},
+        )
+        sleep(1)
 
-    row = {
-        "title": title.text,
-        "season": f"Saison {season.text}",
-        "hash": "",
-        "url": f"https://www.ygg.re/engine/search?name={title.text}&description=&file=&uploader=&category=2145&sub_category=2183&option_langue:multiple[0]=4&do=search&order=asc&sort=publish_date",
-    }
+        soup = BeautifulSoup(response.text, "html.parser")
+        rows = soup.select("table.table tr")
+        column = None
 
-    for torrent in torrents:
-        if title.text.lower() in torrent.name.lower():
-            row["hash"] = torrent.hashString
-            break
+        for row in rows:
+            columns = row.find_all("td")
+            if len(columns) == 0:
+                continue
+            if any(keyword.lower() in columns[1].text.strip().lower() for keyword in args.keyword):
+                if is_valid(
+                    int(columns[2].a["target"]),
+                    parse_size(columns[5].text.strip()),
+                    int(columns[6].text.strip()),
+                ):
+                    column = columns
+                    break
 
-    movies.append(title.text.lower())
-    writer.writerow(row)
+        if column is None:
+            for row in rows:
+                columns = row.find_all("td")
+                if len(columns) == 0:
+                    continue
+                if "multi" in columns[1].text.strip().lower():
+                    if is_valid(
+                        int(columns[2].a["target"]),
+                        parse_size(columns[5].text.strip()),
+                        int(columns[6].text.strip()),
+                    ):
+                        column = columns
+                        break
 
-tree = ElementTree.fromstring(
-    get("https://www.calvinballconsortium.fr/podcasts/leretourdujeudi/feed.xml").text
-)
-
-for item in tree.findall(".//item"):
-    title = item.find("title")
-
-    if title is None or title.text is None or title.text.lower() in movies:
-        continue
-
-    row = {
-        "title": title.text,
-        "season": "Jeudi",
-        "hash": "",
-        "url": f"https://www.ygg.re/engine/search?name={title.text}&description=&file=&uploader=&category=2145&sub_category=2183&option_langue:multiple[0]=4&do=search&order=asc&sort=publish_date",
-    }
-
-    for torrent in torrents:
-        if title.text.lower() in torrent.name.lower():
-            row["hash"] = torrent.hashString
-            break
-
-    movies.append(title.text.lower())
-    writer.writerow(row)
+        writer.writerow(
+            {
+                "title": title,
+                "season": season,
+                "episode": episode,
+                "name": column[1].text.strip() if column else "",
+                "link": column[1].a["href"] if column else "",
+            }
+        )
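The subtlest part of the refactor is the season/episode derivation: when a feed item carries itunes:season and itunes:episode tags they win, otherwise the parser falls back to the pubDate year and to the item's position in the chronologically reversed feed. A minimal, self-contained sketch of that fallback; the two-item feed XML below is invented for illustration, not taken from any real podcast:

#!/usr/bin/env python3
# Standalone illustration of the season/episode fallback used by the
# refactored parser: itunes:season -> pubDate year, itunes:episode -> position.
from datetime import datetime
from xml.etree import ElementTree

ITUNES = {"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"}

# Hypothetical feed: first item relies on the fallbacks, second is tagged explicitly.
FEED = """<rss xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">
<channel>
  <item>
    <title>Episode B</title>
    <pubDate>Sat, 21 Dec 2024 10:00:00 +0100</pubDate>
  </item>
  <item>
    <title>Episode A</title>
    <itunes:season>9</itunes:season>
    <itunes:episode>3</itunes:episode>
    <pubDate>Sat, 14 Dec 2024 10:00:00 +0100</pubDate>
  </item>
</channel>
</rss>"""

tree = ElementTree.fromstring(FEED)
loop = 0
# RSS feeds list newest first; [::-1] walks them oldest-first so the
# positional episode counter increases chronologically.
for item in tree.findall(".//item")[::-1]:
    loop += 1
    title = str(item.findtext("title")).strip()
    season = int(
        item.findtext("itunes:season", namespaces=ITUNES)
        or datetime.strptime(item.findtext("pubDate") or "", "%a, %d %b %Y %H:%M:%S %z").year
    )
    episode = int(item.findtext("itunes:episode", namespaces=ITUNES) or loop)
    print(title, season, episode)
# Episode A 9 3      (explicit tags win)
# Episode B 2024 2   (falls back to pubDate year and position)

Torrent selection then goes through is_valid, which keeps a candidate only if it is under 10 Go, has more than 10 completed downloads, and contains exactly one .mkv file; that last check costs an extra get_files request per candidate, which is why every request is followed by sleep(1).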

poetry.lock (generated)

@@ -1,5 +1,26 @@
 # This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.12.3"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.6.0"
+files = [
+    {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
+    {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
+]
+
+[package.dependencies]
+soupsieve = ">1.2"
+
+[package.extras]
+cchardet = ["cchardet"]
+chardet = ["chardet"]
+charset-normalizer = ["charset-normalizer"]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 name = "certifi"
 version = "2024.12.14"

@@ -251,6 +272,17 @@ files = [
     {file = "ruff-0.8.4.tar.gz", hash = "sha256:0d5f89f254836799af1615798caa5f80b7f935d7a670fad66c5007928e57ace8"},
 ]
 
+[[package]]
+name = "soupsieve"
+version = "2.6"
+description = "A modern CSS selector implementation for Beautiful Soup."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"},
+    {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"},
+]
+
 [[package]]
 name = "tomli"
 version = "2.2.1"

@@ -303,6 +335,31 @@ files = [
     {file = "transmissionrpc_ng-0.13.1-py3-none-any.whl", hash = "sha256:092954f0571392cd6ff498ea9b5a55de3b20f68fd919d6fe94e4f5e112a750bc"},
 ]
 
+[[package]]
+name = "types-beautifulsoup4"
+version = "4.12.0.20241020"
+description = "Typing stubs for beautifulsoup4"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "types-beautifulsoup4-4.12.0.20241020.tar.gz", hash = "sha256:158370d08d0cd448bd11b132a50ff5279237a5d4b5837beba074de152a513059"},
+    {file = "types_beautifulsoup4-4.12.0.20241020-py3-none-any.whl", hash = "sha256:c95e66ce15a4f5f0835f7fbc5cd886321ae8294f977c495424eaf4225307fd30"},
+]
+
+[package.dependencies]
+types-html5lib = "*"
+
+[[package]]
+name = "types-html5lib"
+version = "1.1.11.20241018"
+description = "Typing stubs for html5lib"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "types-html5lib-1.1.11.20241018.tar.gz", hash = "sha256:98042555ff78d9e3a51c77c918b1041acbb7eb6c405408d8a9e150ff5beccafa"},
+    {file = "types_html5lib-1.1.11.20241018-py3-none-any.whl", hash = "sha256:3f1e064d9ed2c289001ae6392c84c93833abb0816165c6ff0abfc304a779f403"},
+]
+
 [[package]]
 name = "types-requests"
 version = "2.32.0.20241016"

@@ -348,4 +405,4 @@ zstd = ["zstandard (>=0.18.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "b1f91153e63e166bbdef0bab71c32f2b9033b0162b752b2ae569e73f6949ec87"
+content-hash = "5f000dd0b221e422f68ef075abe504a8ed6776ab8065ff8c173a072e95ad7307"


@@ -7,12 +7,14 @@ package-mode = false
 
 [tool.poetry.dependencies]
 python = "^3.10"
+beautifulsoup4 = "^4.12.3"
 transmissionrpc-ng = "~0.13.1"
 requests = "^2.32.3"
 
 [tool.poetry.group.dev.dependencies]
 ruff = "^0.8.4"
 mypy = "^1.14.0"
+types-beautifulsoup4 = "^4.12.0"
 types-requests = "^2.32.0"
 
 [tool.ruff]