fix: improve mhdgz performance
All checks were successful
dl / lint (push) Successful in 1m45s
dl / docker (push) Successful in 2m48s

This commit is contained in:
Michel Roux 2024-12-22 18:57:16 +01:00
parent 2537713f9f
commit 97931f0c56

View File

@ -2,7 +2,7 @@
from argparse import ArgumentParser
from csv import DictWriter
from datetime import datetime
from sys import stdout
from sys import stderr, stdout
from time import sleep
from xml.etree import ElementTree
@ -10,9 +10,6 @@ from bs4 import BeautifulSoup
from requests import get
parser = ArgumentParser()
parser.add_argument(
"--keyword", action="append", default=["mhdgz"], help="Prefered words to search on names"
)
parser.add_argument("--cookie", required=True, help="Cookies to bypass CloudFlare")
parser.add_argument("--user-agent", required=True, help="User Agent to bypass CloudFlare")
parser.add_argument(
@ -56,8 +53,45 @@ def is_valid(id: int, size: int, completed: int):
return size < parse_size("10Go") and completed > 10 and nb_files(id)
def parse_ygg(title: str, page: int = 0):
    """Search YGG for *title* and return the matching result rows.

    Fetches a single results page (``page`` is a 0-based item offset, not a
    page number), parses the HTML results table and returns, for each hit,
    the list of ``<td>`` cells of its row. Header/separator rows without
    data cells are dropped. Sleeps one second after the request as crude
    rate limiting.
    """
    url = (
        f"https://www.ygg.re/engine/search?name={title}"
        "&description=&file=&uploader=&category=2145&sub_category=2183"
        f"&do=search&order=asc&sort=publish_date&page={page}"
    )
    # Cookie + User-Agent come from the CLI args and bypass CloudFlare.
    response = get(
        url,
        headers={"Cookie": args.cookie, "User-Agent": args.user_agent},
    )
    sleep(1)
    soup = BeautifulSoup(response.text, "html.parser")
    # Keep only rows that actually carry <td> data cells.
    return [
        cells
        for cells in (row.find_all("td") for row in soup.select("table.table tr"))
        if cells
    ]
# Pre-fetch every MHDGZ release once, walking YGG's paginated results
# until an empty page is returned (YGG paginates by a 50-item offset).
mhdgz = []
page = 0
while True:
    print(f"Parsing YGG page {page}...", file=stderr)
    batch = parse_ygg("mhdgz", page)
    if not batch:
        break
    mhdgz.extend(batch)
    page += 50
writer = DictWriter(stdout, fieldnames=["title", "season", "episode", "name", "link"])
writer.writeheader()
stdout.flush()
for feed in args.feed:
tree = ElementTree.fromstring(get(feed).text)
@ -65,6 +99,7 @@ for feed in args.feed:
for item in tree.findall(".//item")[::-1]:
loop += 1
movie = None
title = str(item.findtext("title")).strip()
season = int(
item.findtext(
@ -80,47 +115,22 @@ for feed in args.feed:
or loop
)
response = get(
f"https://www.ygg.re/engine/search?name={title}"
"&description=&file=&uploader=&category=2145&sub_category=2183"
"&do=search&order=asc&sort=publish_date",
headers={"Cookie": args.cookie, "User-Agent": args.user_agent},
)
sleep(1)
for mhd in mhdgz:
if title.lower() in mhd[1].text.strip().lower():
movie = mhd
break
soup = BeautifulSoup(response.text, "html.parser")
rows = soup.select("table.table tr")
column = None
if not movie:
rows = parse_ygg(title)
for row in rows:
columns = row.find_all("td")
if len(columns) == 0:
continue
if any(keyword.lower() in columns[1].text.strip().lower() for keyword in args.keyword):
if is_valid(
int(columns[2].a["target"]),
parse_size(columns[5].text.strip()),
int(columns[6].text.strip()),
):
column = columns
break
if column is None:
for row in rows:
columns = row.find_all("td")
if len(columns) == 0:
continue
if "multi" in columns[1].text.strip().lower():
if "multi" in row[1].text.strip().lower():
if is_valid(
int(columns[2].a["target"]),
parse_size(columns[5].text.strip()),
int(columns[6].text.strip()),
int(row[2].a["target"]),
parse_size(row[5].text.strip()),
int(row[6].text.strip()),
):
column = columns
movie = row
break
writer.writerow(
@ -128,8 +138,8 @@ for feed in args.feed:
"title": title,
"season": season,
"episode": episode,
"name": column[1].text.strip() if column else "",
"link": column[1].a["href"] if column else "",
"name": movie[1].text.strip() if movie else "",
"link": movie[1].a["href"] if movie else "",
}
)
stdout.flush()