fix: improve mhdgz performance
All checks were successful
dl / lint (push) Successful in 1m45s
dl / docker (push) Successful in 2m48s

This commit is contained in:
Michel Roux 2024-12-22 18:57:16 +01:00
parent 2537713f9f
commit 97931f0c56

View File

@ -2,7 +2,7 @@
from argparse import ArgumentParser from argparse import ArgumentParser
from csv import DictWriter from csv import DictWriter
from datetime import datetime from datetime import datetime
from sys import stdout from sys import stderr, stdout
from time import sleep from time import sleep
from xml.etree import ElementTree from xml.etree import ElementTree
@ -10,9 +10,6 @@ from bs4 import BeautifulSoup
from requests import get from requests import get
parser = ArgumentParser() parser = ArgumentParser()
parser.add_argument(
"--keyword", action="append", default=["mhdgz"], help="Prefered words to search on names"
)
parser.add_argument("--cookie", required=True, help="Cookies to bypass CloudFlare") parser.add_argument("--cookie", required=True, help="Cookies to bypass CloudFlare")
parser.add_argument("--user-agent", required=True, help="User Agent to bypass CloudFlare") parser.add_argument("--user-agent", required=True, help="User Agent to bypass CloudFlare")
parser.add_argument( parser.add_argument(
@ -56,8 +53,45 @@ def is_valid(id: int, size: int, completed: int):
return size < parse_size("10Go") and completed > 10 and nb_files(id) return size < parse_size("10Go") and completed > 10 and nb_files(id)
def parse_ygg(title: str, page: int = 0):
    """Fetch one page of YGG search results for *title*.

    Queries the site's movie/TV category (category=2145, sub_category=2183),
    ordered by publish date ascending, starting at result offset *page*.

    Returns a list where each element is the list of ``<td>`` cells of one
    result row (header/empty rows are skipped).  An empty list means there
    are no more results at this offset.
    """
    response = get(
        f"https://www.ygg.re/engine/search?name={title}"
        "&description=&file=&uploader=&category=2145&sub_category=2183"
        f"&do=search&order=asc&sort=publish_date&page={page}",
        headers={"Cookie": args.cookie, "User-Agent": args.user_agent},
    )
    # Throttle requests to avoid tripping the site's rate limiting.
    sleep(1)
    soup = BeautifulSoup(response.text, "html.parser")
    # Keep only rows that actually contain data cells (drops the header row).
    cell_lists = (row.find_all("td") for row in soup.select("table.table tr"))
    return [columns for columns in cell_lists if columns]
# Collect every "mhdgz" release up front, paging through the search results
# (offset advances by 50 per page) until a page comes back empty.
mhdgz = []
page = 0
while True:
    print(f"Parsing YGG page {page}...", file=stderr)
    batch = parse_ygg("mhdgz", page)
    if not batch:
        break
    mhdgz.extend(batch)
    page += 50
writer = DictWriter(stdout, fieldnames=["title", "season", "episode", "name", "link"]) writer = DictWriter(stdout, fieldnames=["title", "season", "episode", "name", "link"])
writer.writeheader() writer.writeheader()
stdout.flush()
for feed in args.feed: for feed in args.feed:
tree = ElementTree.fromstring(get(feed).text) tree = ElementTree.fromstring(get(feed).text)
@ -65,6 +99,7 @@ for feed in args.feed:
for item in tree.findall(".//item")[::-1]: for item in tree.findall(".//item")[::-1]:
loop += 1 loop += 1
movie = None
title = str(item.findtext("title")).strip() title = str(item.findtext("title")).strip()
season = int( season = int(
item.findtext( item.findtext(
@ -80,47 +115,22 @@ for feed in args.feed:
or loop or loop
) )
response = get( for mhd in mhdgz:
f"https://www.ygg.re/engine/search?name={title}" if title.lower() in mhd[1].text.strip().lower():
"&description=&file=&uploader=&category=2145&sub_category=2183" movie = mhd
"&do=search&order=asc&sort=publish_date", break
headers={"Cookie": args.cookie, "User-Agent": args.user_agent},
)
sleep(1)
soup = BeautifulSoup(response.text, "html.parser") if not movie:
rows = soup.select("table.table tr") rows = parse_ygg(title)
column = None
for row in rows:
columns = row.find_all("td")
if len(columns) == 0:
continue
if any(keyword.lower() in columns[1].text.strip().lower() for keyword in args.keyword):
if is_valid(
int(columns[2].a["target"]),
parse_size(columns[5].text.strip()),
int(columns[6].text.strip()),
):
column = columns
break
if column is None:
for row in rows: for row in rows:
columns = row.find_all("td") if "multi" in row[1].text.strip().lower():
if len(columns) == 0:
continue
if "multi" in columns[1].text.strip().lower():
if is_valid( if is_valid(
int(columns[2].a["target"]), int(row[2].a["target"]),
parse_size(columns[5].text.strip()), parse_size(row[5].text.strip()),
int(columns[6].text.strip()), int(row[6].text.strip()),
): ):
column = columns movie = row
break break
writer.writerow( writer.writerow(
@ -128,8 +138,8 @@ for feed in args.feed:
"title": title, "title": title,
"season": season, "season": season,
"episode": episode, "episode": episode,
"name": column[1].text.strip() if column else "", "name": movie[1].text.strip() if movie else "",
"link": column[1].a["href"] if column else "", "link": movie[1].a["href"] if movie else "",
} }
) )
stdout.flush() stdout.flush()