refactor: ♻️ refactor 2hdp parser

2024-12-21 21:58:33 +01:00 · 2024-12-21 21:58:33 +01:00 · 6da2c52a06
commit 6da2c52a06
parent 6308e6790c
4 changed files with 166 additions and 59 deletions
--- a/2
+++ b/2
@ -9,7 +9,7 @@ RUN apt-get update && \
    apt-get install -y \
      vim p7zip* git mc lftp rename wget curl procps psmisc \
      openssh-client transmission-cli speedtest-cli \
-      python3-requests python3-transmissionrpc && \
+      python3-bs4 python3-requests python3-transmissionrpc && \
    apt-get install -y --no-install-recommends mame-tools yt-dlp && \
    rm -rf /var/lib/apt/lists/* && \
    curl -sSL https://raw.githubusercontent.com/MatanZ/tremc/main/tremc -o /usr/local/bin/tremc && \
--- a/commands/2hdp.py
+++ b/commands/2hdp.py
@ -1,85 +1,133 @@
 #!/usr/bin/env python3
 from argparse import ArgumentParser
 from csv import DictWriter
+from datetime import datetime
 from sys import stdout
+from time import sleep
 from xml.etree import ElementTree

+from bs4 import BeautifulSoup
 from requests import get
-from transmissionrpc import Client, DefaultHTTPHandler  # type: ignore

 parser = ArgumentParser()
-parser.add_argument("-u", "--username", required=True)
-parser.add_argument("-p", "--password", required=True)
+parser.add_argument(
+    "--keyword", action="append", default=["mhdgz"], help="Prefered words to search on names"
+)
+parser.add_argument("--cookie", required=True, help="Cookies to bypass CloudFlare")
+parser.add_argument("--user-agent", required=True, help="User Agent to bypass CloudFlare")
+parser.add_argument(
+    "--feed", required=True, action="append", help="RSS feed to search for new movies"
+)
 args = parser.parse_args()


-class CustomHTTPHandler(DefaultHTTPHandler):
-    def request(self, url, query, headers, timeout):
-        headers["User-Agent"] = (
-            "Mozilla/5.0 (X11; Linux x86_64; rv:111.0) Gecko/20100101 Firefox/111.0"
-        )
+def nb_files(id: int):
+    response = get(
+        f"https://www.ygg.re/engine/get_files?torrent={id}",
+        headers={"Cookie": args.cookie, "User-Agent": args.user_agent},
+    )
+    sleep(1)

-        return super().request(url, query, headers, timeout)
+    json = response.json()
+    soup = BeautifulSoup(json["html"], "html.parser")
+    rows = soup.find_all("tr")
+    length = 0
+
+    for row in rows:
+        columns = row.find_all("td")
+
+        if columns[1].text.strip()[-3:] == "mkv":
+            length += 1
+
+    return length


-client = Client(
-    "https://torrent.crystalyx.net/transmission/rpc",
-    port=443,
-    user=args.username,
-    password=args.password,
-    http_handler=CustomHTTPHandler(),
-)
+def parse_size(size):
+    # https://stackoverflow.com/a/42865957
+    units = {"Ko": 10**3, "Mo": 10**6, "Go": 10**9, "To": 10**12}
+    number = size[:-2]
+    unit = size[-2:]

-movies = []
-torrents = client.get_torrents()
-writer = DictWriter(stdout, fieldnames=["season", "title", "hash", "url"])
+    return int(float(number) * units[unit])
+
+
+def is_valid(id: int, size: int, completed: int):
+    return size < parse_size("10Go") and completed > 10 and nb_files(id) == 1
+
+
+writer = DictWriter(stdout, fieldnames=["title", "season", "episode", "name", "link"])
 writer.writeheader()

-tree = ElementTree.fromstring(get("https://feed.ausha.co/Loa7srdWGm1b").text)
+for feed in args.feed:
+    tree = ElementTree.fromstring(get(feed).text)
+    loop = 0

-for item in tree.findall(".//item"):
-    title = item.find("title")
-    season = item.find("itunes:season", {"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"})
+    for item in tree.findall(".//item")[::-1]:
+        loop += 1
+        title = str(item.findtext("title")).strip()
+        season = int(
+            item.findtext(
+                "itunes:season", namespaces={"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"}
+            )
+            or datetime.strptime(item.findtext("pubDate") or "", "%a, %d %b %Y %H:%M:%S %z").year
+        )
+        episode = int(
+            item.findtext(
+                "itunes:episode",
+                namespaces={"itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd"},
+            )
+            or loop
+        )

-    if season is None or title is None or title.text is None:
-        continue
+        response = get(
+            f"https://www.ygg.re/engine/search?name={title}"
+            "&description=&file=&uploader=&category=2145&sub_category=2183"
+            "&do=search&order=asc&sort=publish_date",
+            headers={"Cookie": args.cookie, "User-Agent": args.user_agent},
+        )
+        sleep(1)

-    row = {
-        "title": title.text,
-        "season": f"Saison {season.text}",
-        "hash": "",
-        "url": f"https://www.ygg.re/engine/search?name={title.text}&description=&file=&uploader=&category=2145&sub_category=2183&option_langue:multiple[0]=4&do=search&order=asc&sort=publish_date",
-    }
+        soup = BeautifulSoup(response.text, "html.parser")
+        rows = soup.select("table.table tr")
+        column = None

-    for torrent in torrents:
-        if title.text.lower() in torrent.name.lower():
-            row["hash"] = torrent.hashString
-            break
+        for row in rows:
+            columns = row.find_all("td")

-    movies.append(title.text.lower())
-    writer.writerow(row)
+            if len(columns) == 0:
+                continue

-tree = ElementTree.fromstring(
-    get("https://www.calvinballconsortium.fr/podcasts/leretourdujeudi/feed.xml").text
-)
+            if any(keyword.lower() in columns[1].text.strip().lower() for keyword in args.keyword):
+                if is_valid(
+                    int(columns[2].a["target"]),
+                    parse_size(columns[5].text.strip()),
+                    int(columns[6].text.strip()),
+                ):
+                    column = columns
+                    break

-for item in tree.findall(".//item"):
-    title = item.find("title")
+        if column is None:
+            for row in rows:
+                columns = row.find_all("td")

-    if title is None or title.text is None or title.text.lower() in movies:
-        continue
+                if len(columns) == 0:
+                    continue

-    row = {
-        "title": title.text,
-        "season": "Jeudi",
-        "hash": "",
-        "url": f"https://www.ygg.re/engine/search?name={title.text}&description=&file=&uploader=&category=2145&sub_category=2183&option_langue:multiple[0]=4&do=search&order=asc&sort=publish_date",
-    }
+                if "multi" in columns[1].text.strip().lower():
+                    if is_valid(
+                        int(columns[2].a["target"]),
+                        parse_size(columns[5].text.strip()),
+                        int(columns[6].text.strip()),
+                    ):
+                        column = columns
+                        break

-    for torrent in torrents:
-        if title.text.lower() in torrent.name.lower():
-            row["hash"] = torrent.hashString
-            break
-
-    movies.append(title.text.lower())
-    writer.writerow(row)
+        writer.writerow(
+            {
+                "title": title,
+                "season": season,
+                "episode": episode,
+                "name": column[1].text.strip() if column else "",
+                "link": column[1].a["href"] if column else "",
+            }
+        )
--- a/poetry.lock
+++ b/poetry.lock
@ -1,5 +1,26 @@
 # This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.

+[[package]]
+name = "beautifulsoup4"
+version = "4.12.3"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.6.0"
+files = [
+    {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
+    {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
+]
+
+[package.dependencies]
+soupsieve = ">1.2"
+
+[package.extras]
+cchardet = ["cchardet"]
+chardet = ["chardet"]
+charset-normalizer = ["charset-normalizer"]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 name = "certifi"
 version = "2024.12.14"
@ -251,6 +272,17 @@ files = [
    {file = "ruff-0.8.4.tar.gz", hash = "sha256:0d5f89f254836799af1615798caa5f80b7f935d7a670fad66c5007928e57ace8"},
 ]

+[[package]]
+name = "soupsieve"
+version = "2.6"
+description = "A modern CSS selector implementation for Beautiful Soup."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"},
+    {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"},
+]
+
 [[package]]
 name = "tomli"
 version = "2.2.1"
@ -303,6 +335,31 @@ files = [
    {file = "transmissionrpc_ng-0.13.1-py3-none-any.whl", hash = "sha256:092954f0571392cd6ff498ea9b5a55de3b20f68fd919d6fe94e4f5e112a750bc"},
 ]

+[[package]]
+name = "types-beautifulsoup4"
+version = "4.12.0.20241020"
+description = "Typing stubs for beautifulsoup4"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "types-beautifulsoup4-4.12.0.20241020.tar.gz", hash = "sha256:158370d08d0cd448bd11b132a50ff5279237a5d4b5837beba074de152a513059"},
+    {file = "types_beautifulsoup4-4.12.0.20241020-py3-none-any.whl", hash = "sha256:c95e66ce15a4f5f0835f7fbc5cd886321ae8294f977c495424eaf4225307fd30"},
+]
+
+[package.dependencies]
+types-html5lib = "*"
+
+[[package]]
+name = "types-html5lib"
+version = "1.1.11.20241018"
+description = "Typing stubs for html5lib"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "types-html5lib-1.1.11.20241018.tar.gz", hash = "sha256:98042555ff78d9e3a51c77c918b1041acbb7eb6c405408d8a9e150ff5beccafa"},
+    {file = "types_html5lib-1.1.11.20241018-py3-none-any.whl", hash = "sha256:3f1e064d9ed2c289001ae6392c84c93833abb0816165c6ff0abfc304a779f403"},
+]
+
 [[package]]
 name = "types-requests"
 version = "2.32.0.20241016"
@ -348,4 +405,4 @@ zstd = ["zstandard (>=0.18.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "b1f91153e63e166bbdef0bab71c32f2b9033b0162b752b2ae569e73f6949ec87"
+content-hash = "5f000dd0b221e422f68ef075abe504a8ed6776ab8065ff8c173a072e95ad7307"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -7,12 +7,14 @@ package-mode = false

 [tool.poetry.dependencies]
 python = "^3.10"
+beautifulsoup4 = "^4.12.3"
 transmissionrpc-ng = "~0.13.1"
 requests = "^2.32.3"

 [tool.poetry.group.dev.dependencies]
 ruff = "^0.8.4"
 mypy = "^1.14.0"
+types-beautifulsoup4 = "^4.12.0"
 types-requests = "^2.32.0"

 [tool.ruff]