From 80eadb35fa05b74ced93d854ee540729d576410e Mon Sep 17 00:00:00 2001 From: Michel Roux Date: Thu, 16 Dec 2021 23:32:51 +0100 Subject: [PATCH] Use Google Bot to crawl YGG --- get404.py | 2 +- pynyaata/connectors/core.py | 11 ++++++----- pynyaata/connectors/yggtorrent.py | 4 +++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/get404.py b/get404.py index 7eda786..f183b9d 100644 --- a/get404.py +++ b/get404.py @@ -4,7 +4,7 @@ from pynyaata.models import AnimeLink links = AnimeLink.query.all() for link in links: - html = curl_content(link.link, debug=False) + html = curl_content(link.link, debug=False, cloudflare=True) if html['http_code'] != 200 and html['http_code'] != 500: print('(%d) %s %s : %s' % ( diff --git a/pynyaata/connectors/core.py b/pynyaata/connectors/core.py index e6b6fdd..4645e31 100644 --- a/pynyaata/connectors/core.py +++ b/pynyaata/connectors/core.py @@ -71,16 +71,17 @@ class Cache: ConnectorCache = Cache() -def curl_content(url, params=None, ajax=False, debug=True): +def curl_content(url, params=None, ajax=False, debug=True, cloudflare=False): output = '' http_code = 500 method = 'post' if (params is not None) else 'get' + headers = {} if ajax: - headers = {'User-Agent': 'YggRobot', - 'X-Requested-With': 'XMLHttpRequest'} - else: - headers = {'User-Agent': 'YggRobot'} + headers['X-Requested-With'] = 'XMLHttpRequest' + + if cloudflare: + headers['User-Agent'] = 'Googlebot/2.1 (+http://www.google.com/bot.html)' try: if method == 'post': diff --git a/pynyaata/connectors/yggtorrent.py b/pynyaata/connectors/yggtorrent.py index 197e439..af16c80 100644 --- a/pynyaata/connectors/yggtorrent.py +++ b/pynyaata/connectors/yggtorrent.py @@ -34,7 +34,9 @@ class YggTorrent(ConnectorCore): @ConnectorCache.cache_data def search(self): if self.category: - response = curl_content(self.get_full_search_url()) + response = curl_content( + self.get_full_search_url(), cloudflare=True + ) if response['http_code'] == 200: html = BeautifulSoup(response['output'], 'html.parser')