Use the Googlebot User-Agent to crawl YGG

2021-12-16 23:32:51 +01:00 · 2021-12-16 23:32:51 +01:00 · 80eadb35fa
commit 80eadb35fa
parent 2a39323ce9
3 changed files with 10 additions and 7 deletions
--- a/get404.py
+++ b/get404.py
@@ -4,7 +4,7 @@ from pynyaata.models import AnimeLink
 links = AnimeLink.query.all()

 for link in links:
-    html = curl_content(link.link, debug=False)
+    html = curl_content(link.link, debug=False, cloudflare=True)

    if html['http_code'] != 200 and html['http_code'] != 500:
        print('(%d) %s %s : %s' % (
--- a/pynyaata/connectors/core.py
+++ b/pynyaata/connectors/core.py
@@ -71,16 +71,17 @@ class Cache:
 ConnectorCache = Cache()


-def curl_content(url, params=None, ajax=False, debug=True):
+def curl_content(url, params=None, ajax=False, debug=True, cloudflare=False):
    output = ''
    http_code = 500
    method = 'post' if (params is not None) else 'get'
+    headers = {}

    if ajax:
-        headers = {'User-Agent': 'YggRobot',
-                   'X-Requested-With': 'XMLHttpRequest'}
-    else:
-        headers = {'User-Agent': 'YggRobot'}
+        headers['X-Requested-With'] = 'XMLHttpRequest'
+
+    if cloudflare:
+        headers['User-Agent'] = 'Googlebot/2.1 (+http://www.google.com/bot.html)'

    try:
        if method == 'post':
--- a/pynyaata/connectors/yggtorrent.py
+++ b/pynyaata/connectors/yggtorrent.py
@@ -34,7 +34,9 @@ class YggTorrent(ConnectorCore):
    @ConnectorCache.cache_data
    def search(self):
        if self.category:
-            response = curl_content(self.get_full_search_url())
+            response = curl_content(
+                self.get_full_search_url(), cloudflare=True
+            )

            if response['http_code'] == 200:
                html = BeautifulSoup(response['output'], 'html.parser')