Test to replace phantomjs by something lighter

This commit is contained in:
Michel Roux 2020-04-06 15:09:27 +02:00
parent 409381b5db
commit c9536f0216
4 changed files with 23 additions and 87 deletions

View File

@ -3,10 +3,11 @@ FROM debian
ENV DEBIAN_FRONTEND noninteractive
ENV LANG C.UTF-8
RUN apt-get update && apt-get -y upgrade && \
apt-get -y install python3 python3-pip locales \
RUN curl -sSL https://deb.nodesource.com/setup_12.x | bash - && \
apt-get -y upgrade && \
apt-get -y install python3 python3-pip locales nodejs \
python3-flask python3-flask-sqlalchemy python3-flask-httpauth python3-flaskext.wtf \
python3-pymysql python3-requests python3-bs4 python3-dotenv && \
apt-get -y --no-install-recommends install phantomjs && \
pip3 install cfscrape && \
printf "en_US.UTF-8 UTF-8\nfr_FR.UTF-8 UTF-8\n" > /etc/locale.gen && \
locale-gen && rm -rf /var/lib/apt/lists/*

View File

@ -5,12 +5,10 @@ from datetime import datetime, timedelta
from enum import Enum
from functools import wraps
from logging import getLogger
from subprocess import run
from sys import platform
from urllib.parse import quote
import requests
from bs4 import BeautifulSoup
from cfscrape import create_scraper
from config import IS_DEBUG, CACHE_TIMEOUT, BLACKLIST_WORDS
from models import AnimeLink
@ -138,37 +136,26 @@ class Connector(ABC):
return self
def curl_content(self, url, params=None, ajax=False):
if self.is_behind_cloudflare:
try:
qt_env = {'QT_QPA_PLATFORM': 'offscreen'} if platform == 'linux' else {}
qt_output = run('phantomjs --cookies-file=/tmp/cookies.json delay.js "%s" 5000' % url, env=qt_env,
shell=True, check=True, capture_output=True, timeout=7000)
output = qt_output.stdout
http_code = 200
except Exception as e:
output = ''
http_code = 500
if IS_DEBUG:
getLogger().exception(e)
scraper = create_scraper()
if ajax:
headers = {'X-Requested-With': 'XMLHttpRequest'}
else:
if ajax:
headers = {'X-Requested-With': 'XMLHttpRequest'}
headers = {}
try:
if params is not None:
response = scraper.post(url, params, timeout=5, headers=headers)
else:
headers = {}
response = scraper.get(url, timeout=5, headers=headers)
try:
if params is not None:
response = requests.post(url, params, timeout=5, headers=headers)
else:
response = requests.get(url, timeout=5, headers=headers)
output = response.text
http_code = response.status_code
except requests.Timeout as e:
output = ''
http_code = 500
if IS_DEBUG:
getLogger().exception(e)
output = response.text
http_code = response.status_code
except Exception as e:
output = ''
http_code = 500
if IS_DEBUG:
getLogger().exception(e)
return {'http_code': http_code, 'output': output}

View File

@ -1,53 +0,0 @@
// https://stackoverflow.com/a/41017165
//
// PhantomJS script: opens a URL and prints the rendered page HTML to stdout.
//
// Usage: phantomjs delay.js URL delay
//   URL   - address to open
//   delay - time in milliseconds to wait before giving a final verdict
//
// Every received resource updates three flags based on its HTTP status:
//   200 -> canShow, 429 -> mustQuit, 503 -> underAttack.
// Once the page has loaded:
//   * after 1 ms:  exit 429 if a 429 was seen; otherwise, if no 503 was seen
//                  and a 200 was, print the content and exit 0;
//   * after delay: print the content and exit 0 if both a 503 and a 200 were
//                  seen; otherwise exit 503.
// Bad arguments or a failed page load exit with status 1.
//
// NOTE(review): exit statuses above 255 are usually truncated by the OS --
// confirm that callers only test for a non-zero status.
// NOTE(review): kept in ES5 style (var, function expressions) to match the
// original script; verify PhantomJS engine support before modernizing.
"use strict";
var page = require('webpage').create(),
    system = require('system'),
    mustQuit = false,
    canShow = false,
    underAttack = false,
    address, delay;

// Dump the current page HTML to stdout and terminate successfully.
function printContentAndExit() {
    console.log(page.content);
    phantom.exit();
}

// Report incorrect invocation and terminate with a failure status.
function usageAndExit() {
    console.log('Usage: delay.js URL delay');
    phantom.exit(1);
}

if (system.args.length < 3 || system.args.length > 5) {
    usageAndExit();
} else {
    address = system.args[1];
    // Parse the delay explicitly instead of handing a raw string to
    // setTimeout, so a malformed value is reported rather than silently
    // turning into an immediate timeout.
    delay = parseInt(system.args[2], 10);

    if (isNaN(delay) || delay < 0) {
        usageAndExit();
    } else {
        // Register the status tracker before opening the page so that no
        // response can be missed.
        page.onResourceReceived = function (response) {
            switch (response.status) {
                case 200:
                    canShow = true;
                    break;
                case 429:
                    mustQuit = true;
                    break;
                case 503:
                    underAttack = true;
                    break;
            }
        };

        page.open(address, function (status) {
            if (status !== 'success') {
                phantom.exit(1);
                return;
            }

            // Final verdict once the delay has elapsed: content is only
            // printed if a 503 was followed by a successful response.
            window.setTimeout(function () {
                if (underAttack && canShow) {
                    printContentAndExit();
                } else {
                    phantom.exit(503);
                }
            }, delay);

            // Fast path right after load: bail out on 429, or print
            // immediately when no 503 was ever seen.
            window.setTimeout(function () {
                if (mustQuit) {
                    phantom.exit(429);
                } else if (!underAttack && canShow) {
                    printContentAndExit();
                }
            }, 1);
        });
    }
}

View File

@ -8,3 +8,4 @@ requests==2.21.0
beautifulsoup4==4.7.1
python-dotenv==0.9.1
Werkzeug==0.14.1
cfscrape