mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-10 12:36:07 +02:00
Merge pull request #67 from netzbegruenung/remove-phantomjs
Replace PhantomJS with Chromedriver
This commit is contained in:
commit
634ffa4a23
32
Dockerfile
32
Dockerfile
|
@@ -1,27 +1,13 @@
|
||||||
FROM debian:stretch-slim
|
FROM python:3.6-alpine3.7
|
||||||
|
|
||||||
RUN apt-get update \
|
# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
|
||||||
&& apt-get install -y git wget gnupg fonts-liberation libappindicator3-1 \
|
RUN echo "http://dl-4.alpinelinux.org/alpine/v3.7/main" >> /etc/apk/repositories && \
|
||||||
libasound2 libatk-bridge2.0-0 libatk1.0-0 libcairo2 libcups2 libdbus-1-3 \
|
echo "http://dl-4.alpinelinux.org/alpine/v3.7/community" >> /etc/apk/repositories && \
|
||||||
libexpat1 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libnss3 \
|
apk update && \
|
||||||
libpango-1.0-0 libpangocairo-1.0-0 libx11-6 libx11-xcb1 libxcb1 \
|
apk --no-cache add chromium chromium-chromedriver python3-dev build-base git && \
|
||||||
libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 \
|
pip3 install --upgrade pip && \
|
||||||
libxrandr2 libxrender1 libxss1 libxtst6 lsb-release xdg-utils \
|
pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
|
||||||
python3 python3-pip unzip \
|
apk del python3-dev build-base
|
||||||
&& apt-get clean \
|
|
||||||
&& wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
|
|
||||||
&& dpkg -i google-chrome-stable_current_amd64.deb \
|
|
||||||
&& rm google-chrome-stable_current_amd64.deb \
|
|
||||||
&& pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 \
|
|
||||||
&& wget https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \
|
|
||||||
&& unzip chromedriver_linux64.zip \
|
|
||||||
&& rm chromedriver_linux64.zip \
|
|
||||||
&& apt-get clean
|
|
||||||
|
|
||||||
RUN wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 \
|
|
||||||
&& tar xjf phantomjs-2.1.1-linux-x86_64.tar.bz2 \
|
|
||||||
&& mv phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin/ \
|
|
||||||
&& rm -rf phantomjs-2.1.1-linux-x86_64
|
|
||||||
|
|
||||||
ADD spider.py /
|
ADD spider.py /
|
||||||
ADD spider_test.py /
|
ADD spider_test.py /
|
||||||
|
|
4
Makefile
4
Makefile
|
@@ -4,7 +4,6 @@
|
||||||
|
|
||||||
# Build docker image
|
# Build docker image
|
||||||
dockerimage:
|
dockerimage:
|
||||||
docker pull debian:stretch-slim
|
|
||||||
docker build -t spider .
|
docker build -t spider .
|
||||||
|
|
||||||
# Create spider job queue
|
# Create spider job queue
|
||||||
|
@@ -19,11 +18,12 @@ spiderjobs: dockerimage
|
||||||
# Run spider in docker image
|
# Run spider in docker image
|
||||||
spider: dockerimage
|
spider: dockerimage
|
||||||
docker run --rm -ti \
|
docker run --rm -ti \
|
||||||
|
-v $(PWD)/dev-shm:/dev/shm \
|
||||||
-v $(PWD)/webapp/dist/data:/out \
|
-v $(PWD)/webapp/dist/data:/out \
|
||||||
-v $(PWD)/secrets:/secrets \
|
-v $(PWD)/secrets:/secrets \
|
||||||
spider spider.py \
|
spider spider.py \
|
||||||
--credentials-path /secrets/datastore-writer.json \
|
--credentials-path /secrets/datastore-writer.json \
|
||||||
--loglevel debug \
|
--loglevel info \
|
||||||
spider
|
spider
|
||||||
|
|
||||||
# run spider tests
|
# run spider tests
|
||||||
|
|
12
spider.py
12
spider.py
|
@@ -10,6 +10,7 @@ import random
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import statistics
|
import statistics
|
||||||
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from socket import gethostbyname_ex
|
from socket import gethostbyname_ex
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
@@ -263,10 +264,17 @@ def check_responsiveness(url):
|
||||||
(1920, 1080), # Full HD horizontal
|
(1920, 1080), # Full HD horizontal
|
||||||
)
|
)
|
||||||
|
|
||||||
# Our selenium user agent using PhantomJS/Webkit as an engine
|
# Our selenium user agent using Chrome headless as an engine
|
||||||
driver = webdriver.PhantomJS()
|
chrome_options = webdriver.ChromeOptions()
|
||||||
|
chrome_options.add_argument('--headless')
|
||||||
|
chrome_options.add_argument('--disable-gpu')
|
||||||
|
chrome_options.add_argument('--no-sandbox')
|
||||||
|
chrome_options.add_argument('--disable-extensions')
|
||||||
|
driver = webdriver.Chrome(chrome_options=chrome_options)
|
||||||
|
driver.set_page_load_timeout(60)
|
||||||
driver.set_window_size(sizes[0][0], sizes[0][1])
|
driver.set_window_size(sizes[0][0], sizes[0][1])
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
for (width, height) in sizes:
|
for (width, height) in sizes:
|
||||||
driver.set_window_size(width, height)
|
driver.set_window_size(width, height)
|
||||||
|
|
Loading…
Reference in a new issue