Merge pull request #34 from netzbegruenung/dockerize

Spider in Docker-container laufen lassen
This commit is contained in:
Marian Steinbach 2018-05-03 11:51:06 +02:00 committed by GitHub
commit f44ed61bd6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 16630 additions and 16197 deletions

6
.dockerignore Normal file
View file

@ -0,0 +1,6 @@
# Paths kept out of the Docker build context.
# The image build only adds spider.py, so none of these are needed by it.
.git
docs
secrets
temp
venv
webapp

31
Dockerfile Normal file
View file

@ -0,0 +1,31 @@
# Image that runs spider.py with Python 3, Google Chrome + chromedriver
# (unpacked to /chromedriver) and PhantomJS (/usr/local/bin/phantomjs).
FROM debian:stretch-slim

# System packages, Google Chrome and chromedriver.
# Everything is fetched, installed and cleaned up inside one layer so that the
# downloaded archives and the apt package lists never persist in the image.
# NOTE(review): --no-install-recommends is intentionally not added here; later
# steps may depend on recommended packages (e.g. CA certificates for the HTTPS
# downloads, bzip2 for `tar xjf`) - confirm before enabling it.
# NOTE(review): the Chrome .deb URL always serves the current stable release,
# while chromedriver is pinned to 2.38 - the two can drift apart over time.
RUN apt-get update \
    && apt-get install -y \
        fonts-liberation \
        git \
        gnupg \
        libappindicator3-1 \
        libasound2 \
        libatk-bridge2.0-0 \
        libatk1.0-0 \
        libcairo2 \
        libcups2 \
        libdbus-1-3 \
        libexpat1 \
        libgdk-pixbuf2.0-0 \
        libglib2.0-0 \
        libgtk-3-0 \
        libnspr4 \
        libnss3 \
        libpango-1.0-0 \
        libpangocairo-1.0-0 \
        libx11-6 \
        libx11-xcb1 \
        libxcb1 \
        libxcomposite1 \
        libxcursor1 \
        libxdamage1 \
        libxext6 \
        libxfixes3 \
        libxi6 \
        libxrandr2 \
        libxrender1 \
        libxss1 \
        libxtst6 \
        lsb-release \
        python3 \
        python3-pip \
        unzip \
        wget \
        xdg-utils \
    && wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
    && dpkg -i google-chrome-stable_current_amd64.deb \
    && rm google-chrome-stable_current_amd64.deb \
    && wget https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \
    && unzip chromedriver_linux64.zip \
    && rm chromedriver_linux64.zip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies of the spider, installed in a single step.
# --no-cache-dir keeps pip's download cache out of the layer.
RUN pip3 install --no-cache-dir \
        GitPython \
        idna \
        PyYAML \
        beautifulsoup4==4.6.0 \
        certifi==2018.1.18 \
        requests==2.18.4 \
        selenium==3.11.0 \
        smmap2==2.0.3 \
        urllib3==1.22

# PhantomJS: only the binary is kept; the tarball and the extracted directory
# are both removed in the same layer that created them.
RUN wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 \
    && tar xjf phantomjs-2.1.1-linux-x86_64.tar.bz2 \
    && mv phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin/ \
    && rm -rf phantomjs-2.1.1-linux-x86_64 phantomjs-2.1.1-linux-x86_64.tar.bz2

# The spider script is copied last so that editing it does not invalidate the
# dependency layers above.
COPY spider.py /

ENTRYPOINT ["python3", "/spider.py"]

View file

@ -2,13 +2,11 @@
.PHONY: webapp
# Python venv for running the spider locally
venv:
virtualenv -p python3 venv
venv/bin/pip3 install -r requirements.txt
spider: venv
venv/bin/python ./spider.py
# Build docker image and run spider in Docker container
spider:
docker pull debian:stretch-slim
docker build -t spider .
docker run --rm -ti -v $(PWD)/webapp/dist/data:/out spider
screenshots: venv
docker pull netzbegruenung/green-spider-screenshotter:latest

File diff suppressed because it is too large Load diff

View file

@ -37,7 +37,7 @@ green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
green_direcory_data_path = 'data/countries/de'
green_directory_local_path = './cache/green-directory'
result_path = './webapp/dist/data'
result_path = '/out'
# IP address of the newthinking GCMS server
gcms_ip = "91.102.13.20"
@ -67,7 +67,7 @@ def dir_entries():
if not filepath.endswith(".yaml"):
continue
with open(filepath, 'r') as yamlfile:
with open(filepath, 'r', encoding='utf8') as yamlfile:
for doc in yaml.load_all(yamlfile):
yield doc

File diff suppressed because it is too large Load diff