WIP commit

This commit is contained in:
Marian Steinbach 2021-05-23 18:05:24 +02:00
parent d880c09e96
commit e54eb8f4b9
10 changed files with 125 additions and 57 deletions

2
.gitignore vendored
View file

@ -6,4 +6,4 @@ __pycache__
.vscode/settings.json
kubernetes/green-spider-secret.yaml
/volumes
/screenshots
/screenshots

View file

@ -1,4 +1,4 @@
FROM python:3.7-alpine3.12
FROM python:3.7-alpine3.13
WORKDIR /workdir
@ -6,10 +6,10 @@ ADD requirements.txt /workdir/
RUN echo "foobar"
RUN echo "http://dl-4.alpinelinux.org/alpine/v3.12/main" >> /etc/apk/repositories && \
echo "http://dl-4.alpinelinux.org/alpine/v3.12/community" >> /etc/apk/repositories && \
RUN echo "http://dl-4.alpinelinux.org/alpine/v3.13/main" >> /etc/apk/repositories && \
echo "http://dl-4.alpinelinux.org/alpine/v3.13/community" >> /etc/apk/repositories && \
apk update && \
apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \
pip3 install --upgrade pip && \
pip3 install -r requirements.txt && \
apk del python3-dev build-base

4
cli.py
View file

@ -19,7 +19,7 @@ def handle_sigint(signum, frame):
if __name__ == "__main__":
signal.signal(signal.SIGINT,handle_sigint)
signal.signal(signal.SIGINT, handle_sigint)
parser = argparse.ArgumentParser()
@ -73,7 +73,7 @@ if __name__ == "__main__":
if args.command == 'manager':
import manager
manager.create_jobs(datastore_client, args.url)
manager.create_jobs(args.url)
elif args.command == 'export':

View file

@ -1,16 +0,0 @@
#!/bin/bash
# Log in to the webapp server via SSH.
#
# Sources the secrets file secrets/hetzner-api-token.sh and the shared helper
# functions in devops/functions.bash, calls get_ip, then opens an SSH session
# as root to the address held in IP_IP.
# NOTE(review): IP_IP is presumably set by get_ip (devops/functions.bash) -- confirm.

API_TOKEN_SECRET="secrets/hetzner-api-token.sh"

# Abort early with a message on stderr when the secrets file is missing.
# Expansions are quoted so that a path containing spaces or glob characters
# is treated as a single word.
test -f "$API_TOKEN_SECRET" || { echo >&2 "File $API_TOKEN_SECRET does not exist."; exit 1; }
source "$API_TOKEN_SECRET"

source devops/functions.bash

get_ip

# Print the exact command first so it can be copied for later sessions.
echo "Use this command for SSH access:"
echo "ssh -o StrictHostKeyChecking=no root@${IP_IP}"

# NOTE(review): StrictHostKeyChecking=no disables host key verification --
# presumably because the server IP is reassigned on re-creation; confirm.
ssh -o StrictHostKeyChecking=no "root@${IP_IP}"

View file

@ -31,7 +31,7 @@ services:
- redis
dashboard:
image: eoranged/rq-dashboard
image: eoranged/rq-dashboard:v0.6.1
environment:
RQ_DASHBOARD_REDIS_URL: redis://redis:6379/0
networks:

3
job.py
View file

@ -1,9 +1,8 @@
"""
Dieses Script wird vom RQ Worker ausgeführt, um einen einzelnen Job aus der
Dieses Script wird vom RQ worker ausgeführt, um einen einzelnen Job aus der
Spider-Warteschlange abzuarbeiten.
"""
from pprint import pprint
import json
import os
from datetime import datetime

View file

@ -0,0 +1,67 @@
---
apiVersion: batch/v1
kind: Job
metadata:
name: green-spider-job-1
namespace: marian
labels:
app: green-spider
spec:
activeDeadlineSeconds: 120
ttlSecondsAfterFinished: 600
completions: 1
backoffLimit: 3
# Pod template
template:
metadata:
name: green-spider-job
namespace: marian
labels:
app: green-spider
spec:
restartPolicy: Never
nodeSelector:
giantswarm.io/machine-pool: 5n27k
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- green-spider
topologyKey: topology.kubernetes.io/region
containers:
- name: spider
image: quay.io/netzbegruenung/green-spider:kubernetes
imagePullPolicy: IfNotPresent
command:
- python
- cli.py
- --credentials-path=/secrets/datastore-writer.json
- --loglevel=debug
- spider
- '--job={"url":"https://www.gruene.de/","type":"PARTY","level":"DE:BUNDESVERBAND","state":null,"district":null,"city":null}'
volumeMounts:
- name: secrets
mountPath: "/secrets"
readOnly: true
- name: shared
mountPath: /dev/shm
resources:
requests:
cpu: 1000m
memory: 5000M
volumes:
- name: secrets
secret:
secretName: green-spider
items:
- key: datastore-writer.json
path: datastore-writer.json
- key: screenshots-uploader.json
path: screenshots-uploader.json
- name: shared
emptyDir: {}

18
kubernetes/psp.yaml Normal file
View file

@ -0,0 +1,18 @@
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
name: green-spider-job-psp
namespace: marian
spec:
privileged: false
seLinux:
rule: RunAsAny
supplementalGroups:
rule: RunAsAny
runAsUser:
rule: RunAsAny
fsGroup:
rule: RunAsAny
volumes:
- emptyDir
- secret

View file

@ -1,5 +1,5 @@
"""
The manager module allows to fill the job queue.
The manager module fills the RQ job queue.
"""
from datetime import datetime
@ -8,16 +8,12 @@ import os
import random
import shutil
import time
import json
import sys
from git import Repo
from google.api_core.exceptions import Aborted
from google.cloud import datastore
from rq import Queue
from rq.job import Job
import redis
import tenacity
import yaml
from yaml import Loader
@ -61,7 +57,7 @@ def chunks(the_list, size):
yield the_list[i:i + size]
def create_jobs(datastore_client, url=None):
def create_jobs(url=None):
"""
Read all URLs from green directory and fill a job database
with one job per URL.
@ -143,13 +139,17 @@ def create_jobs(datastore_client, url=None):
for entry in input_entries:
try:
job = queue.enqueue('job.run',
_ = queue.enqueue('job.run',
job_timeout='300s',
at_front=random.choice([True, False]),
# keyword args passed on to the job function
kwargs={
'job': entry,
})
# Print job for debugging purposes
print(json.dumps(entry))
#logging.debug("Added job with ID %s for URL %s" % (enqueued_job.id, entry['url']))
count += 1
except Exception as e:

View file

@ -1,46 +1,46 @@
beautifulsoup4==4.9.3
cachetools==4.2.0
cachetools==4.2.2
certifi==2020.12.5
cffi==1.14.4
cffi==1.14.5
chardet==3.0.4
click==7.1.2
cryptography==3.3.1
cryptography==3.4.7
cssselect==1.1.0
dnspython==1.16.0
docker==4.4.1
feedparser==5.2.1
gitdb==4.0.5
GitPython==3.1.11
google-api-core==1.24.1
google-auth==1.24.0
google-cloud-core==1.5.0
gitdb==4.0.7
GitPython==3.1.14
google-api-core==1.26.3
google-auth==1.30.0
google-cloud-core==1.6.0
google-cloud-datastore==1.15.3
google-cloud-storage==1.35.0
googleapis-common-protos==1.52.0
grpcio==1.34.0
google-cloud-storage==1.38.0
googleapis-common-protos==1.53.0
grpcio==1.37.1
html-similarity==0.3.3
httpretty==0.9.7
idna==2.10
lxml==4.6.2
lxml==4.6.3
parsel==1.6.0
protobuf==3.14.0
protobuf==3.15.8
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.20
pyOpenSSL==20.0.1
pytz==2020.5
PyYAML==5.3.1
pytz==2021.1
PyYAML==5.4.1
redis==3.5.3
requests==2.25.1
responses==0.12.1
rq==1.7.0
rsa==4.6
responses==0.13.3
rq==1.8.0
rsa==4.7.2
selenium==3.141.0
six==1.15.0
six==1.16.0
smmap==3.0.4
smmap2==2.0.5
soupsieve==2.1
soupsieve==2.2.1
tenacity==5.1.5
urllib3==1.26.2
urllib3==1.26.4
w3lib==1.22.0
websocket-client==0.57.0
websocket-client==0.59.0