mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-04-26 06:20:06 +02:00
WIP commit
This commit is contained in:
parent
d880c09e96
commit
e54eb8f4b9
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -6,4 +6,4 @@ __pycache__
|
|||
.vscode/settings.json
|
||||
kubernetes/green-spider-secret.yaml
|
||||
/volumes
|
||||
/screenshots
|
||||
/screenshots
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
FROM python:3.7-alpine3.12
|
||||
FROM python:3.7-alpine3.13
|
||||
|
||||
WORKDIR /workdir
|
||||
|
||||
|
@ -6,10 +6,10 @@ ADD requirements.txt /workdir/
|
|||
|
||||
RUN echo "foobar"
|
||||
|
||||
RUN echo "http://dl-4.alpinelinux.org/alpine/v3.12/main" >> /etc/apk/repositories && \
|
||||
echo "http://dl-4.alpinelinux.org/alpine/v3.12/community" >> /etc/apk/repositories && \
|
||||
RUN echo "http://dl-4.alpinelinux.org/alpine/v3.13/main" >> /etc/apk/repositories && \
|
||||
echo "http://dl-4.alpinelinux.org/alpine/v3.13/community" >> /etc/apk/repositories && \
|
||||
apk update && \
|
||||
apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
|
||||
apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \
|
||||
pip3 install --upgrade pip && \
|
||||
pip3 install -r requirements.txt && \
|
||||
apk del python3-dev build-base
|
||||
|
|
4
cli.py
4
cli.py
|
@ -19,7 +19,7 @@ def handle_sigint(signum, frame):
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
signal.signal(signal.SIGINT,handle_sigint)
|
||||
signal.signal(signal.SIGINT, handle_sigint)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
|
@ -73,7 +73,7 @@ if __name__ == "__main__":
|
|||
if args.command == 'manager':
|
||||
|
||||
import manager
|
||||
manager.create_jobs(datastore_client, args.url)
|
||||
manager.create_jobs(args.url)
|
||||
|
||||
elif args.command == 'export':
|
||||
|
||||
|
|
|
@ -1,16 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Log in to webapp server via SSH
|
||||
|
||||
API_TOKEN_SECRET="secrets/hetzner-api-token.sh"
|
||||
test -f $API_TOKEN_SECRET || { echo >&2 "File $API_TOKEN_SECRET does not exist."; exit 1; }
|
||||
source $API_TOKEN_SECRET
|
||||
|
||||
source devops/functions.bash
|
||||
|
||||
get_ip
|
||||
|
||||
echo "Use this command for SSH access:"
|
||||
echo "ssh -o StrictHostKeyChecking=no root@${IP_IP}"
|
||||
|
||||
ssh -o StrictHostKeyChecking=no root@${IP_IP}
|
|
@ -31,7 +31,7 @@ services:
|
|||
- redis
|
||||
|
||||
dashboard:
|
||||
image: eoranged/rq-dashboard
|
||||
image: eoranged/rq-dashboard:v0.6.1
|
||||
environment:
|
||||
RQ_DASHBOARD_REDIS_URL: redis://redis:6379/0
|
||||
networks:
|
||||
|
|
3
job.py
3
job.py
|
@ -1,9 +1,8 @@
|
|||
"""
|
||||
Dieses Script wird vom RQ Worker ausgeführt, um einen einzelnen Job aus der
|
||||
Dieses Script wird vom RQ worker ausgeführt, um einen einzelnen Job aus der
|
||||
Spider-Warteschlange abzuarbeiten.
|
||||
"""
|
||||
|
||||
from pprint import pprint
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
|
67
kubernetes/job-example.yaml
Normal file
67
kubernetes/job-example.yaml
Normal file
|
@ -0,0 +1,67 @@
|
|||
---
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: green-spider-job-1
|
||||
namespace: marian
|
||||
labels:
|
||||
app: green-spider
|
||||
spec:
|
||||
activeDeadlineSeconds: 120
|
||||
ttlSecondsAfterFinished: 600
|
||||
completions: 1
|
||||
backoffLimit: 3
|
||||
|
||||
# Pod template
|
||||
template:
|
||||
metadata:
|
||||
name: green-spider-job
|
||||
namespace: marian
|
||||
labels:
|
||||
app: green-spider
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
nodeSelector:
|
||||
giantswarm.io/machine-pool: 5n27k
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
- labelSelector:
|
||||
matchExpressions:
|
||||
- key: app
|
||||
operator: In
|
||||
values:
|
||||
- green-spider
|
||||
topologyKey: topology.kubernetes.io/region
|
||||
containers:
|
||||
- name: spider
|
||||
image: quay.io/netzbegruenung/green-spider:kubernetes
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- python
|
||||
- cli.py
|
||||
- --credentials-path=/secrets/datastore-writer.json
|
||||
- --loglevel=debug
|
||||
- spider
|
||||
- '--job={"url":"https://www.gruene.de/","type":"PARTY","level":"DE:BUNDESVERBAND","state":null,"district":null,"city":null}'
|
||||
volumeMounts:
|
||||
- name: secrets
|
||||
mountPath: "/secrets"
|
||||
readOnly: true
|
||||
- name: shared
|
||||
mountPath: /dev/shm
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 5000M
|
||||
volumes:
|
||||
- name: secrets
|
||||
secret:
|
||||
secretName: green-spider
|
||||
items:
|
||||
- key: datastore-writer.json
|
||||
path: datastore-writer.json
|
||||
- key: screenshots-uploader.json
|
||||
path: screenshots-uploader.json
|
||||
- name: shared
|
||||
emptyDir: {}
|
18
kubernetes/psp.yaml
Normal file
18
kubernetes/psp.yaml
Normal file
|
@ -0,0 +1,18 @@
|
|||
apiVersion: policy/v1beta1
|
||||
kind: PodSecurityPolicy
|
||||
metadata:
|
||||
name: green-spider-job-psp
|
||||
namespace: marian
|
||||
spec:
|
||||
privileged: false
|
||||
seLinux:
|
||||
rule: RunAsAny
|
||||
supplementalGroups:
|
||||
rule: RunAsAny
|
||||
runAsUser:
|
||||
rule: RunAsAny
|
||||
fsGroup:
|
||||
rule: RunAsAny
|
||||
volumes:
|
||||
- emptyDir
|
||||
- secret
|
|
@ -1,5 +1,5 @@
|
|||
"""
|
||||
The manager module allows to fill the job queue.
|
||||
The manager module allows to fill the RQ job queue.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
|
@ -8,16 +8,12 @@ import os
|
|||
import random
|
||||
import shutil
|
||||
import time
|
||||
import json
|
||||
import sys
|
||||
|
||||
from git import Repo
|
||||
from google.api_core.exceptions import Aborted
|
||||
from google.cloud import datastore
|
||||
|
||||
from rq import Queue
|
||||
from rq.job import Job
|
||||
|
||||
import redis
|
||||
import tenacity
|
||||
import yaml
|
||||
from yaml import Loader
|
||||
|
||||
|
@ -61,7 +57,7 @@ def chunks(the_list, size):
|
|||
yield the_list[i:i + size]
|
||||
|
||||
|
||||
def create_jobs(datastore_client, url=None):
|
||||
def create_jobs(url=None):
|
||||
"""
|
||||
Read all URLs from green directory and fill a job database
|
||||
with one job per URL.
|
||||
|
@ -143,13 +139,17 @@ def create_jobs(datastore_client, url=None):
|
|||
|
||||
for entry in input_entries:
|
||||
try:
|
||||
job = queue.enqueue('job.run',
|
||||
_ = queue.enqueue('job.run',
|
||||
job_timeout='300s',
|
||||
at_front=random.choice([True, False]),
|
||||
# keywords args passes on the job function
|
||||
kwargs={
|
||||
'job': entry,
|
||||
})
|
||||
|
||||
# Print job for debugging purposes
|
||||
print(json.dumps(entry))
|
||||
|
||||
#logging.debug("Added job with ID %s for URL %s" % (enqueued_job.id, entry['url']))
|
||||
count += 1
|
||||
except Exception as e:
|
||||
|
|
|
@ -1,46 +1,46 @@
|
|||
beautifulsoup4==4.9.3
|
||||
cachetools==4.2.0
|
||||
cachetools==4.2.2
|
||||
certifi==2020.12.5
|
||||
cffi==1.14.4
|
||||
cffi==1.14.5
|
||||
chardet==3.0.4
|
||||
click==7.1.2
|
||||
cryptography==3.3.1
|
||||
cryptography==3.4.7
|
||||
cssselect==1.1.0
|
||||
dnspython==1.16.0
|
||||
docker==4.4.1
|
||||
feedparser==5.2.1
|
||||
gitdb==4.0.5
|
||||
GitPython==3.1.11
|
||||
google-api-core==1.24.1
|
||||
google-auth==1.24.0
|
||||
google-cloud-core==1.5.0
|
||||
gitdb==4.0.7
|
||||
GitPython==3.1.14
|
||||
google-api-core==1.26.3
|
||||
google-auth==1.30.0
|
||||
google-cloud-core==1.6.0
|
||||
google-cloud-datastore==1.15.3
|
||||
google-cloud-storage==1.35.0
|
||||
googleapis-common-protos==1.52.0
|
||||
grpcio==1.34.0
|
||||
google-cloud-storage==1.38.0
|
||||
googleapis-common-protos==1.53.0
|
||||
grpcio==1.37.1
|
||||
html-similarity==0.3.3
|
||||
httpretty==0.9.7
|
||||
idna==2.10
|
||||
lxml==4.6.2
|
||||
lxml==4.6.3
|
||||
parsel==1.6.0
|
||||
protobuf==3.14.0
|
||||
protobuf==3.15.8
|
||||
pyasn1==0.4.8
|
||||
pyasn1-modules==0.2.8
|
||||
pycparser==2.20
|
||||
pyOpenSSL==20.0.1
|
||||
pytz==2020.5
|
||||
PyYAML==5.3.1
|
||||
pytz==2021.1
|
||||
PyYAML==5.4.1
|
||||
redis==3.5.3
|
||||
requests==2.25.1
|
||||
responses==0.12.1
|
||||
rq==1.7.0
|
||||
rsa==4.6
|
||||
responses==0.13.3
|
||||
rq==1.8.0
|
||||
rsa==4.7.2
|
||||
selenium==3.141.0
|
||||
six==1.15.0
|
||||
six==1.16.0
|
||||
smmap==3.0.4
|
||||
smmap2==2.0.5
|
||||
soupsieve==2.1
|
||||
soupsieve==2.2.1
|
||||
tenacity==5.1.5
|
||||
urllib3==1.26.2
|
||||
urllib3==1.26.4
|
||||
w3lib==1.22.0
|
||||
websocket-client==0.57.0
|
||||
websocket-client==0.59.0
|
||||
|
|
Loading…
Reference in a new issue