diff --git a/Dockerfile b/Dockerfile index 78e85ff..2c7d33f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ RUN apt-get update \ && wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \ && dpkg -i google-chrome-stable_current_amd64.deb \ && rm google-chrome-stable_current_amd64.deb \ - && pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 \ + && pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 \ && wget https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \ && unzip chromedriver_linux64.zip \ && rm chromedriver_linux64.zip \ diff --git a/spider.py b/spider.py index c6f83b1..324bc36 100644 --- a/spider.py +++ b/spider.py @@ -18,11 +18,13 @@ from urllib.parse import urlparse import requests import yaml +import tenacity from bs4 import BeautifulSoup from git import Repo from selenium import webdriver from google.cloud import datastore +from google.api_core.exceptions import Aborted from google.api_core.exceptions import InvalidArgument @@ -713,6 +715,8 @@ def check_site(entry): return result +@tenacity.retry(wait=tenacity.wait_exponential(), + retry=tenacity.retry_if_exception_type(Aborted)) def get_job_from_queue(): """ Returns a URL from the queue