From bd1c71f6626e65f5579c04b9833219fc03637b49 Mon Sep 17 00:00:00 2001 From: LxL Date: Sat, 3 Aug 2019 18:49:28 +0800 Subject: [PATCH] Support customizing DATA_PATH (#78) --- .circleci/config.yml | 75 +++++++++++-------- scrapydweb/default_settings.py | 8 +- scrapydweb/templates/scrapydweb/settings.html | 4 + scrapydweb/utils/check_app_config.py | 1 + scrapydweb/utils/setup_database.py | 2 +- scrapydweb/vars.py | 16 +++- scrapydweb/views/baseview.py | 6 +- scrapydweb/views/system/settings.py | 1 + tests/test_database.py | 12 --- tests/test_system.py | 23 ++++++ 10 files changed, 98 insertions(+), 50 deletions(-) delete mode 100644 tests/test_database.py create mode 100644 tests/test_system.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 01dd67f..933ddfa 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -22,13 +22,16 @@ jobs: use-git: type: boolean default: false - use-mysql: + set-data-path: + type: boolean + default: false + use-sqlite: type: boolean default: false use-postgresql: type: boolean default: false - use-sqlite: + use-mysql: type: boolean default: false steps: @@ -60,6 +63,35 @@ jobs: command: | python3 -m venv venv + - when: + condition: <<parameters.set-data-path>> + steps: + - run: + name: Set DATABASE_URL to sqlite + command: | + printf "\nDATA_PATH = '"$DATA_PATH"'\n" >> scrapydweb_settings_v8.py + - when: + condition: <<parameters.use-sqlite>> + steps: + - run: + name: Set DATABASE_URL to sqlite + command: | + printf "\nDATABASE_URL = '"$DATABASE_URL"'\n" >> scrapydweb_settings_v8.py + - when: + condition: <<parameters.use-postgresql>> + steps: + - run: + name: Setup PSQL Databases + command: | + # https://discuss.circleci.com/t/multiple-postgres-databases-in-circleci-2-0/23089 + # createdb: could not connect to database template1: FATAL: role "circleci" does not exist + # sudo apt install -y postgresql-client + # createdb -h localhost scrapydweb_apscheduler -O circleci + - run: + name: Set DATABASE_URL to postgresql + command: | + # postgres://circleci@127.0.0.1:5432 + printf 
"\nDATABASE_URL = '"$DATABASE_URL"'\n" >> scrapydweb_settings_v8.py - when: condition: <> steps: @@ -90,31 +122,7 @@ jobs: command: | # mysql://user:passw0rd@127.0.0.1:3306 printf "\nDATABASE_URL = '"$DATABASE_URL"'\n" >> scrapydweb_settings_v8.py - cat scrapydweb_settings_v8.py - - when: - condition: <<parameters.use-postgresql>> - steps: - - run: - name: Setup PSQL Databases - command: | - # https://discuss.circleci.com/t/multiple-postgres-databases-in-circleci-2-0/23089 - # createdb: could not connect to database template1: FATAL: role "circleci" does not exist - # sudo apt install -y postgresql-client - # createdb -h localhost scrapydweb_apscheduler -O circleci - - run: - name: Set DATABASE_URL to postgresql - command: | - # postgres://circleci@127.0.0.1:5432 - printf "\nDATABASE_URL = '"$DATABASE_URL"'\n" >> scrapydweb_settings_v8.py - cat scrapydweb_settings_v8.py - - when: - condition: <<parameters.use-sqlite>> - steps: - - run: - name: Set DATABASE_URL to sqlite - command: | - printf "\nDATABASE_URL = '"$DATABASE_URL"'\n" >> scrapydweb_settings_v8.py - cat scrapydweb_settings_v8.py + - run: name: Install dependencies command: | @@ -160,13 +168,16 @@ jobs: - run: name: Generate report command: | + touch scrapydweb_settings_v8.py + cat scrapydweb_settings_v8.py + echo $DATA_PATH echo $DATABASE_URL . 
venv/bin/activate coverage report coverage html coverage xml - coveralls ls -la + coveralls - store_artifacts: path: htmlcov - store_artifacts: @@ -184,6 +195,7 @@ jobs: - image: circleci/python:2.7 environment: SCRAPYDWEB_TESTMODE: True + DATA_PATH: '/home/circleci/repo/scrapydweb_data' DATABASE_URL: 'sqlite:////home/circleci/repo/scrapydweb_database' py27-postgresql: <<: *test-template @@ -224,6 +236,7 @@ jobs: - image: circleci/python:3.6 environment: SCRAPYDWEB_TESTMODE: True + DATA_PATH: '/home/circleci/repo/scrapydweb_data' DATABASE_URL: 'sqlite:////home/circleci/repo/scrapydweb_database' py37-git-postgresql: <<: *test-template @@ -266,6 +279,7 @@ workflows: is-py27: true - py27-sqlite: is-py27: true + set-data-path: true use-sqlite: true - py27-postgresql: is-py27: true @@ -274,9 +288,10 @@ workflows: is-py27: true use-mysql: true - - py36-sqlite: - use-postgresql: true - py37 + - py36-sqlite: + set-data-path: true + use-sqlite: true - py37-git-postgresql: use-git: true use-postgresql: true diff --git a/scrapydweb/default_settings.py b/scrapydweb/default_settings.py index f9ddf4e..8933785 100644 --- a/scrapydweb/default_settings.py +++ b/scrapydweb/default_settings.py @@ -309,13 +309,17 @@ # for getting more information about how ScrapydWeb works, especially while debugging. VERBOSE = False -# The default is '', which means saving data of Jobs and Timer Tasks in the Python directory using SQLite. +# The default is '', which means saving all program data in the Python directory. +# e.g. 'C:/Users/username/scrapydweb_data' or '/home/username/scrapydweb_data' +DATA_PATH = '' + +# The default is '', which means saving data of Jobs and Timer Tasks in DATA_PATH using SQLite. # The data could be also saved in MySQL or PostgreSQL backend in order to improve concurrency. # To use MySQL backend, run command: pip install --upgrade pymysql # To use PostgreSQL backend, run command: pip install --upgrade psycopg2 # e.g. 
# 'mysql://username:password@127.0.0.1:3306' # 'postgres://username:password@127.0.0.1:5432' -# 'sqlite:///c:/Users/username' +# 'sqlite:///C:/Users/username' # 'sqlite:////home/username' DATABASE_URL = '' diff --git a/scrapydweb/templates/scrapydweb/settings.html b/scrapydweb/templates/scrapydweb/settings.html index 542d620..7ab38f4 100644 --- a/scrapydweb/templates/scrapydweb/settings.html +++ b/scrapydweb/templates/scrapydweb/settings.html @@ -152,6 +152,10 @@

System

  • DEBUG = {{ DEBUG }}

  • VERBOSE = {{ VERBOSE }}

  • +
  • +

    DATA_PATH

    +
    {{ DATA_PATH }}
    +
  • DATABASE

    {{ database_details }}
    diff --git a/scrapydweb/utils/check_app_config.py b/scrapydweb/utils/check_app_config.py index e4c8f09..a3521a7 100644 --- a/scrapydweb/utils/check_app_config.py +++ b/scrapydweb/utils/check_app_config.py @@ -250,6 +250,7 @@ def check_assert(key, default, is_instance, allow_zero=True, non_empty=False, co # logging.getLogger('apscheduler').setLevel(logging.DEBUG) # else: # logging.getLogger('apscheduler').setLevel(logging.WARNING) + check_assert('DATA_PATH', '', str) check_assert('DATABASE_URL', '', str) database_url = config.get('DATABASE_URL', '') if database_url: diff --git a/scrapydweb/utils/setup_database.py b/scrapydweb/utils/setup_database.py index 2fab24e..ab7361c 100644 --- a/scrapydweb/utils/setup_database.py +++ b/scrapydweb/utils/setup_database.py @@ -40,7 +40,7 @@ def setup_database(database_url, database_path): database_path = os.path.abspath(database_path) database_path = re.sub(r'\\', '/', database_path) database_path = re.sub(r'/$', '', database_path) - if not os.path.exists(database_path): + if not os.path.isdir(database_path): os.mkdir(database_path) if m_mysql or m_postgres: diff --git a/scrapydweb/vars.py b/scrapydweb/vars.py index 4768d2c..63544d0 100644 --- a/scrapydweb/vars.py +++ b/scrapydweb/vars.py @@ -8,6 +8,7 @@ from apscheduler.schedulers.base import STATE_PAUSED, STATE_RUNNING, STATE_STOPPED +from .default_settings import DATA_PATH as default_data_path from .default_settings import DATABASE_URL as default_database_url from .utils.setup_database import setup_database @@ -18,14 +19,23 @@ try: custom_settings_module = importlib.import_module(os.path.splitext(SCRAPYDWEB_SETTINGS_PY)[0]) except ImportError: + custom_data_path = '' custom_database_url = '' else: + custom_data_path = getattr(custom_settings_module, 'DATA_PATH', '') + custom_data_path = custom_data_path if isinstance(custom_data_path, str) else '' custom_database_url = getattr(custom_settings_module, 'DATABASE_URL', '') custom_database_url = custom_database_url if 
isinstance(custom_database_url, str) else '' -# For data path +# For data storage ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) -DATA_PATH = os.path.join(ROOT_DIR, 'data') + +DATA_PATH = default_data_path or custom_data_path +if DATA_PATH: + DATA_PATH = os.path.abspath(DATA_PATH) +else: + DATA_PATH = os.path.join(ROOT_DIR, 'data') + DATABASE_PATH = os.path.join(DATA_PATH, 'database') DEMO_PROJECTS_PATH = os.path.join(DATA_PATH, 'demo_projects') DEPLOY_PATH = os.path.join(DATA_PATH, 'deploy') @@ -47,7 +57,7 @@ TIMER_TASKS_HISTORY_LOG = os.path.join(HISTORY_LOG, 'timer_tasks_history.log') # For database -DATABASE_URL = custom_database_url or default_database_url or 'sqlite:///' + DATA_PATH +DATABASE_URL = custom_database_url or default_database_url or 'sqlite:///' + DATABASE_PATH results = setup_database(DATABASE_URL, DATABASE_PATH) APSCHEDULER_DATABASE_URI, SQLALCHEMY_DATABASE_URI, SQLALCHEMY_BINDS, DATABASE_PATH = results diff --git a/scrapydweb/views/baseview.py b/scrapydweb/views/baseview.py index 9f03c10..659c68c 100644 --- a/scrapydweb/views/baseview.py +++ b/scrapydweb/views/baseview.py @@ -12,8 +12,9 @@ from ..__version__ import __version__ as SCRAPYDWEB_VERSION from ..common import (get_now_string, get_response_from_view, handle_metadata, handle_slash, json_dumps, session) -from ..vars import (ALLOWED_SCRAPYD_LOG_EXTENSIONS, APSCHEDULER_DATABASE_URI, DEMO_PROJECTS_PATH, DEPLOY_PATH, - EMAIL_TRIGGER_KEYS, PARSE_PATH, LEGAL_NAME_PATTERN, SCHEDULE_ADDITIONAL, +from ..vars import (ALLOWED_SCRAPYD_LOG_EXTENSIONS, APSCHEDULER_DATABASE_URI, + DATA_PATH, DEMO_PROJECTS_PATH, DEPLOY_PATH, PARSE_PATH, + EMAIL_TRIGGER_KEYS, LEGAL_NAME_PATTERN, SCHEDULE_ADDITIONAL, SCHEDULE_PATH, STATE_PAUSED, STATE_RUNNING, STATS_PATH, STRICT_NAME_PATTERN) from ..utils.scheduler import scheduler @@ -52,6 +53,7 @@ def __init__(self, *args, **kwargs): # System self.DEBUG = app.config.get('DEBUG', False) self.VERBOSE = app.config.get('VERBOSE', False) + self.DATA_PATH = DATA_PATH 
self.APSCHEDULER_DATABASE_URI = APSCHEDULER_DATABASE_URI self.SQLALCHEMY_DATABASE_URI = app.config['SQLALCHEMY_DATABASE_URI'] self.SQLALCHEMY_BINDS = app.config['SQLALCHEMY_BINDS'] diff --git a/scrapydweb/views/system/settings.py b/scrapydweb/views/system/settings.py index 39ebfb5..ed03eed 100644 --- a/scrapydweb/views/system/settings.py +++ b/scrapydweb/views/system/settings.py @@ -163,6 +163,7 @@ def update_kwargs(self): # System self.kwargs['DEBUG'] = self.DEBUG self.kwargs['VERBOSE'] = self.VERBOSE + self.kwargs['DATA_PATH'] = self.DATA_PATH self.kwargs['database_details'] = self.json_dumps(dict( APSCHEDULER_DATABASE_URI=self.hide_account(self.APSCHEDULER_DATABASE_URI), SQLALCHEMY_DATABASE_URI=self.hide_account(self.SQLALCHEMY_DATABASE_URI), diff --git a/tests/test_database.py b/tests/test_database.py deleted file mode 100644 index 4c6d635..0000000 --- a/tests/test_database.py +++ /dev/null @@ -1,12 +0,0 @@ -# coding: utf-8 -import os - -from scrapydweb.vars import APSCHEDULER_DATABASE_URI, DATABASE_PATH - - -def test_sqlalchemy_database_uri(app): - database_url = os.environ.get('DATABASE_URL', 'sqlite:///' + DATABASE_PATH) - assert APSCHEDULER_DATABASE_URI.startswith(database_url) - assert app.config['SQLALCHEMY_DATABASE_URI'].startswith(database_url) - for value in app.config['SQLALCHEMY_BINDS'].values(): - assert value.startswith(database_url) diff --git a/tests/test_system.py b/tests/test_system.py new file mode 100644 index 0000000..fed26cb --- /dev/null +++ b/tests/test_system.py @@ -0,0 +1,23 @@ +# coding: utf-8 +import os +import re + +from scrapydweb.vars import APSCHEDULER_DATABASE_URI, DATA_PATH, DATABASE_PATH, ROOT_DIR + + +def test_option_data_path(app): + data_path = os.environ.get('DATA_PATH', '') + if data_path and os.environ.get('TEST_ON_CIRCLECI', 'False').lower() == 'true': + assert not os.path.isdir(os.path.join(ROOT_DIR, 'data', 'database')) + assert os.path.isdir(os.path.join(data_path or DATA_PATH, 'database')) + + +def 
test_option_database_url(app): + database_url = os.environ.get('DATABASE_URL', 'sqlite:///' + DATABASE_PATH) + assert APSCHEDULER_DATABASE_URI.startswith(database_url) + assert app.config['SQLALCHEMY_DATABASE_URI'].startswith(database_url) + for value in app.config['SQLALCHEMY_BINDS'].values(): + assert value.startswith(database_url) + + m = re.match(r'sqlite:///(.+)$', database_url) + assert os.path.isdir(m.group(1) if m else DATABASE_PATH)