diff --git a/appserver/Dockerfile b/appserver/Dockerfile index 66b2431483..d2e572feb9 100644 --- a/appserver/Dockerfile +++ b/appserver/Dockerfile @@ -1,133 +1,44 @@ -# ======================================== -# Base image -# ======================================== -FROM python:3.10-slim as base - -ENV LANG C.UTF-8 -ENV LC_ALL C.UTF-8 -ENV PYTHONDONTWRITEBYTECODE 1 -ENV PYTHONFAULTHANDLER 1 +FROM fedora:33 as base +LABEL app=kg-prototypes +# Install dependencies +RUN dnf install htop postgresql graphviz python-pip python3-devel vim net-tools which -y \ + && dnf groupinstall 'Development Tools' -y \ + && dnf clean packages RUN pip install pipenv +ENV N4J_USER n4j +ENV N4J_HOME /home/$N4J_USER +ENV UID 1000 +ENV GID 1000 -# ======================================== -# Build dependencies stage -# ======================================== -FROM base as build-deps - -# Install build dependencies -RUN apt-get update \ - && apt-get install -y liblmdb-dev python3-dev libxml2-dev libxslt-dev build-essential \ - && apt-get clean - -# Copy Pipfiles -COPY Pipfile Pipfile.lock ./ - -# Install Python dependencies -ARG DEV -RUN PIPENV_VENV_IN_PROJECT=1 pipenv install --deploy $(if [ "$DEV" ]; then echo --dev; fi) - - -# ======================================== -# Runtime stage -# ======================================== -FROM base -LABEL org.opencontainers.image.source https://github.com/SBRG/lifelike - -# Install runtime system dependencies -RUN apt-get update \ - && apt-get install -y libmagic-dev graphviz libgraphviz-dev curl \ - && apt-get clean - -# Copy Python virtual environment -COPY --from=build-deps /.venv /.venv -ENV PATH="/.venv/bin:$PATH" - -# Set user and workdir -WORKDIR /app -RUN useradd -m -d /app app -USER app - -# Copy application code -COPY --chown=app . . - -# Set to 1 to automatically apply any pending DB migrations at startup -ENV MIGRATE_DB= - -# Create an initial admin user -ENV INITIAL_ADMIN_EMAIL= - -# LMDB database volume -ENV LMDB_DATA_DIR=/lmdb -VOLUME /lmdb - -# LMDB download cloud storage -# ENV AZURE_ACCOUNT_STORAGE_NAME= -# ENV AZURE_ACCOUNT_STORAGE_KEY= - -# JWT Authendication -ENV JWT_SECRET=secret - -# Base URL of this app, reachable by external services -ENV APPSERVER_URL=http://localhost:5000 - -# Base URL of the frontend app, for link generation -ENV FRONTEND_URL=http://localhost:4242 - -# PostgreSQL configuration -ENV POSTGRES_HOST=postgres -ENV POSTGRES_PORT=5432 -ENV POSTGRES_USER=postgres -ENV POSTGRES_PASSWORD=postgres -ENV POSTGRES_DB=postgres - -# Neo4j configuration -ENV NEO4J_HOST=neo4j -ENV NEO4J_PORT=7687 -ENV NEO4J_AUTH=neo4j/password -ENV NEO4J_DATABASE=neo4j -ENV NEO4J_SCHEME=bolt - -# Elasticsearch configuration -ENV ELASTICSEARCH_URL=http://elasticsearch:9200 -ENV ELASTICSEARCH_FILE_INDEX=file - -# Statistical enrichment service -ENV STATISTICAL_ENRICHMENT_URL=http://statistical-enrichment:5000 - -# PDFParser service -ENV PDFPARSER_URL=http://pdfparser:7600 +# User and group creation +RUN groupadd -g $GID $N4J_USER && \ + useradd -u $UID -g $GID -G wheel --create-home --home-dir $N4J_HOME --shell /bin/bash $N4J_USER -# NLP Processing service -ENV NLP_URL=https://nlp-api.lifelike.bio/v1/predict -ENV NLP_SECRET=secret +WORKDIR $N4J_HOME -# Mailserver configuration -ENV FROM_EMAIL=lifelike@example.com +# Copy Pipfiles and install dependencies FIRST to better apply Docker layer cache +COPY --chown=1000:1000 Pipfile . +COPY --chown=1000:1000 Pipfile.lock . 
+RUN pipenv install --dev --deploy --system -# Sendgrid integration -ENV SENDGRID_API_KEY= +# ...then copy everything else +COPY --chown=1000:1000 . . -# Optional Sentry logging configuration -ENV SENTRY_DSN= +# TODO: We should consider breaking this apart into dev and prod +# builds, so we don't build unnecessary packages -# Optional Elastic APM configuration. -# To enable, at least ELASTIC_APM_SERVER_URL must be set -# Other available variables: https://www.elastic.co/guide/en/apm/agent/python/master/configuration.html -ENV ELASTIC_APM_SERVER_URL= -ENV ELASTIC_APM_SERVICE_NAME=appserver +# Don't lose stdin, stdout and stderr output due to buffering +ENV PYTHONUNBUFFERED 1 +ENV PYTHONPATH $N4J_HOME -# Flask env (development, testing, production) -ENV FLASK_ENV=production +# Set Python3 as the default when running "python" +RUN echo 'alias python=python3' >> ~/.bashrc && source ~/.bashrc -# Listen port -ENV PORT=5000 -EXPOSE $PORT +USER $N4J_USER -# Health check by requesting system info to /meta endpoint -HEALTHCHECK --start-period=30s \ - CMD curl -f localhost:$PORT/meta || exit 1 +# Setup flask application environment vars +ENV MAX_ALLOWED_LOGIN_FAILURES 6 -RUN chmod +x bin/docker-entrypoint.sh -ENTRYPOINT ["bin/docker-entrypoint.sh"] +CMD [ "bin/startup.sh" ] diff --git a/appserver/neo4japp/blueprints/reports.py b/appserver/neo4japp/blueprints/reports.py index 3529036c0f..c9b55d444e 100644 --- a/appserver/neo4japp/blueprints/reports.py +++ b/appserver/neo4japp/blueprints/reports.py @@ -20,7 +20,36 @@ class CopyrightInfringementReportView(MethodView): @use_args(CopyrightInfringementRequestSchema) def post(self, params: dict): - with db.session.begin_nested(): + # Try to send an email to the user and currator + send_email_exception = None + try: + message = Mail( + from_email=MESSAGE_SENDER_IDENTITY, + to_emails=params['email'], + subject=COPYRIGHT_REPORT_CONFIRMATION_EMAIL_TITLE, + html_content=COPYRIGHT_REPORT_CONFIRMATION_EMAIL_CONTENT.format( + url=params['url'], + description=params['description'], + name=params['name'], + company=params['company'], + address=params['address'], + country=params['country'], + city=params['city'], + province=params['province'], + zip=params['zip'], + phone=params['phone'], + fax=params['fax'], + email=params['email'], + ), + ) + message.add_bcc(bcc_email=LIFELIKE_EMAIL_ACCOUNT) + get_send_grid_service().send(message) + except Exception as e: + # If the email fails to send, store the exception to raise later + # after the report is saved to the database + send_email_exception = e + + try: copyright_infringement_report = CopyrightInfringementRequest( url=params['url'], description=params['description'], @@ -41,38 +70,16 @@ def post(self, params: dict): signature=params['signature'], ) db.session.add(copyright_infringement_report) - - message = Mail( - from_email=MESSAGE_SENDER_IDENTITY, - to_emails=params['email'], - subject=COPYRIGHT_REPORT_CONFIRMATION_EMAIL_TITLE, - html_content=COPYRIGHT_REPORT_CONFIRMATION_EMAIL_CONTENT.format( - url=params['url'], - description=params['description'], - name=params['name'], - company=params['company'], - address=params['address'], - country=params['country'], - city=params['city'], - province=params['province'], - zip=params['zip'], - phone=params['phone'], - fax=params['fax'], - email=params['email'], - ), - ) - message.add_bcc(bcc_email=LIFELIKE_EMAIL_ACCOUNT) - try: - get_send_grid_service().send(message) - except Exception as e: - with db.session.begin_nested(): - # If for some reason we cannot send a 
confirmation email, delete the row we just - # created and re-raise the error. - db.session.delete(copyright_infringement_report) - # rollback in case of error? + db.session.commit() + except Exception: + db.session.rollback() raise - - return jsonify(dict(result=copyright_infringement_report.to_dict())) + else: + return jsonify(dict(result=copyright_infringement_report.to_dict())) + finally: + # If the email failed to send, raise the exception before returning + if send_email_exception: + raise send_email_exception copyright_infringement_report_view = CopyrightInfringementReportView.as_view( diff --git a/appserver/neo4japp/blueprints/user.py b/appserver/neo4japp/blueprints/user.py index e33c85c59e..e0d7be30c2 100644 --- a/appserver/neo4japp/blueprints/user.py +++ b/appserver/neo4japp/blueprints/user.py @@ -7,6 +7,7 @@ from webargs.flaskparser import use_args from neo4japp.database import db +from neo4japp.exceptions import NotAuthorized from neo4japp.models import Projects, Files from neo4japp.schemas.filesystem import ( PublishSchema, @@ -48,6 +49,9 @@ def get(self, user_hash_id: str): @use_args(PublishSchema, locations=['json', 'form', 'files', 'mixed_form_json']) def post(self, params: dict, user_hash_id: str): + if g.current_user.has_role('admin') is False: + raise NotAuthorized() + file = Publish.create_uncommited_publication( user_hash_id, creator=g.current_user, **params ) diff --git a/appserver/neo4japp/constants.py b/appserver/neo4japp/constants.py index 1952651df5..9b1ae5a969 100644 --- a/appserver/neo4japp/constants.py +++ b/appserver/neo4japp/constants.py @@ -451,7 +451,7 @@ def is_db_name(s: str): RESET_PASSWORD_ALPHABET = RESET_PASSWORD_SYMBOLS + string.ascii_letters + string.digits # Start email constants -LIFELIKE_EMAIL_ACCOUNT = 'lifelike.science@gmail.com' +LIFELIKE_EMAIL_ACCOUNT = 'lifelike@biosustain.dtu.dk' MESSAGE_SENDER_IDENTITY = 'lifelike-account-service@lifelike.bio' MAILING_API_KEY = LocalProxy(lambda: config.get('SEND_GRID_EMAIL_API_KEY')) RESET_PASSWORD_EMAIL_TITLE = 'Lifelike: Account password reset' diff --git a/cache-invalidator/Dockerfile b/cache-invalidator/Dockerfile index 33f146b5ba..7a94eac270 100644 --- a/cache-invalidator/Dockerfile +++ b/cache-invalidator/Dockerfile @@ -1,64 +1,33 @@ -# ======================================== -# Base image -# ======================================== FROM python:3.10-slim as base +LABEL app=kg-prototypes -ENV LANG C.UTF-8 -ENV LC_ALL C.UTF-8 -ENV PYTHONDONTWRITEBYTECODE 1 -ENV PYTHONFAULTHANDLER 1 - +# Install dependencies +RUN apt-get update && apt-get install -y curl && apt-get clean RUN pip install pipenv +ENV APP_USER lifelike +ENV APP_HOME /home/$APP_USER +ENV UID 1000 +ENV GID 1000 -# ======================================== -# Build dependencies stage -# ======================================== -FROM base as build-deps - -# Copy Pipfiles -COPY Pipfile Pipfile.lock ./ - -# Install Python dependencies -ARG DEV -RUN PIPENV_VENV_IN_PROJECT=1 pipenv install --deploy $(if [ "$DEV" ]; then echo --dev; fi) - - -# ======================================== -# Runtime stage -# ======================================== -FROM base -LABEL org.opencontainers.image.source https://github.com/SBRG/lifelike - -# Copy Python virtual environment -COPY --from=build-deps /.venv /.venv -ENV PATH="/.venv/bin:$PATH" - -# Set user and working directory -WORKDIR /app -RUN useradd -m -d /app app -USER app +# User and group creation +RUN groupadd -g $GID $APP_USER && \ + useradd -u $UID -g $GID -G sudo --create-home --home-dir $APP_HOME 
--shell /bin/bash $APP_USER -# Copy application code -COPY --chown=app main.py ./ +WORKDIR $APP_HOME -# Neo4j configuration -ENV NEO4J_HOST=neo4j -ENV NEO4J_PORT=7687 -ENV NEO4J_AUTH=neo4j/password -ENV NEO4J_SCHEME=bolt -ENV NEO4J_DATABASE=neo4j +# Copy Pipfiles and install dependencies FIRST to better apply Docker layer cache +COPY --chown=1000:1000 Pipfile . +COPY --chown=1000:1000 Pipfile.lock . +RUN pipenv install --deploy --dev --system -# Redis cache configuration -ENV REDIS_HOST=redis -ENV REDIS_PORT=6379 -ENV REDIS_PASSWORD=password -ENV REDIS_DB=0 +# ...then copy everything else +COPY --chown=1000:1000 . . -# Default TTL for cache -ENV CACHE_TTL=86400 +# Don't lose stdin, stdout and stderr output due to buffering +ENV PYTHONUNBUFFERED 1 +ENV PYTHONPATH $APP_HOME -# Logging level -ENV LOG_LEVEL=INFO +USER $APP_USER -CMD ["python", "main.py"] +CMD [ "bin/startup.sh" ] diff --git a/client/Dockerfile b/client/Dockerfile index 268192f7ce..af7fe9281b 100644 --- a/client/Dockerfile +++ b/client/Dockerfile @@ -1,5 +1,20 @@ ARG NODE_IMAGE_TAG=node:14 +# ======================================== +# Landing page +# ======================================== +FROM $NODE_IMAGE_TAG as landing-build +WORKDIR /app + +# Install dependencies +COPY landing/package.json landing/yarn.lock ./ +RUN yarn install + +# Build landing page +COPY landing ./ +RUN yarn build + + # ================================================================== # Angular app dependencies by default used for local development # ================================================================== @@ -8,58 +23,71 @@ WORKDIR /app # Install dependencies COPY package.json yarn.lock ./ -ARG YARN_INSTALL_OPTS -RUN yarn install ${YARN_INSTALL_OPTS} - -ENV ENVIRONMENT_CONFIG development +RUN yarn install +# build time arguments for Angular environment +ARG ANGULAR_CONFIG=development +ARG CLIENT_VERSION=undefined +# default enviroment presets +ENV ENVIRONMENT_CONFIG $ANGULAR_CONFIG # ======================================== # Angular app bundle build # ======================================== FROM angular-deps as angular-build +# build time arguments for Angular environment +ARG ANGULAR_CONFIG=production +ARG CLIENT_VERSION=undefined + # Copy the code and build the app bundle COPY src ./src COPY tslint ./tslint COPY e2e ./e2e COPY *.json browserslist ./ - -ARG ANGULAR_CONFIG=production -ENV NODE_OPTIONS=--max-old-space-size=4096 -RUN yarn build --configuration=$ANGULAR_CONFIG --output-path=dist - -# When targeting this image stage, run angulat dev server -EXPOSE 4200 -HEALTHCHECK --interval=5m --timeout=10s \ - CMD curl -f localhost:4200 || exit 1 -CMD yarn dev-start - +RUN sed -i "s/__VERSION__/${CLIENT_VERSION}/" src/environments/environment.ts +RUN yarn build --configuration=$ANGULAR_CONFIG --aot --output-path=dist # ======================================== # Runtime stage - NGINX # ======================================== -FROM nginx:1.21 -LABEL org.opencontainers.image.source https://github.com/SBRG/lifelike - +FROM nginx:1.25.1 +LABEL app=kg-prototypes WORKDIR /usr/share/nginx/html -# Copy built assets -COPY --from=angular-build /app/dist ./ +# URL to proxy requests to /api +ENV APPSERVER_UPSTREAM http://appserver:5000 -# Copy nginx configuraiton template -COPY nginx.conf /etc/nginx/templates/default.conf.template +# Whether to run the app in prod mode +ENV PRODUCTION_MODE true -# appserver URL to proxy /api requests -ENV APPSERVER_URL http://appserver:5000 +# Whether we are running with valid KEGG license +ENV KEGG_ENABLED false + +# 
Whether to run the app with oauth login +ENV OAUTH_ENABLED false + +# OAuth issuer discovery URL +ENV OAUTH_ISSUER "" + +# Client ID of the OAuth application +ENV OAUTH_CLIENT_ID "" # List of space delimited list of non-stantdard MIME types # which are known to benefit from gzip compression (text based content) ENV GZIP_EXTRA_TYPES text/tsv vnd.lifelike.document/bioc vnd.lifelike.document/enrichment-table vnd.lifelike.document/graph vnd.lifelike.document/map -# Runtime environment configuration preset -ENV ENVIRONMENT_CONFIG production +# build time argument for Angular environment +ARG ANGULAR_CONFIG=production + +# default environment presets +ENV ENVIRONMENT_CONFIG $ANGULAR_CONFIG + +# Copy nginx configuration template +COPY nginx.conf /etc/nginx/templates/default.conf.template + +# Copy built assets +COPY --from=landing-build /app/dist ./ +COPY --from=angular-build /app/dist ./ -# Listen port -ENV PORT 80 -EXPOSE $PORT +EXPOSE 80 diff --git a/client/landing/src/index.html b/client/landing/src/index.html index 468d424815..2664fc7ce3 100644 --- a/client/landing/src/index.html +++ b/client/landing/src/index.html @@ -45,12 +45,13 @@

From Big Data
to Big Picture

picture understanding, augmenting our intelligence in solving complex problems. - + diff --git a/client/src/app/app-routing.module.ts b/client/src/app/app-routing.module.ts index 0ae44202f8..3e43ffe2af 100644 --- a/client/src/app/app-routing.module.ts +++ b/client/src/app/app-routing.module.ts @@ -4,7 +4,6 @@ import { Router, RouterModule, Routes } from '@angular/router'; import { Store } from '@ngrx/store'; import { AdminPanelComponent } from 'app/admin/components/admin-panel.component'; -import { UserFileImportComponent } from 'app/user-file-import/components/user-file-import.component'; import { VisualizationComponent } from 'app/visualization/containers/visualization/visualization.component'; import { GraphSearchComponent } from 'app/search/components/graph-search.component'; import { ObjectBrowserComponent } from 'app/file-browser/components/object-browser.component'; @@ -25,9 +24,8 @@ import { CommunityBrowserComponent } from 'app/file-browser/components/community import { BrowserComponent } from 'app/file-browser/components/browser/browser.component'; import { ContentSearchComponent } from 'app/search/components/content-search.component'; import { ObjectNavigatorComponent } from 'app/file-navigator/components/object-navigator.component'; -import { ShortestPathComponent } from 'app/shortest-path/containers/shortest-path.component'; -import {EnrichmentTableViewerComponent} from 'app/enrichment/components/table/enrichment-table-viewer.component'; -import {EnrichmentVisualisationViewerComponent} from 'app/enrichment/components/visualisation/enrichment-visualisation-viewer.component'; +import { EnrichmentTableViewerComponent } from 'app/enrichment/components/table/enrichment-table-viewer.component'; +import { EnrichmentVisualisationViewerComponent } from 'app/enrichment/components/visualisation/enrichment-visualisation-viewer.component'; import { BiocViewComponent } from 'app/bioc-viewer/components/bioc-view.component'; import { ObjectViewerComponent } from 'app/file-browser/components/object-viewer.component'; import { SankeyViewComponent } from 'app/sankey/components/sankey-view.component'; @@ -113,11 +111,6 @@ const routes: Routes = [ fontAwesomeIcon: 'search', }, }, - { - path: 'pathway-browser-prototype', - canActivate: [AuthGuard], - component: ShortestPathComponent, - }, { path: 'projects/:project_name/enrichment-table/:file_id', canActivate: [], @@ -172,15 +165,6 @@ const routes: Routes = [ fontAwesomeIcon: 'fas fa-chart-network', }, }, - { - path: 'upload', - component: UserFileImportComponent, - canActivate: [AuthGuard], - data: { - title: 'Knowledge Graph Upload', - fontAwesomeIcon: 'fas fa-chart-network', - }, - }, ], }, { diff --git a/client/src/app/app.component.html b/client/src/app/app.component.html index b92e740d4f..6328577354 100644 --- a/client/src/app/app.component.html +++ b/client/src/app/app.component.html @@ -105,9 +105,6 @@ [appAutoCloseTooltipOutOfView]="tooltipRef"> --> - - - Publish diff --git a/client/src/app/file-browser/components/published-browser/published-browser.component.ts b/client/src/app/file-browser/components/published-browser/published-browser.component.ts index bb7cbe0349..b68c26555c 100644 --- a/client/src/app/file-browser/components/published-browser/published-browser.component.ts +++ b/client/src/app/file-browser/components/published-browser/published-browser.component.ts @@ -49,6 +49,8 @@ export class PublishedBrowserComponent implements OnInit, OnDestroy { ); private loadTaskSubscription: Subscription; + readonly disablePublishMessage = + 
'You do not have permission to publish files. Please contact an administrator if you need this feature.'; constructor( private readonly filesystemService: FilesystemService, diff --git a/client/src/environments/development.css b/client/src/environments/development.css index 8b13789179..e69de29bb2 100644 --- a/client/src/environments/development.css +++ b/client/src/environments/development.css @@ -1 +0,0 @@ - diff --git a/client/src/environments/production.css b/client/src/environments/production.css index 8b13789179..e69de29bb2 100644 --- a/client/src/environments/production.css +++ b/client/src/environments/production.css @@ -1 +0,0 @@ - diff --git a/docs/wiki/Architecture overview.drawio.png b/docs/wiki/Architecture overview.drawio.png new file mode 100644 index 0000000000..21b24aa01b Binary files /dev/null and b/docs/wiki/Architecture overview.drawio.png differ diff --git a/docs/wiki/README.md b/docs/wiki/README.md new file mode 100644 index 0000000000..70fb17a96c --- /dev/null +++ b/docs/wiki/README.md @@ -0,0 +1,159 @@ +# Architecture overview + +![Architecture overview diagram](Architecture overview.drawio.png) +NOTE: The diagram was created with [draw.io](https://draw.io); this is the editable version (a copy of the diagram has been embedded into the file). + +# Service functional groups + +## Authentication + +Authentication-related code lives in the Front-end and appserver. Additionally, in the publish environment, Keycloak is used as the OAuth2 provider. + +## Annotation + +Annotation steps rely on the Front-end, appserver, pdfparser, Redis, Postgres and the graph database. +Only two types of files are annotatable: PDFs and enrichment tables. + +In principle, the annotation pipeline works as follows for PDF files: + +1. Front-end sends a request to appserver to annotate a file. +2. Appserver sends a request to pdfparser to extract words and their locations. +3. Load core annotation terms into LMDB +4. Load annotation inclusions from the graph database +5. Load annotation exclusions from Postgres +6. Search for matches between words and annotation terms +7. Matches are structured into a JSON annotation structure +8. Saving the JSON annotation structure to Postgres is scheduled on the Redis queue + 1. A worker process reads the annotation structure from the Redis queue and saves it to Postgres +9. Appserver returns a response (to the request from step 1) with the annotation structure +10. Front-end renders the annotations + +In the case of an enrichment table, the process is similar: + +1. Front-end sends a request to appserver to annotate the enrichment table. +2. The enrichment table is stringified to text +3. Appserver sends a request to pdfparser to extract words from the text and their locations. +4. Load core annotation terms into LMDB +5. Load annotation inclusions from the graph database +6. Load annotation exclusions from Postgres +7. Search for matches between words and annotation terms +8. Matches are structured into a JSON annotation structure +9. Saving the JSON annotation structure to Postgres is scheduled on the Redis queue + 1. A worker process reads the annotation structure from the Redis queue and saves it to Postgres +10. The annotation structure is translated into a JSON table structure containing XML snippets for the annotations +11. Appserver returns a response (to the request from step 1) with the JSON table structure containing XML snippets for the annotations +12. Front-end overwrites the enrichment table with the XML snippets for the annotations +13. Front-end renders the annotations +14. Front-end sends a request to appserver to save the enrichment table +15. Appserver schedules saving the enrichment table to Postgres + 15.1.
A worker process reads the enrichment table from the Redis queue and saves it to Postgres + +## Search/Indexing + +Search and indexing steps rely on the Front-end, appserver, Postgres and Elasticsearch. + +In the appserver code, upon each modification to a file: + +1. A Redis queue task is scheduled to reindex the file + 1. A worker process reads the file content from the Redis queue + 2. The file content is parsed into text + 3. The text is sent to Elasticsearch for indexing + +In the case of search, the request is simply sent to Elasticsearch and the results are returned to the Front-end. + +## Enrichment (table) + +The enrichment table relies on the Front-end, appserver, Postgres and the graph database. + +1. Front-end sends a request to appserver to enrich a gene list. + 2. Appserver sends a request to the graph database to get related gene information. +3. Appserver returns a response (to the request from step 1) with the gene information. +4. Front-end composes the request from step 1 and the gene information into an enrichment file. +5. Front-end sends a request to appserver to save the enrichment file. +6. Front-end sends an annotation request. + +## Statistical enrichment + +Statistical enrichment relies on the Front-end, appserver, Redis, the graph database and the statistical enrichment container. + +1. Front-end sends a request to appserver to perform statistical enrichment. +2. Appserver forwards the request to the statistical enrichment container to perform statistical enrichment. +3. Statistical enrichment checks if any part of the input data or the whole request is in the Redis cache. +4. In case of a cache hit, statistical enrichment returns results from the cache. + Otherwise, the missing input data is queried from the graph database and statistical enrichment is performed (caching both intermediate steps and final results). +5. Statistical enrichment returns results to appserver. +6. Appserver returns results to the Front-end. +7. Front-end renders the results. + +# Services in detail + +## Front-end + +[Source code](https://github.com/SBRG/kg-prototypes/tree/master/client) +[Source code welcome page](https://github.com/SBRG/lifelike-website) + +Frontend (TypeScript, Angular, Bootstrap) + +## Appserver + +[Source code](https://github.com/SBRG/kg-prototypes/tree/master/appserver) +Only endpoint for the frontend / core logic + +Runs Postgres migrations on start + +Converts text entities to annotations + +Parses file contents for Elasticsearch + updates Elasticsearch indexes + +## PDF parser + +[Source code](https://github.com/SBRG/pdfparse) +[Source code pdfbox2](https://github.com/SBRG/pdfbox2) + +Extracts text blocks from PDF/text + +## Elasticsearch + +[Source code](https://github.com/SBRG/kg-prototypes/tree/master/elasticsearch) + +## Graph database + +[Source code neo4j](https://github.com/SBRG/kg-prototypes/tree/master/neo4j) +[Source code arango](https://github.com/SBRG/kg-prototypes/tree/master/arango) + +Graph database for knowledge and annotation exclusions (inclusions are in Postgres) + +## Keycloak + +[Source code](https://github.com/SBRG/lifelike-keycloak) + +## Cache-invalidator + +[Source code](https://github.com/SBRG/kg-prototypes/tree/master/cache-invalidator) +Cron-like service recalculating common and expensive cache values in a timely manner. + +## Redis + +Used for cache and as a task queue. + +## Postgres + +Main DB containing users, files, annotations, etc.
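The Redis-queue hand-off used throughout the annotation and indexing pipelines above (appserver schedules a task, a worker persists the result to Postgres) could look roughly like the following minimal sketch. It assumes an rq-style queue; the queue name, connection settings and function names are illustrative only and not necessarily what the appserver actually uses.

```python
from redis import Redis
from rq import Queue

# Assumed connection settings; the real service reads these from configuration.
redis_conn = Redis(host="redis", port=6379)
annotation_queue = Queue("annotations", connection=redis_conn)


def save_annotations(file_id: str, annotations: dict) -> None:
    """Runs inside a worker process: persist the JSON annotation structure."""
    ...  # write `annotations` for `file_id` to Postgres here


def schedule_annotation_save(file_id: str, annotations: dict) -> None:
    # The request handler returns immediately; a worker picks the job up later.
    annotation_queue.enqueue(save_annotations, file_id, annotations)
```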
+ +## Elasticsearch + +Hold logs, metrics, indexed file contents + +## Logstash + +[Source code](https://github.com/SBRG/kg-prototypes/tree/master/logstash) + +## Metricbeat + +[Source code](https://github.com/SBRG/kg-prototypes/tree/master/metricbeat) +save docker metrics to elastic + +## Filebeat + +[Source code](https://github.com/SBRG/kg-prototypes/tree/master/filebeat) +save docker logs to elastic diff --git a/docs/wiki/binderhub/cluster-issuer-prod.yaml b/docs/wiki/binderhub/cluster-issuer-prod.yaml index ee2a1306ac..a902e81e9e 100644 --- a/docs/wiki/binderhub/cluster-issuer-prod.yaml +++ b/docs/wiki/binderhub/cluster-issuer-prod.yaml @@ -7,7 +7,7 @@ spec: # The ACME server URL server: https://acme-v02.api.letsencrypt.org/directory # Email address used for ACME registration - email: e4sanchez@eng.ucsd.edu + email: # Name of a secret used to store the ACME account private key privateKeySecretRef: name: letsencrypt-prod diff --git a/docs/wiki/binderhub/cluster-issuer-stg.yaml b/docs/wiki/binderhub/cluster-issuer-stg.yaml index b17ef68316..9f7fbac7c1 100644 --- a/docs/wiki/binderhub/cluster-issuer-stg.yaml +++ b/docs/wiki/binderhub/cluster-issuer-stg.yaml @@ -7,7 +7,7 @@ spec: # The ACME server URL server: https://acme-staging-v02.api.letsencrypt.org/directory # Email address used for ACME registration - email: e4sanchez@eng.ucsd.edu + email: # Name of a secret used to store the ACME account private key privateKeySecretRef: name: letsencrypt-staging diff --git a/docs/wiki/binderhub/config-prod.yaml b/docs/wiki/binderhub/config-prod.yaml index 600efb7c29..c62ea40f9a 100644 --- a/docs/wiki/binderhub/config-prod.yaml +++ b/docs/wiki/binderhub/config-prod.yaml @@ -1,90 +1,136 @@ config: BinderHub: auth_enabled: true - hub_url: https://jupyter-demo.lifelike.bio + cors_allow_origin: "*" + hub_url: https://jupyter.lifelike.bio + image_prefix: lifelikebinderhub.azurecr.io/binderhub/notebooks- use_registry: true - image_prefix: gcr.io/able-goods-221820/binderhub-demo GitHubRepoProvider: access_token: # Get this value from our existing Binderhub release (you can find it in the helm chart) banned_specs: - ^(?!SBRG/|\.).* -service: - type: ClusterIP - -jupyterhub: - proxy: - service: - type: ClusterIP - https: - enabled: true - type: letsencrypt ingress: - enabled: true - hosts: - - jupyter-demo.lifelike.bio annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod kubernetes.io/ingress.class: nginx kubernetes.io/tls-acme: "true" - cert-manager.io/cluster-issuer: letsencrypt-prod - tls: - - secretName: jupyter-demo-lifelike-bio-tls + enabled: true hosts: - - jupyter-demo.lifelike.bio + - binder.lifelike.bio + https: + enabled: true + type: letsencrypt + pathType: Prefix + tls: + - hosts: + - binder.lifelike.bio + secretName: binder-lifelike-bio-tls + +imageCleaner: + # Turning this off as it does not seem to work on AKS with this version of DinD + enabled: false + +jupyterhub: cull: - # Since we're using authenticated users, don't cull them. We would set this to true if we were using temp users. 
+ concurrency: 10 + enabled: true + every: 600 + maxAge: 86400 + removeNamedServers: false + timeout: 1200 users: false - timeout: 1800 - removeNamedServers: true hub: allowNamedServers: true - namedServerLimitPerUser: 5 - shutdownOnLogout: true - redirectToServer: false config: BinderSpawner: auth_enabled: true - JupyterHub: - authenticator_class: google - Authenticator: - allowed_users: - - e4sanchez@@eng.ucsd.edu # Ethan - # You can add more users here + cors_allow_origin: "*" GoogleOAuthenticator: + admin_users: + - e4sanchez@ucsd.edu + allow_existing_users: true client_id: # Get this value from the Google Cloud Console client_secret: # Get this value from the Google Cloud Console - oauth_callback_url: https://jupyter-demo.lifelike.bio/hub/oauth_callback + hosted_domain: + - ucsd.edu + - biosustain.dtu.dk login_service: Google + oauth_callback_url: https://jupyter.lifelike.bio/hub/oauth_callback + JupyterHub: + admin_access: true + authenticator_class: google + consecutiveFailureLimit: 5 + namedServerLimitPerUser: 5 services: binder: + admin: true + apiToken: null oauth_client_id: service-binderhub oauth_no_confirm: true - oauth_redirect_uri: "https://binder-demo.lifelike.bio/oauth_callback" - loadRoles: - user: - scopes: - - self - - "access:services" - singleuser: - # to make notebook servers aware of hub - cmd: jupyterhub-singleuser - + oauth_redirect_uri: https://binder.lifelike.bio/oauth_callback ingress: - enabled: true - https: - enabled: true - type: letsencrypt - hosts: - - binder-demo.lifelike.bio annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod kubernetes.io/ingress.class: nginx kubernetes.io/tls-acme: "true" - cert-manager.io/cluster-issuer: letsencrypt-prod - tls: - - secretName: binder-demo-lifelike-bio-tls + enabled: true hosts: - - binder-demo.lifelike.bio + - jupyter.lifelike.bio + pathType: Prefix + tls: + - hosts: + - jupyter.lifelike.bio + secretName: jupyter-lifelike-bio-tls + proxy: + https: + enabled: true + type: letsencrypt + rbac: + enabled: true + scheduling: + userScheduler: + # Turning this off per: https://discourse.jupyter.org/t/singleuser-pods-stuck-in-pending/6349/11 + enabled: false + singleuser: + cmd: jupyterhub-singleuser + cpu: + guarantee: 0.1 + limit: 1 + defaultUrl: /lab + extraEnv: + # Set any required environment variables here, for example: + ARANGO_PASSWORD: you-real-password + image: + # Very important this is present, the singleuser pods need credentials to pull the Jupyterhub image + pullSecrets: [bindercred] + memory: + guarantee: 256M + limit: 40G + startTimeout: 300 + storage: + capacity: 10Gi + dynamic: + pvcNameTemplate: claim-{username}{servername} + storageAccessModes: + - ReadWriteOnce + storageClass: standard + volumeNameTemplate: volume-{username}{servername} + extraLabels: {} + extraVolumeMounts: [] + extraVolumes: [] + homeMountPath: /home/jovyan + static: + pvcName: null + subPath: "{username}" + type: none + uid: 1000 + +# Docker is not a supported runtime on AKS, so we must use DinD +dind: + enabled: true registry: - password: # You can find this defined in the existing deployment - url: https://gcr.io + url: # Get this from the desired Azure Container Registry + username: # Get this from the desired Azure Container Registry + password: # Get this from the desired Azure Container Registry diff --git a/docs/wiki/binderhub/config-stg.yaml b/docs/wiki/binderhub/config-stg.yaml index ade6611806..3fc8f025ec 100644 --- a/docs/wiki/binderhub/config-stg.yaml +++ b/docs/wiki/binderhub/config-stg.yaml @@ -1,90 +1,136 
@@ config: BinderHub: auth_enabled: true - hub_url: https://jupyter-demo.lifelike.bio + cors_allow_origin: "*" + hub_url: https://jupyter.lifelike.bio + image_prefix: lifelikebinderhub.azurecr.io/binderhub/notebooks- use_registry: true - image_prefix: gcr.io/able-goods-221820/binderhub-demo GitHubRepoProvider: access_token: # Get this value from our existing Binderhub release (you can find it in the helm chart) banned_specs: - ^(?!SBRG/|\.).* -service: - type: ClusterIP - -jupyterhub: - proxy: - service: - type: ClusterIP - https: - enabled: true - type: letsencrypt ingress: - enabled: true - hosts: - - jupyter-demo.lifelike.bio annotations: + cert-manager.io/cluster-issuer: letsencrypt-stg kubernetes.io/ingress.class: nginx kubernetes.io/tls-acme: "true" - cert-manager.io/cluster-issuer: letsencrypt-staging + enabled: true + hosts: + - binder.lifelike.bio + https: + enabled: true + type: letsencrypt + pathType: Prefix tls: - - secretName: jupyter-demo-lifelike-bio-tls - hosts: - - jupyter-demo.lifelike.bio + - hosts: + - binder.lifelike.bio + secretName: binder-lifelike-bio-tls + +imageCleaner: + # Turning this off as it does not seem to work on AKS with this version of DinD + enabled: false + +jupyterhub: cull: - # Since we're using authenticated users, don't cull them. We would set this to true if we were using temp users. + concurrency: 10 + enabled: true + every: 600 + maxAge: 86400 + removeNamedServers: false + timeout: 1200 users: false - timeout: 1800 - removeNamedServers: true hub: allowNamedServers: true - namedServerLimitPerUser: 5 - shutdownOnLogout: true - redirectToServer: false config: BinderSpawner: auth_enabled: true - JupyterHub: - authenticator_class: google - Authenticator: - allowed_users: - - e4sanchez@@eng.ucsd.edu # Ethan - # You can add more users here + cors_allow_origin: "*" GoogleOAuthenticator: + admin_users: + - e4sanchez@ucsd.edu + allow_existing_users: true client_id: # Get this value from the Google Cloud Console client_secret: # Get this value from the Google Cloud Console - oauth_callback_url: https://jupyter-demo.lifelike.bio/hub/oauth_callback + hosted_domain: + - ucsd.edu + - biosustain.dtu.dk login_service: Google + oauth_callback_url: https://jupyter.lifelike.bio/hub/oauth_callback + JupyterHub: + admin_access: true + authenticator_class: google + consecutiveFailureLimit: 5 + namedServerLimitPerUser: 5 services: binder: + admin: true + apiToken: null oauth_client_id: service-binderhub oauth_no_confirm: true - oauth_redirect_uri: "https://binder-demo.lifelike.bio/oauth_callback" - loadRoles: - user: - scopes: - - self - - "access:services" - singleuser: - # to make notebook servers aware of hub - cmd: jupyterhub-singleuser - + oauth_redirect_uri: https://binder.lifelike.bio/oauth_callback ingress: + annotations: + cert-manager.io/cluster-issuer: letsencrypt-stg + kubernetes.io/ingress.class: nginx + kubernetes.io/tls-acme: "true" enabled: true + hosts: + - jupyter.lifelike.bio + pathType: Prefix + tls: + - hosts: + - jupyter.lifelike.bio + secretName: jupyter-lifelike-bio-tls + proxy: https: enabled: true type: letsencrypt - hosts: - - binder-demo.lifelike.bio - annotations: - kubernetes.io/ingress.class: nginx - kubernetes.io/tls-acme: "true" - cert-manager.io/cluster-issuer: letsencrypt-staging - tls: - - secretName: binder-demo-lifelike-bio-tls - hosts: - - binder-demo.lifelike.bio + rbac: + enabled: true + scheduling: + userScheduler: + # Turning this off per: https://discourse.jupyter.org/t/singleuser-pods-stuck-in-pending/6349/11 + enabled: 
false + singleuser: + cmd: jupyterhub-singleuser + cpu: + guarantee: 0.1 + limit: 1 + defaultUrl: /lab + extraEnv: + # Set any required environment variables here, for example: + ARANGO_PASSWORD: you-real-password + image: + # Very important this is present, the singleuser pods need credentials to pull the Jupyterhub image + pullSecrets: [bindercred] + memory: + guarantee: 256M + limit: 40G + startTimeout: 300 + storage: + capacity: 10Gi + dynamic: + pvcNameTemplate: claim-{username}{servername} + storageAccessModes: + - ReadWriteOnce + storageClass: standard + volumeNameTemplate: volume-{username}{servername} + extraLabels: {} + extraVolumeMounts: [] + extraVolumes: [] + homeMountPath: /home/jovyan + static: + pvcName: null + subPath: "{username}" + type: none + uid: 1000 + +# Docker is not a supported runtime on AKS, so we must use DinD +dind: + enabled: true registry: - password: # You can find this defined in the existing deployment - url: https://gcr.io + url: # Get this from the desired Azure Container Registry + username: # Get this from the desired Azure Container Registry + password: # Get this from the desired Azure Container Registry diff --git a/docs/wiki/binderhub/ingress-nginx.yaml b/docs/wiki/binderhub/ingress-nginx.yaml index c5fcd5d3b2..367c4d88c5 100644 --- a/docs/wiki/binderhub/ingress-nginx.yaml +++ b/docs/wiki/binderhub/ingress-nginx.yaml @@ -2,6 +2,13 @@ ## Ref: https://github.com/kubernetes/ingress-nginx/blob/main/charts/ingress-nginx/values.yaml controller: + # Set any nginx configurations here + config: { "proxy-body-size": "15m", "proxy-read-timeout": "15m", "proxy-send-timeout": "15m" } + service: + annotations: + # It is critical that this annotation is present, otherwise the ingress pods may fail to start! + service.beta.kubernetes.io/azure-load-balancer-health-probe-request-path: "/healthz" + # -- Used by cloud providers to connect the resulting `LoadBalancer` to a pre-existing static IP according to https://kubernetes.io/docs/concepts/services-networking/service/#loadbalancer - loadBalancerIP: + loadBalancerIP: diff --git a/docs/wiki/binderhub/setup-binderhub.md b/docs/wiki/binderhub/setup-binderhub.md index 36fa9e178c..98ccc6735c 100644 --- a/docs/wiki/binderhub/setup-binderhub.md +++ b/docs/wiki/binderhub/setup-binderhub.md @@ -12,28 +12,32 @@ - [Install cert-manager](#install-cert-manager) - [Create a Temporary Certificate Issuer](#create-a-temporary-certificate-issuer) - [Add and Install Ingress NGINX](#add-and-install-ingress-nginx) - - [Optional: Add a Config Map for the NGINX Proxy](#optional-add-a-config-map-for-the-nginx-proxy) +- [Create Container Registry Login Secret](#create-container-registry-login-secret) - [Install Binderhub](#install-binderhub) - [Verify SSL Certs are Created](#verify-ssl-certs-are-created) - [Additional Configurations](#additional-configurations) ## Introduction -This guide will walk you through the process of creating a brand-new Binderhub cluster on Google Cloud. The example configuration files can also be used for other cloud services. +This guide will walk you through the process of creating a brand new Binderhub cluster on Microsoft Azure. -If you do not have a Google Cloud account or project, you can create one by following the instructions [here](https://cloud.google.com/resource-manager/docs/creating-managing-projects). +If you do not have a Azure account or project, you can create one [here](https://azure.microsoft.com/en-us/free). 
-The guide also primarily uses the Google Cloud Console Terminal, so you do not need to worry about navigating around the GUI to the various resources we create. +This guide primarily uses the web browser Azure Portal to create and manage resources, but you may also use the Azure CLI instead. -If you want to install the Google Cloud CLI on your machine, consult this [guide](https://cloud.google.com/sdk/docs/install). +If you want to install the Azure CLI on your machine, consult this [guide](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli). + +You will also need to install the Kubernetes CLI in order to create some resources on the cluster. You can find instructions on how to install the CLI [here](https://kubernetes.io/docs/tasks/tools/) Finally, we will be using [Helm](https://helm.sh/) to create several Kubernetes resources. Please install Helm locally if you wish to follow this guide on a local terminal. +You may also consider installing [Lens](https://k8slens.dev/), an IDE specifically for connecting to Kubernetes clusters and viewing/managing their resources. This is not required to follow this guide, but it can be an invaluable tool for debugging any issues that may arise. + ## Caveats These intructions assume you will be using the configuration files provided alongside the guide. This will enable a few additional features not included in a vanilla BinderHub deployment. -Most notably, user authentication will be turned on, with Google as the OAuth provider. If you do not require any authentication (highly discouraged), you can use the config-with-no-auth.yaml file instead of the recommended config-stg.yaml and config-prod.yaml. +Most notably, user authentication will be turned on, with Google as the OAuth provider. Also, do note the `GithubRepoProvider` property under the top-level `config` mapping. This is currently configured to ONLY ALLOW Github repositories from within the SBRG organization to be used with the BinderHub deployment. @@ -41,24 +45,17 @@ Finally, recognize that some of the values in the BinderHub configuration may ne ## Create the Cluster -First, create the cluster we will install Binderhub on to: +First, we need to create the cluster we will install Binderhub on to. Please follow the guide [here](https://learn.microsoft.com/en-us/azure/aks/learn/quick-kubernetes-deploy-portal?tabs=azure-cli) for general instructions on how to do so. -```bash -gcloud container clusters create \ - --machine-type n1-standard-2 \ - --num-nodes 2 \ - --zone us-central1 \ - --cluster-version latest \ - -``` +**_However_**, please use the following settings for your cluster, instead of the defaults: -Next, set the admin role binding for your Google Cloud account. This will ensure you're able to make changes to the cluster we've just created: - -```bash -kubectl create clusterrolebinding cluster-admin-binding \ - --clusterrole=cluster-admin \ - --user= -``` +- **Kubernetes version**: 1.26.10 + - This is a confirmed working version of AKS with the version of Binderhub we will be using. +- **Authentication and Authorization**: Local accounts with Kubernetes RBAC +- **Network Policy**: None + - Any other setting may cause issues with the Nginx load balancer. Use another setting at your own risk! +- **Network type (plugin)**: Kubenet + - This will likely be the default, but ensure that it is set to the correct value. Other values have not been validated with the existing Binderhub configuration! 
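If you prefer the Azure CLI to the Portal, creating the cluster with the settings above might look roughly like this (a sketch only: the resource group, cluster name and node count are placeholders, and the flags should be verified against your installed CLI version):

```bash
# Placeholder resource group, cluster name and node count; adjust to your setup
az aks create \
  --resource-group my-binderhub-rg \
  --name lifelike-binderhub \
  --kubernetes-version 1.26.10 \
  --network-plugin kubenet \
  --node-count 2 \
  --generate-ssh-keys

# Fetch credentials so kubectl talks to the new cluster
az aks get-credentials --resource-group my-binderhub-rg --name lifelike-binderhub
```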
### Create a Namespace for the Binderhub Resources @@ -76,17 +73,15 @@ Before we install Binderhub, we will first install a load balancer to handle req ### Reserve a Static IP Address -The load balancer will need a static IP so we can be certain the address won't change. You can reserve a static IP on Google Cloud with the following command: +The load balancer will need a static IP so we can be certain the address won't change. You can reserve a static IP on Azure by following [these](https://learn.microsoft.com/en-us/azure/virtual-network/ip-services/create-public-ip-portal?tabs=option-1-create-public-ip-standard) instructions. -```bash -gcloud compute addresses create --region us-central1 -``` - -You can name the IP address whatever you want, but use something descriptive like "lifelike-binderhub-proxy". If you'd like a more in-depth explanation of how to reserve a static IP, please follow the [official guide](https://cloud.google.com/compute/docs/ip-addresses/reserve-static-external-ip-address). +You can name the IP address whatever you want, but use something descriptive like "lifelike-binderhub-proxy". ### Configure DNS with the Static IP -Registering a domain name is beyond the scope of this guide, but you will need a domain to use for an authenticated Binderhub server. If you are using Google Cloud, DNS zones can be configured under: Networking > Network Services > Cloud DNS. +Registering a domain name is beyond the scope of this guide, but you will need a domain to use for an authenticated Binderhub server. + +Note that for the Lifelike project we are currently using Google Cloud for our DNS nameservers. DNS zones in Google Cloud can be configured under: Networking > Network Services > Cloud DNS. If you are using a different cloud provider, DNS management will likely be found under a networking resource. Very minimally, you will need to create two new zone standards (most likely A records) for both the BinderHub server and the Jupyterhub server. For example, for the Lifelike project we have the "lifelike.bio" DNS zone, with A records for "binder.lifelike.bio" and "jupyter.lifelike.bio" pointing at the IP address "35.188.33.138". This means that the domains "binder.lifelike.bio" and "jupyter.lifelike.bio" refers to the IP address "35.188.33.138", which itself identifies the load balancer server. @@ -136,7 +131,7 @@ helm install ingress-nginx/ingress-nginx --namesp This will take a few moments. Note that you can name your ingress-nginx installation anything you want, but consider a descriptive name like "binderhub-ingress-nginx". Also, be sure to apply a custom configuration with the `-f` flag as in the example above. Minimally, your config file should include a definition for the load balancer IP. -See the "ingress-nginx.yaml" example file included in the same directory as this guide. +See the "ingress-nginx.yaml" example file included in the same directory as this guide. Please pay careful attention to the comments in this file! Omission of any properties within the config may lead to undesired behavior from the ingress. 
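Since the hunk above only shows part of the install command, for reference the full sequence might look roughly like the following (a sketch: the repo alias and release name are assumptions, and the values file is the "ingress-nginx.yaml" discussed above):

```bash
# Add the ingress-nginx chart repository if it is not present yet
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
helm repo update

# Install the controller into the binderhub namespace with the example values file
helm install binderhub-ingress-nginx ingress-nginx/ingress-nginx \
  --namespace binderhub \
  -f ingress-nginx.yaml
```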
To check on the status of the ingress-nginx controller, you can run the following command: @@ -148,24 +143,26 @@ You should see something like the following: ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -binderhub-ingress-nginx-controller LoadBalancer 10.4.7.240 35.188.33.138 80:32402/TCP,443:32177/TCP 9d +binderhub-ingress-nginx-controller LoadBalancer 10.4.7.240 80:32402/TCP,443:32177/TCP 9d ``` If the `EXTERNAL-IP` column is empty, give the load balancer a few more moments to initialize. If it remains empty, you may have forgotten to specify a load balancer IP, or the IP may be unavailable. It should be the IP adress we reserved earlier. -### Optional: Add a Config Map for the NGINX Proxy +## Create Container Registry Login Secret + +There is one more step before we install Binderhub. We have to create a Kubernetes secret resource for our cluster pods to pull/push container images from our container registry. Note that this is not normally required for Binderhub deployments, but seems to be a side effect of running Binderhub using Docker-in-Docker, which is required on Azure Kubernetes. -It's likely that you will want to specify some non-default configurations for the nginx proxy. To do this, you simply need to add a [configmap](https://kubernetes.io/docs/concepts/configuration/configmap/) resource: +To create the secret, run the following command: ```bash -kubectl apply -f your-nginx-configmap.yaml +kubectl create secret docker-registry --docker-server= --docker-username= --docker-password= -n binderhub ``` -An example has been included in the same directory as this guide. +Note the credentials we provide to the command. These can be found in the configuration for your container registry. Also note that in our example config files for Binderhub `secret-name` is expected to be "bindercred". ## Install Binderhub -Now that we have installed the NGINX controller, we can install the binderhub helm chart. First, let's make sure we have access to it. Add the helm repo if you don't have it already: +Now that we have installed the NGINX controller and created a secret credenential for our container registry, we can install the binderhub helm chart. First, let's make sure we have access to it. Add the helm repo if you don't have it already: ```bash helm repo add jupyterhub https://jupyterhub.github.io/helm-chart @@ -180,12 +177,14 @@ helm repo update Finally, let's install Binderhub. Note that we specify a few special flags in the install command: `--version` and `-f`. ```bash -helm install jupyterhub/binderhub --version=1.0.0-0.dev.git.3128.h52ffd88 --namespace=binderhub -f config-stg.yaml +helm install jupyterhub/binderhub --version=0.2.0-n886.h4169712 --namespace=binderhub -f config-stg.yaml ``` It may take a few moments for this command to complete. -You can find a list of Binderhub releases [here](https://hub.jupyter.org/helm-chart/#development-releases-binderhub). Simply copy the version you want to install, e.g. "1.0.0-0.dev.git.3128.h52ffd88", which is the version we use in the example command. +You can find a list of Binderhub releases [here](https://hub.jupyter.org/helm-chart/#development-releases-binderhub). Simply copy the version you want to install, e.g. "0.2.0-n886.h4169712", which is the version we use in the example command. + +It is **crucial** that you use this version of Binderhub! Any other version is untested and may not work with the current configuration files! `-f` lets us specify configuration values via a yaml file. 
In the example, we use a file named "config-stg.yaml". See the example file of the same name in the same folder as this guide. @@ -214,7 +213,7 @@ Notice the file "cluster-issuer-prod.yaml". See the file of the same name in the Then, upgrade our BinderHub deployment to use this new cluster issuer: ```bash -helm upgrade lifelike-binderhub jupyterhub/binderhub --version=1.0.0-0.dev.git.3128.h52ffd88 --namespace=lifelike-binderhub -f config-prod.yaml +helm upgrade lifelike-binderhub jupyterhub/binderhub --version=0.2.0-n886.h4169712 --namespace=lifelike-binderhub -f config-prod.yaml ``` Congratulations! Your BinderHub deployment is complete! Verify the production certificates are indeed working by returning to your BinderHub in a web browser. Also, try creating a notebook with your favorite Github repository. If you are eventually redirected to the JupyterHub page, you've successfully deployed BinderHub! diff --git a/filebeat/Dockerfile b/filebeat/Dockerfile index 620fa44ab1..57f0b14800 100644 --- a/filebeat/Dockerfile +++ b/filebeat/Dockerfile @@ -1,8 +1,7 @@ FROM docker.elastic.co/beats/filebeat:7.11.1 LABEL app=kg-prototypes -ARG CONFIG_FILE=filebeat.yml -COPY config/${CONFIG_FILE} /usr/share/filebeat/filebeat.yml +COPY config/filebeat.yml /usr/share/filebeat/filebeat.yml USER root RUN chown root:filebeat /usr/share/filebeat/filebeat.yml USER filebeat diff --git a/filebeat/config/filebeat.yml b/filebeat/config/filebeat.yml index 8ed9b8330d..6f79273a13 100644 --- a/filebeat/config/filebeat.yml +++ b/filebeat/config/filebeat.yml @@ -4,7 +4,7 @@ filebeat.autodiscover: templates: - condition: contains: - docker.container.image: kg-webserver + docker.container.image: ${WEBSERVER_IMAGE:kg-webserver} config: - type: docker containers.ids: @@ -23,7 +23,7 @@ filebeat.autodiscover: templates: - condition: contains: - docker.container.image: kg-appserver + docker.container.image: ${APPSERVER_IMAGE:kg-appserver} config: - type: container paths: @@ -37,4 +37,5 @@ processors: host: "unix:///var/run/docker.sock" output.logstash: - hosts: ["logstash:5044"] + hosts: + - ${LOGSTASH_OUTPUT_HOST:logstash:5044} diff --git a/graph-db/.gitignore b/graph-db/.gitignore index 11b80917c1..6d500b20d9 100644 --- a/graph-db/.gitignore +++ b/graph-db/.gitignore @@ -1,2 +1,142 @@ docker-compose.override.yml tmp/ +.DS_Store + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# PyCharm +.idea/ + +.vscode/settings.json +.vscode/launch.json + +# shaded POM file +dependency-reduced-pom.xml diff --git a/graph-db/README.md b/graph-db/README.md index e69de29bb2..46cc693008 100644 --- a/graph-db/README.md +++ b/graph-db/README.md @@ -0,0 +1,5 @@ +## graph-db + +`extraction`: Python code that extracts data from various sources; e.g KEGG, BioCyc, etc + +`migration`: Liquibase migration for the graph-db diff --git a/logstash/.dockerignore b/logstash/.dockerignore new file mode 100644 index 0000000000..1d1fe94df4 --- /dev/null +++ b/logstash/.dockerignore @@ -0,0 +1 @@ +Dockerfile \ No newline at end of file diff --git a/statistical-enrichment/Dockerfile b/statistical-enrichment/Dockerfile index a9e34baaf2..6503159e4f 100644 --- a/statistical-enrichment/Dockerfile +++ b/statistical-enrichment/Dockerfile @@ -1,80 +1,34 @@ -# ======================================== -# Base image -# ======================================== -FROM python:3.10-slim as base +FROM python:3.8-buster +LABEL app=kg-prototypes -ENV LANG C.UTF-8 -ENV LC_ALL C.UTF-8 -ENV PYTHONDONTWRITEBYTECODE 1 -ENV PYTHONFAULTHANDLER 1 - -RUN pip install pipenv - - -# ======================================== -# Build dependencies stage -# ======================================== -FROM base as build-deps - -# Copy Pipfiles -COPY Pipfile Pipfile.lock ./ - -# Install Python dependencies -ARG DEV -RUN PIPENV_VENV_IN_PROJECT=1 pipenv install --deploy $(if [ "$DEV" ]; then echo --dev; fi) +ENV _USER user +ENV _HOME /home/$_USER +ENV UID 1000 +ENV GID 1000 +# Root acces only if in root group +RUN echo "auth required pam_wheel.so" >> /etc/pam.d/su -# ======================================== -# Runtime stage -# ======================================== -FROM base -LABEL org.opencontainers.image.source https://github.com/SBRG/lifelike - -# Install curl for self healthchecks -RUN apt-get update && apt-get install -y curl && apt-get clean - -# Copy Python virtual environment -COPY --from=build-deps /.venv /.venv -ENV PATH="/.venv/bin:$PATH" - -# Set user and working directory -WORKDIR /app -RUN useradd -m -d /app app -USER app - -# 
Copy application code -COPY --chown=app . . +# Install dependencies +RUN pip install pipenv -# Neo4j configuration -ENV NEO4J_HOST=neo4j -ENV NEO4J_PORT=7687 -ENV NEO4J_AUTH=neo4j/password -ENV NEO4J_SCHEME=bolt +# User and group creation +RUN addgroup --system wheel && \ + groupadd -g $GID $_USER && \ + useradd -u $UID -g $GID -G wheel --create-home --shell /bin/bash $_USER -# Redis cache configuration -ENV REDIS_HOST=redis -ENV REDIS_PORT=6379 -ENV REDIS_PASSWORD=password -ENV REDIS_DB=0 +WORKDIR $_HOME -# Default TTL for cache -ENV CACHE_TTL=86400 +# Copy Pipfiles and install dependencies FIRST to better apply Docker layer cache +COPY --chown=$UID:$GID Pipfile . +COPY --chown=$UID:$GID Pipfile.lock . -# Optional Elastic APM configuration. -# To enable, at least ELASTIC_APM_SERVER_URL must be set -# Other available variables: https://www.elastic.co/guide/en/apm/agent/python/master/configuration.html -ENV ELASTIC_APM_SERVER_URL= -ENV ELASTIC_APM_SERVICE_NAME=statistical-enrichment +RUN pipenv install --dev --deploy --system -# Flask env [development, testing, production] -ENV FLASK_ENV=production +ENV FLASK_APP=statistical_enrichment -# Listen port -ENV PORT=5000 -EXPOSE $PORT +COPY --chown=$UID:$GID . . -# Healtcheck -HEALTHCHECK --start-period=15s \ - CMD curl -f localhost:$PORT/healthz || exit 1 +USER $_USER -CMD bin/startup.sh +CMD [ "bin/startup.sh" ]
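The same Pipfile-first pattern is used in the appserver and cache-invalidator images above, so that editing application code does not invalidate the dependency layer. A quick local smoke test of one of these images might look like this (the image tag, build context and published port are assumptions, not part of the repository's tooling):

```bash
# Build from the service directory; only changes to Pipfile/Pipfile.lock re-run pipenv install
docker build -t statistical-enrichment:dev ./statistical-enrichment

# Run the container; bin/startup.sh is the image's default CMD (port 5000 assumed)
docker run --rm -p 5000:5000 statistical-enrichment:dev
```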